author     Deepak Honnalli <Deepak.Honnalli@Sun.COM>   2008-09-25 11:52:01 +0530
committer  Deepak Honnalli <Deepak.Honnalli@Sun.COM>   2008-09-25 11:52:01 +0530
commit     1384c58687fc765ca3d9a998204826d3dd0ce419 (patch)
tree       7ffd53cb62bc335ecb7309087790a804fbc0a123
parent     aecfc01d1bad84e66649703f7fc2926ef70b34ba (diff)
download   illumos-gate-1384c58687fc765ca3d9a998204826d3dd0ce419.tar.gz
6503547 deadlock between utilities accessing /proc and processes using nfs
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs3_vnops.c  | 38
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs4_vnops.c  | 34
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs_vnops.c   | 37
-rw-r--r--  usr/src/uts/common/nfs/rnode.h          | 15
-rw-r--r--  usr/src/uts/common/nfs/rnode4.h         | 15
5 files changed, 101 insertions, 38 deletions
diff --git a/usr/src/uts/common/fs/nfs/nfs3_vnops.c b/usr/src/uts/common/fs/nfs/nfs3_vnops.c
index e438bc301c..84620044a7 100644
--- a/usr/src/uts/common/fs/nfs/nfs3_vnops.c
+++ b/usr/src/uts/common/fs/nfs/nfs3_vnops.c
@@ -573,7 +573,8 @@ nfs3_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
 	 */
 	if ((vp->v_flag & VNOCACHE) ||
 	    (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
-	    rp->r_mapcnt == 0 && !vn_has_cached_data(vp))) {
+	    rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
+	    !vn_has_cached_data(vp))) {
 		return (nfs3_directio_read(vp, uiop, cr));
 	}
 
@@ -733,7 +734,8 @@ nfs3_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
 	 */
 	if ((vp->v_flag & VNOCACHE) ||
 	    (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
-	    rp->r_mapcnt == 0 && !vn_has_cached_data(vp))) {
+	    rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
+	    !vn_has_cached_data(vp))) {
 		size_t bufsize;
 		int count;
 		u_offset_t org_offset;
@@ -5207,8 +5209,29 @@ nfs3_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
 	 * rp->r_lkserlock to avoid a race with concurrent lock requests.
 	 */
 	rp = VTOR(vp);
-	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp)))
+
+	/*
+	 * Atomically increment r_inmap after acquiring r_rwlock. The
+	 * idea here is to acquire r_rwlock to block read/write and
+	 * not to protect r_inmap. r_inmap will inform nfs3_read/write()
+	 * that we are in nfs3_map(). Now, r_rwlock is acquired in order
+	 * and we can prevent the deadlock that would have occurred
+	 * when nfs3_addmap() would have acquired it out of order.
+	 *
+	 * Since we are not protecting r_inmap by any lock, we do not
+	 * hold any lock when we decrement it. We atomically decrement
+	 * r_inmap after we release r_lkserlock.
+	 */
+
+	if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
+		return (EINTR);
+	atomic_add_int(&rp->r_inmap, 1);
+	nfs_rw_exit(&rp->r_rwlock);
+
+	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp))) {
+		atomic_add_int(&rp->r_inmap, -1);
 		return (EINTR);
+	}
 
 	if (vp->v_flag & VNOCACHE) {
 		error = EAGAIN;
@@ -5248,6 +5271,7 @@ nfs3_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
 
 done:
 	nfs_rw_exit(&rp->r_lkserlock);
+	atomic_add_int(&rp->r_inmap, -1);
 	return (error);
 }
 
@@ -5264,16 +5288,8 @@ nfs3_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
 	if (nfs_zone() != VTOMI(vp)->mi_zone)
 		return (EIO);
 
-	/*
-	 * Need to hold rwlock while incrementing the mapcnt so that
-	 * mmap'ing can be serialized with writes so that the caching
-	 * can be handled correctly.
-	 */
 	rp = VTOR(vp);
-	if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
-		return (EINTR);
 	atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));
-	nfs_rw_exit(&rp->r_rwlock);
 
 	return (0);
 }
diff --git a/usr/src/uts/common/fs/nfs/nfs4_vnops.c b/usr/src/uts/common/fs/nfs/nfs4_vnops.c
index ed49f63422..4caafc6049 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_vnops.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_vnops.c
@@ -2627,7 +2627,7 @@ nfs4_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
 	 */
 	if ((vp->v_flag & VNOCACHE) ||
 	    (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) &&
-	    rp->r_mapcnt == 0 && !nfs4_has_pages(vp))) {
+	    rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) {
 		size_t resid = 0;
 
 		return (nfs4read(vp, NULL, uiop->uio_loffset,
@@ -2813,7 +2813,7 @@ nfs4_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
 	 */
 	if ((vp->v_flag & VNOCACHE) ||
 	    (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) &&
-	    rp->r_mapcnt == 0 && !nfs4_has_pages(vp))) {
+	    rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) {
 		size_t bufsize;
 		int count;
 		u_offset_t org_offset;
@@ -10376,9 +10376,29 @@ nfs4_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
 	 * This means portions of the file are locked (through VOP_FRLOCK).
 	 * In this case the map request must be refused. We use
 	 * rp->r_lkserlock to avoid a race with concurrent lock requests.
+	 *
+	 * Atomically increment r_inmap after acquiring r_rwlock. The
+	 * idea here is to acquire r_rwlock to block read/write and
+	 * not to protect r_inmap. r_inmap will inform nfs4_read/write()
+	 * that we are in nfs4_map(). Now, r_rwlock is acquired in order
+	 * and we can prevent the deadlock that would have occurred
+	 * when nfs4_addmap() would have acquired it out of order.
+	 *
+	 * Since we are not protecting r_inmap by any lock, we do not
+	 * hold any lock when we decrement it. We atomically decrement
+	 * r_inmap after we release r_lkserlock.
 	 */
-	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp)))
+
+	if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
+		return (EINTR);
+	atomic_add_int(&rp->r_inmap, 1);
+	nfs_rw_exit(&rp->r_rwlock);
+
+	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp))) {
+		atomic_add_int(&rp->r_inmap, -1);
 		return (EINTR);
+	}
+
 	if (vp->v_flag & VNOCACHE) {
 		error = EAGAIN;
 		goto done;
@@ -10483,6 +10503,7 @@ nfs4_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
 
 done:
 	nfs_rw_exit(&rp->r_lkserlock);
+	atomic_add_int(&rp->r_inmap, -1);
 	return (error);
 }
 
@@ -10622,20 +10643,13 @@ nfs4_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
 		return (ENOSYS);
 
 	/*
-	 * Need to hold rwlock while incrementing the mapcnt so that
-	 * mmap'ing can be serialized with writes so that the caching
-	 * can be handled correctly.
-	 *
 	 * Don't need to update the open stream first, since this
 	 * mmap can't add any additional share access that isn't
 	 * already contained in the open stream (for the case where we
 	 * open/mmap/only update rp->r_mapcnt/server reboots/reopen doesn't
 	 * take into account os_mmap_read[write] counts).
 	 */
-	if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
-		return (EINTR);
 	atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));
-	nfs_rw_exit(&rp->r_rwlock);
 
 	if (vp->v_type == VREG) {
 		/*
diff --git a/usr/src/uts/common/fs/nfs/nfs_vnops.c b/usr/src/uts/common/fs/nfs/nfs_vnops.c
index 8da7d8f06f..73a619c238 100644
--- a/usr/src/uts/common/fs/nfs/nfs_vnops.c
+++ b/usr/src/uts/common/fs/nfs/nfs_vnops.c
@@ -448,7 +448,8 @@ nfs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
 	 */
 	if ((vp->v_flag & VNOCACHE) ||
 	    (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
-	    rp->r_mapcnt == 0 && !vn_has_cached_data(vp))) {
+	    rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
+	    !vn_has_cached_data(vp))) {
 		size_t bufsize;
 		size_t resid = 0;
 
@@ -634,7 +635,8 @@ nfs_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
 	 */
 	if ((vp->v_flag & VNOCACHE) ||
 	    (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
-	    rp->r_mapcnt == 0 && !vn_has_cached_data(vp))) {
+	    rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
+	    !vn_has_cached_data(vp))) {
 		size_t bufsize;
 		int count;
 		uint_t org_offset;
@@ -4295,9 +4297,29 @@ nfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
 	 * rp->r_lkserlock to avoid a race with concurrent lock requests.
 	 */
 	rp = VTOR(vp);
-	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp)))
+
+	/*
+	 * Atomically increment r_inmap after acquiring r_rwlock. The
+	 * idea here is to acquire r_rwlock to block read/write and
+	 * not to protect r_inmap. r_inmap will inform nfs_read/write()
+	 * that we are in nfs_map(). Now, r_rwlock is acquired in order
+	 * and we can prevent the deadlock that would have occurred
+	 * when nfs_addmap() would have acquired it out of order.
+	 *
+	 * Since we are not protecting r_inmap by any lock, we do not
+	 * hold any lock when we decrement it. We atomically decrement
+	 * r_inmap after we release r_lkserlock.
+	 */
+
+	if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
 		return (EINTR);
+	atomic_add_int(&rp->r_inmap, 1);
+	nfs_rw_exit(&rp->r_rwlock);
+	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp))) {
+		atomic_add_int(&rp->r_inmap, -1);
+		return (EINTR);
+	}
 
 	if (vp->v_flag & VNOCACHE) {
 		error = EAGAIN;
 		goto done;
@@ -4336,6 +4358,7 @@ nfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
 
 done:
 	nfs_rw_exit(&rp->r_lkserlock);
+	atomic_add_int(&rp->r_inmap, -1);
 	return (error);
 }
 
@@ -4352,16 +4375,8 @@ nfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
 	if (nfs_zone() != VTOMI(vp)->mi_zone)
 		return (EIO);
 
-	/*
-	 * Need to hold rwlock while incrementing the mapcnt so that
-	 * mmap'ing can be serialized with writes so that the caching
-	 * can be handled correctly.
-	 */
 	rp = VTOR(vp);
-	if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
-		return (EINTR);
 	atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));
-	nfs_rw_exit(&rp->r_rwlock);
 
 	return (0);
 }
diff --git a/usr/src/uts/common/nfs/rnode.h b/usr/src/uts/common/nfs/rnode.h
index f2a956040c..3df90d8ffb 100644
--- a/usr/src/uts/common/nfs/rnode.h
+++ b/usr/src/uts/common/nfs/rnode.h
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -29,8 +29,6 @@
 #ifndef	_NFS_RNODE_H
 #define	_NFS_RNODE_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/avl.h>
 #include <sys/list.h>
 #include <nfs/nfs.h>
@@ -221,6 +219,16 @@ typedef struct rhashq {
  * set r_modaddr and release r_statelock as long as the r_rwlock
  * writer lock is held.
  *
+ * r_inmap informs nfsX_read()/write() that there is a call to nfsX_map()
+ * in progress. nfsX_read()/write() check r_inmap to decide whether
+ * to perform directio on the file or not. r_inmap is atomically
+ * incremented in nfsX_map() before the address space routines are
+ * called and atomically decremented just before nfsX_map() exits.
+ * r_inmap is not protected by any lock.
+ *
+ * r_mapcnt tells that the rnode has mapped pages. r_inmap can be 0
+ * while the rnode has mapped pages.
+ *
  * 64-bit offsets: the code formerly assumed that atomic reads of
  * r_size were safe and reliable; on 32-bit architectures, this is
  * not true since an intervening bus cycle from another processor
@@ -283,6 +291,7 @@ typedef struct rnode {
 	acache_t	*r_acache;	/* list of access cache entries */
 	kthread_t	*r_serial;	/* id of purging thread */
 	list_t		r_indelmap;	/* list of delmap callers */
+	uint_t		r_inmap;	/* to serialize read/write and mmap */
 } rnode_t;
 
 #endif	/* _KERNEL */
diff --git a/usr/src/uts/common/nfs/rnode4.h b/usr/src/uts/common/nfs/rnode4.h
index 7827e24679..7f4c705698 100644
--- a/usr/src/uts/common/nfs/rnode4.h
+++ b/usr/src/uts/common/nfs/rnode4.h
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
 */
 
@@ -29,8 +29,6 @@
 #ifndef	_NFS_RNODE4_H
 #define	_NFS_RNODE4_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #ifdef	__cplusplus
 extern "C" {
 #endif
@@ -200,6 +198,16 @@ typedef struct r4hashq {
  * set r_modaddr and release r_statelock as long as the r_rwlock
  * writer lock is held.
  *
+ * r_inmap informs nfs4_read()/write() that there is a call to nfs4_map()
+ * in progress. nfs4_read()/write() check r_inmap to decide whether
+ * to perform directio on the file or not. r_inmap is atomically
+ * incremented in nfs4_map() before the address space routines are
+ * called and atomically decremented just before nfs4_map() exits.
+ * r_inmap is not protected by any lock.
+ *
+ * r_mapcnt tells that the rnode has mapped pages. r_inmap can be 0
+ * while the rnode has mapped pages.
+ *
  * 64-bit offsets: the code formerly assumed that atomic reads of
  * r_size were safe and reliable; on 32-bit architectures, this is
  * not true since an intervening bus cycle from another processor
@@ -326,6 +334,7 @@ typedef struct rnode4 {
 
 	/* stub type was set */
 	nfs4_stub_type_t	r_stub_type;	/* e.g. mirror-mount */
+	uint_t		r_inmap;	/* to serialize read/write and mmap */
 } rnode4_t;
 
 #define	r_vnode	r_svnode.sv_r_vnode
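All three clients apply the same ordering discipline: nfsX_map() takes r_rwlock as writer only long enough to bump r_inmap and then drops it before taking r_lkserlock, the addmap callback no longer touches r_rwlock at all, and the read/write paths treat a nonzero r_inmap like a nonzero r_mapcnt and skip directio. The sketch below is a minimal userland analogue of that protocol, not the illumos code: pthread rwlocks and C11 atomics stand in for nfs_rw_enter_sig()/nfs_rw_exit() and atomic_add_int(), and every name in it (rnode_sketch, use_directio, map_sketch) is hypothetical.

/*
 * Hypothetical userland analogue of the r_inmap protocol from the patch
 * above.  pthread primitives and C11 atomics stand in for the kernel
 * nfs_rw_enter_sig(), nfs_rw_exit() and atomic_add_int() calls.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct rnode_sketch {
	pthread_rwlock_t rwlock;	/* stands in for r_rwlock */
	pthread_rwlock_t lkserlock;	/* stands in for r_lkserlock */
	atomic_uint inmap;		/* stands in for r_inmap */
	atomic_ulong mapcnt;		/* stands in for r_mapcnt */
	bool nocache;			/* stands in for VNOCACHE/directio flags */
};

/* read/write path: go direct only if nothing is mapped and no map is in progress */
static bool
use_directio(struct rnode_sketch *rp)
{
	return (rp->nocache &&
	    atomic_load(&rp->mapcnt) == 0 &&
	    atomic_load(&rp->inmap) == 0);
}

/* map path: mirrors nfs3_map()/nfs4_map()/nfs_map() after the fix */
static int
map_sketch(struct rnode_sketch *rp, size_t len, size_t pagesize)
{
	/*
	 * Take rwlock as writer only to wait out in-flight read/write,
	 * not to protect inmap; bump inmap and drop the lock right away.
	 */
	pthread_rwlock_wrlock(&rp->rwlock);
	atomic_fetch_add(&rp->inmap, 1);
	pthread_rwlock_unlock(&rp->rwlock);

	/* lkserlock is still taken, as before, to fence off file locking */
	pthread_rwlock_rdlock(&rp->lkserlock);

	/*
	 * The address space routines and the addmap callback would run
	 * here; addmap only bumps mapcnt and no longer takes rwlock, so
	 * the out-of-order acquisition that caused the deadlock is gone.
	 */
	atomic_fetch_add(&rp->mapcnt, (len + pagesize - 1) / pagesize);

	pthread_rwlock_unlock(&rp->lkserlock);
	/* decremented with no lock held, exactly like the patch */
	atomic_fetch_sub(&rp->inmap, 1);
	return (0);
}

int
main(void)
{
	struct rnode_sketch rn = {
		.rwlock = PTHREAD_RWLOCK_INITIALIZER,
		.lkserlock = PTHREAD_RWLOCK_INITIALIZER,
		.nocache = true,
	};

	printf("directio before map: %d\n", use_directio(&rn));
	map_sketch(&rn, 8192, 4096);
	printf("directio after map:  %d\n", use_directio(&rn));
	return (0);
}

Run standalone, the sketch reports directio eligibility 1 before the map and 0 afterwards. The design point it illustrates is the one the commit comments make: the increment happens while rwlock is held as writer, so it cannot race with a read or write already inside its rwlock-protected region, while the decrement needs no lock at all.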