author     Deepak Honnalli <Deepak.Honnalli@Sun.COM>   2008-09-25 11:52:01 +0530
committer  Deepak Honnalli <Deepak.Honnalli@Sun.COM>   2008-09-25 11:52:01 +0530
commit     1384c58687fc765ca3d9a998204826d3dd0ce419 (patch)
tree       7ffd53cb62bc335ecb7309087790a804fbc0a123
parent     aecfc01d1bad84e66649703f7fc2926ef70b34ba (diff)
download   illumos-gate-1384c58687fc765ca3d9a998204826d3dd0ce419.tar.gz
6503547 deadlock between utilities accessing /proc and processes using nfs
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs3_vnops.c  | 38
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs4_vnops.c  | 34
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs_vnops.c   | 37
-rw-r--r--  usr/src/uts/common/nfs/rnode.h          | 15
-rw-r--r--  usr/src/uts/common/nfs/rnode4.h         | 15
5 files changed, 101 insertions, 38 deletions
diff --git a/usr/src/uts/common/fs/nfs/nfs3_vnops.c b/usr/src/uts/common/fs/nfs/nfs3_vnops.c
index e438bc301c..84620044a7 100644
--- a/usr/src/uts/common/fs/nfs/nfs3_vnops.c
+++ b/usr/src/uts/common/fs/nfs/nfs3_vnops.c
@@ -573,7 +573,8 @@ nfs3_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
 	 */
 	if ((vp->v_flag & VNOCACHE) ||
 	    (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
-	    rp->r_mapcnt == 0 && !vn_has_cached_data(vp))) {
+	    rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
+	    !vn_has_cached_data(vp))) {
 		return (nfs3_directio_read(vp, uiop, cr));
 	}
 
@@ -733,7 +734,8 @@ nfs3_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
 	 */
 	if ((vp->v_flag & VNOCACHE) ||
 	    (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
-	    rp->r_mapcnt == 0 && !vn_has_cached_data(vp))) {
+	    rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
+	    !vn_has_cached_data(vp))) {
 		size_t bufsize;
 		int count;
 		u_offset_t org_offset;
@@ -5207,8 +5209,29 @@ nfs3_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
 	 * rp->r_lkserlock to avoid a race with concurrent lock requests.
 	 */
 	rp = VTOR(vp);
-	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp)))
+
+	/*
+	 * Atomically increment r_inmap after acquiring r_rwlock. The
+	 * idea here is to acquire r_rwlock to block read/write and
+	 * not to protect r_inmap. r_inmap will inform nfs3_read/write()
+	 * that we are in nfs3_map(). Now, r_rwlock is acquired in order
+	 * and we can prevent the deadlock that would have occurred
+	 * when nfs3_addmap() would have acquired it out of order.
+	 *
+	 * Since we are not protecting r_inmap by any lock, we do not
+	 * hold any lock when we decrement it. We atomically decrement
+	 * r_inmap after we release r_lkserlock.
+	 */
+
+	if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
+		return (EINTR);
+	atomic_add_int(&rp->r_inmap, 1);
+	nfs_rw_exit(&rp->r_rwlock);
+
+	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp))) {
+		atomic_add_int(&rp->r_inmap, -1);
 		return (EINTR);
+	}
 
 	if (vp->v_flag & VNOCACHE) {
 		error = EAGAIN;
@@ -5248,6 +5271,7 @@ nfs3_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
 
 done:
 	nfs_rw_exit(&rp->r_lkserlock);
+	atomic_add_int(&rp->r_inmap, -1);
 	return (error);
 }
 
@@ -5264,16 +5288,8 @@ nfs3_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
 	if (nfs_zone() != VTOMI(vp)->mi_zone)
 		return (EIO);
 
-	/*
-	 * Need to hold rwlock while incrementing the mapcnt so that
-	 * mmap'ing can be serialized with writes so that the caching
-	 * can be handled correctly.
-	 */
 	rp = VTOR(vp);
-	if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
-		return (EINTR);
 	atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));
-	nfs_rw_exit(&rp->r_rwlock);
 
 	return (0);
 }
diff --git a/usr/src/uts/common/fs/nfs/nfs4_vnops.c b/usr/src/uts/common/fs/nfs/nfs4_vnops.c
index ed49f63422..4caafc6049 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_vnops.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_vnops.c
@@ -2627,7 +2627,7 @@ nfs4_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
 	 */
 	if ((vp->v_flag & VNOCACHE) ||
 	    (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) &&
-	    rp->r_mapcnt == 0 && !nfs4_has_pages(vp))) {
+	    rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) {
 		size_t resid = 0;
 
 		return (nfs4read(vp, NULL, uiop->uio_loffset,
@@ -2813,7 +2813,7 @@ nfs4_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
 	 */
 	if ((vp->v_flag & VNOCACHE) ||
 	    (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) &&
-	    rp->r_mapcnt == 0 && !nfs4_has_pages(vp))) {
+	    rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) {
 		size_t bufsize;
 		int count;
 		u_offset_t org_offset;
@@ -10376,9 +10376,29 @@ nfs4_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
 	 * This means portions of the file are locked (through VOP_FRLOCK).
 	 * In this case the map request must be refused. We use
 	 * rp->r_lkserlock to avoid a race with concurrent lock requests.
+	 *
+	 * Atomically increment r_inmap after acquiring r_rwlock. The
+	 * idea here is to acquire r_rwlock to block read/write and
+	 * not to protect r_inmap. r_inmap will inform nfs4_read/write()
+	 * that we are in nfs4_map(). Now, r_rwlock is acquired in order
+	 * and we can prevent the deadlock that would have occurred
+	 * when nfs4_addmap() would have acquired it out of order.
+	 *
+	 * Since we are not protecting r_inmap by any lock, we do not
+	 * hold any lock when we decrement it. We atomically decrement
+	 * r_inmap after we release r_lkserlock.
 	 */
-	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp)))
+
+	if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
+		return (EINTR);
+	atomic_add_int(&rp->r_inmap, 1);
+	nfs_rw_exit(&rp->r_rwlock);
+
+	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp))) {
+		atomic_add_int(&rp->r_inmap, -1);
 		return (EINTR);
+	}
+
 	if (vp->v_flag & VNOCACHE) {
 		error = EAGAIN;
 		goto done;
@@ -10483,6 +10503,7 @@ nfs4_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
 
 done:
 	nfs_rw_exit(&rp->r_lkserlock);
+	atomic_add_int(&rp->r_inmap, -1);
 	return (error);
 }
 
@@ -10622,20 +10643,13 @@ nfs4_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
 		return (ENOSYS);
 
 	/*
-	 * Need to hold rwlock while incrementing the mapcnt so that
-	 * mmap'ing can be serialized with writes so that the caching
-	 * can be handled correctly.
-	 *
 	 * Don't need to update the open stream first, since this
 	 * mmap can't add any additional share access that isn't
 	 * already contained in the open stream (for the case where we
 	 * open/mmap/only update rp->r_mapcnt/server reboots/reopen doesn't
 	 * take into account os_mmap_read[write] counts).
 	 */
-	if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
-		return (EINTR);
 	atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));
-	nfs_rw_exit(&rp->r_rwlock);
 
 	if (vp->v_type == VREG) {
 		/*
diff --git a/usr/src/uts/common/fs/nfs/nfs_vnops.c b/usr/src/uts/common/fs/nfs/nfs_vnops.c
index 8da7d8f06f..73a619c238 100644
--- a/usr/src/uts/common/fs/nfs/nfs_vnops.c
+++ b/usr/src/uts/common/fs/nfs/nfs_vnops.c
@@ -448,7 +448,8 @@ nfs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
 	 */
 	if ((vp->v_flag & VNOCACHE) ||
 	    (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
-	    rp->r_mapcnt == 0 && !vn_has_cached_data(vp))) {
+	    rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
+	    !vn_has_cached_data(vp))) {
 		size_t bufsize;
 		size_t resid = 0;
 
@@ -634,7 +635,8 @@ nfs_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
 	 */
 	if ((vp->v_flag & VNOCACHE) ||
 	    (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
-	    rp->r_mapcnt == 0 && !vn_has_cached_data(vp))) {
+	    rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
+	    !vn_has_cached_data(vp))) {
 		size_t bufsize;
 		int count;
 		uint_t org_offset;
@@ -4295,9 +4297,29 @@ nfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
 	 * rp->r_lkserlock to avoid a race with concurrent lock requests.
 	 */
 	rp = VTOR(vp);
-	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp)))
+
+	/*
+	 * Atomically increment r_inmap after acquiring r_rwlock. The
+	 * idea here is to acquire r_rwlock to block read/write and
+	 * not to protect r_inmap. r_inmap will inform nfs_read/write()
+	 * that we are in nfs_map(). Now, r_rwlock is acquired in order
+	 * and we can prevent the deadlock that would have occurred
+	 * when nfs_addmap() would have acquired it out of order.
+	 *
+	 * Since we are not protecting r_inmap by any lock, we do not
+	 * hold any lock when we decrement it. We atomically decrement
+	 * r_inmap after we release r_lkserlock.
+	 */
+
+	if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
 		return (EINTR);
+	atomic_add_int(&rp->r_inmap, 1);
+	nfs_rw_exit(&rp->r_rwlock);
+	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp))) {
+		atomic_add_int(&rp->r_inmap, -1);
+		return (EINTR);
+	}
 
 	if (vp->v_flag & VNOCACHE) {
 		error = EAGAIN;
 		goto done;
@@ -4336,6 +4358,7 @@ nfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
 
 done:
 	nfs_rw_exit(&rp->r_lkserlock);
+	atomic_add_int(&rp->r_inmap, -1);
 	return (error);
 }
 
@@ -4352,16 +4375,8 @@ nfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
 	if (nfs_zone() != VTOMI(vp)->mi_zone)
 		return (EIO);
 
-	/*
-	 * Need to hold rwlock while incrementing the mapcnt so that
-	 * mmap'ing can be serialized with writes so that the caching
-	 * can be handled correctly.
-	 */
 	rp = VTOR(vp);
-	if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
-		return (EINTR);
 	atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));
-	nfs_rw_exit(&rp->r_rwlock);
 
 	return (0);
 }
diff --git a/usr/src/uts/common/nfs/rnode.h b/usr/src/uts/common/nfs/rnode.h
index f2a956040c..3df90d8ffb 100644
--- a/usr/src/uts/common/nfs/rnode.h
+++ b/usr/src/uts/common/nfs/rnode.h
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -29,8 +29,6 @@
 #ifndef	_NFS_RNODE_H
 #define	_NFS_RNODE_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/avl.h>
 #include <sys/list.h>
 #include <nfs/nfs.h>
@@ -221,6 +219,16 @@ typedef struct rhashq {
  * set r_modaddr and release r_statelock as long as the r_rwlock
  * writer lock is held.
  *
+ * r_inmap informs nfsX_read()/write() that there is a call to nfsX_map()
+ * in progress. nfsX_read()/write() check r_inmap to decide whether
+ * to perform directio on the file or not. r_inmap is atomically
+ * incremented in nfsX_map() before the address space routines are
+ * called and atomically decremented just before nfsX_map() exits.
+ * r_inmap is not protected by any lock.
+ *
+ * r_mapcnt tells that the rnode has mapped pages. r_inmap can be 0
+ * while the rnode has mapped pages.
+ *
  * 64-bit offsets: the code formerly assumed that atomic reads of
  * r_size were safe and reliable; on 32-bit architectures, this is
  * not true since an intervening bus cycle from another processor
@@ -283,6 +291,7 @@ typedef struct rnode {
 	acache_t	*r_acache;	/* list of access cache entries */
 	kthread_t	*r_serial;	/* id of purging thread */
 	list_t		r_indelmap;	/* list of delmap callers */
+	uint_t		r_inmap;	/* to serialize read/write and mmap */
 } rnode_t;
 
 #endif	/* _KERNEL */
diff --git a/usr/src/uts/common/nfs/rnode4.h b/usr/src/uts/common/nfs/rnode4.h
index 7827e24679..7f4c705698 100644
--- a/usr/src/uts/common/nfs/rnode4.h
+++ b/usr/src/uts/common/nfs/rnode4.h
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
 */
 
@@ -29,8 +29,6 @@
 #ifndef	_NFS_RNODE4_H
 #define	_NFS_RNODE4_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #ifdef	__cplusplus
 extern "C" {
 #endif
@@ -200,6 +198,16 @@ typedef struct r4hashq {
  * set r_modaddr and release r_statelock as long as the r_rwlock
  * writer lock is held.
  *
+ * r_inmap informs nfs4_read()/write() that there is a call to nfs4_map()
+ * in progress. nfs4_read()/write() check r_inmap to decide whether
+ * to perform directio on the file or not. r_inmap is atomically
+ * incremented in nfs4_map() before the address space routines are
+ * called and atomically decremented just before nfs4_map() exits.
+ * r_inmap is not protected by any lock.
+ *
+ * r_mapcnt tells that the rnode has mapped pages. r_inmap can be 0
+ * while the rnode has mapped pages.
+ *
  * 64-bit offsets: the code formerly assumed that atomic reads of
  * r_size were safe and reliable; on 32-bit architectures, this is
  * not true since an intervening bus cycle from another processor
@@ -326,6 +334,7 @@ typedef struct rnode4 {
 
 	/* stub type was set */
 	nfs4_stub_type_t	r_stub_type;	/* e.g. mirror-mount */
+	uint_t		r_inmap;	/* to serialize read/write and mmap */
 } rnode4_t;
 
 #define	r_vnode	r_svnode.sv_r_vnode
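All three clients apply the same ordering discipline: nfsX_map() takes r_rwlock as writer only long enough to bump r_inmap and then drops it before taking r_lkserlock, the addmap callback no longer touches r_rwlock at all, and the read/write paths treat a nonzero r_inmap like a nonzero r_mapcnt and skip directio. The sketch below is a minimal userland analogue of that protocol, not the illumos code: pthread rwlocks and C11 atomics stand in for nfs_rw_enter_sig()/nfs_rw_exit() and atomic_add_int(), and every name in it (rnode_sketch, use_directio, map_sketch) is hypothetical.

/*
 * Hypothetical userland analogue of the r_inmap protocol from the patch
 * above.  pthread primitives and C11 atomics stand in for the kernel
 * nfs_rw_enter_sig(), nfs_rw_exit() and atomic_add_int() calls.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct rnode_sketch {
	pthread_rwlock_t rwlock;	/* stands in for r_rwlock */
	pthread_rwlock_t lkserlock;	/* stands in for r_lkserlock */
	atomic_uint inmap;		/* stands in for r_inmap */
	atomic_ulong mapcnt;		/* stands in for r_mapcnt */
	bool nocache;			/* stands in for VNOCACHE/directio flags */
};

/* read/write path: go direct only if nothing is mapped and no map is in progress */
static bool
use_directio(struct rnode_sketch *rp)
{
	return (rp->nocache &&
	    atomic_load(&rp->mapcnt) == 0 &&
	    atomic_load(&rp->inmap) == 0);
}

/* map path: mirrors nfs3_map()/nfs4_map()/nfs_map() after the fix */
static int
map_sketch(struct rnode_sketch *rp, size_t len, size_t pagesize)
{
	/*
	 * Take rwlock as writer only to wait out in-flight read/write,
	 * not to protect inmap; bump inmap and drop the lock right away.
	 */
	pthread_rwlock_wrlock(&rp->rwlock);
	atomic_fetch_add(&rp->inmap, 1);
	pthread_rwlock_unlock(&rp->rwlock);

	/* lkserlock is still taken, as before, to fence off file locking */
	pthread_rwlock_rdlock(&rp->lkserlock);

	/*
	 * The address space routines and the addmap callback would run
	 * here; addmap only bumps mapcnt and no longer takes rwlock, so
	 * the out-of-order acquisition that caused the deadlock is gone.
	 */
	atomic_fetch_add(&rp->mapcnt, (len + pagesize - 1) / pagesize);

	pthread_rwlock_unlock(&rp->lkserlock);
	/* decremented with no lock held, exactly like the patch */
	atomic_fetch_sub(&rp->inmap, 1);
	return (0);
}

int
main(void)
{
	struct rnode_sketch rn = {
		.rwlock = PTHREAD_RWLOCK_INITIALIZER,
		.lkserlock = PTHREAD_RWLOCK_INITIALIZER,
		.nocache = true,
	};

	printf("directio before map: %d\n", use_directio(&rn));
	map_sketch(&rn, 8192, 4096);
	printf("directio after map:  %d\n", use_directio(&rn));
	return (0);
}

Run standalone, the sketch reports directio eligibility 1 before the map and 0 afterwards. The design point it illustrates is the one the commit comments make: the increment happens while rwlock is held as writer, so it cannot race with a read or write already inside its rwlock-protected region, while the decrement needs no lock at all.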