diff options
Diffstat (limited to 'usr/src/uts/common/os/flock.c')
-rw-r--r-- | usr/src/uts/common/os/flock.c | 326 |
1 files changed, 322 insertions, 4 deletions
diff --git a/usr/src/uts/common/os/flock.c b/usr/src/uts/common/os/flock.c index 5dad4abb61..a54d6028d5 100644 --- a/usr/src/uts/common/os/flock.c +++ b/usr/src/uts/common/os/flock.c @@ -29,6 +29,7 @@ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #include <sys/flock_impl.h> @@ -243,9 +244,293 @@ flk_get_lockmgr_status(void) } /* - * Routine called from fs_frlock in fs/fs_subr.c + * This implements Open File Description (not descriptor) style record locking. + * These locks can also be thought of as pid-less since they are not tied to a + * specific process, thus they're preserved across fork. + * + * Called directly from fcntl. + * + * See reclock() for the implementation of the traditional POSIX style record + * locking scheme (pid-ful). This function is derived from reclock() but + * simplified and modified to work for OFD style locking. + * + * The two primary advantages of OFD style of locking are: + * 1) It is per-file description, so closing a file descriptor that refers to a + * different file description for the same file will not drop the lock (i.e. + * two open's of the same file get different descriptions but a dup or fork + * will refer to the same description). + * 2) Locks are preserved across fork(2). + * + * Because these locks are per-description a lock ptr lives at the f_filocks + * member of the file_t and the lock_descriptor includes a file_t pointer + * to enable unique lock identification and management. + * + * Since these locks are pid-less we cannot do deadlock detection with the + * current process-oriented implementation. This is consistent with OFD locking + * behavior on other operating systems such as Linux. Since we don't do + * deadlock detection we never interact with the process graph that is + * maintained for deadlock detection on the traditional POSIX-style locks. + * + * Future Work: + * + * The current implementation does not support record locks. 
That is, + * currently the single lock must cover the entire file. This is validated in + * fcntl. To support record locks the f_filock pointer in the file_t needs to + * be changed to a list of pointers to the locks. That list needs to be + * managed independently of the lock list on the vnode itself and it needs to + * be maintained as record locks are created, split, coalesced and deleted. + * + * The current implementation does not support remote file systems (e.g. + * NFS or CIFS). This is handled in fs_frlock(). The design of how OFD locks + * interact with the NLM is not clear since the NLM protocol/implementation + * appears to be oriented around locks associated with a process. A further + * problem is that a design is needed for what nlm_send_siglost() should do and + * where it will send SIGLOST. More recent versions of Linux apparently try to + * emulate OFD locks on NFS by converting them to traditional POSIX style locks + * that work with the NLM. It is not clear that this provides the correct + * semantics in all cases. */ +int +ofdlock(file_t *fp, int fcmd, flock64_t *lckdat, int flag, u_offset_t offset) +{ + int cmd = 0; + vnode_t *vp; + lock_descriptor_t stack_lock_request; + lock_descriptor_t *lock_request; + int error = 0; + graph_t *gp; + int serialize = 0; + + if (fcmd != F_OFD_GETLK) + cmd = SETFLCK; + + if (fcmd == F_OFD_SETLKW || fcmd == F_FLOCKW) + cmd |= SLPFLCK; + + /* see block comment */ + VERIFY(lckdat->l_whence == 0); + VERIFY(lckdat->l_start == 0); + VERIFY(lckdat->l_len == 0); + + vp = fp->f_vnode; + + /* + * For reclock fs_frlock() would normally have set these in a few + * places but for us it's cleaner to centralize it here. Note that + * IGN_PID is -1. We use 0 for our pid-less locks. 
+ */ + lckdat->l_pid = 0; + lckdat->l_sysid = 0; + + /* + * Check access permissions + */ + if ((fcmd == F_OFD_SETLK || fcmd == F_OFD_SETLKW) && + ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) || + (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0))) + return (EBADF); + + /* + * for query and unlock we use the stack_lock_request + */ + if (lckdat->l_type == F_UNLCK || !(cmd & SETFLCK)) { + lock_request = &stack_lock_request; + (void) bzero((caddr_t)lock_request, + sizeof (lock_descriptor_t)); + + /* + * following is added to make the assertions in + * flk_execute_request() pass + */ + lock_request->l_edge.edge_in_next = &lock_request->l_edge; + lock_request->l_edge.edge_in_prev = &lock_request->l_edge; + lock_request->l_edge.edge_adj_next = &lock_request->l_edge; + lock_request->l_edge.edge_adj_prev = &lock_request->l_edge; + lock_request->l_status = FLK_INITIAL_STATE; + } else { + lock_request = flk_get_lock(); + fp->f_filock = (struct filock *)lock_request; + } + lock_request->l_state = 0; + lock_request->l_vnode = vp; + lock_request->l_zoneid = getzoneid(); + lock_request->l_ofd = fp; + + /* + * Convert the request range into the canonical start and end + * values then check the validity of the lock range. 
+ */ + error = flk_convert_lock_data(vp, lckdat, &lock_request->l_start, + &lock_request->l_end, offset); + if (error) + goto done; + + error = flk_check_lock_data(lock_request->l_start, lock_request->l_end, + MAXEND); + if (error) + goto done; + + ASSERT(lock_request->l_end >= lock_request->l_start); + + lock_request->l_type = lckdat->l_type; + if (cmd & SLPFLCK) + lock_request->l_state |= WILLING_TO_SLEEP_LOCK; + + if (!(cmd & SETFLCK)) { + if (lock_request->l_type == F_RDLCK || + lock_request->l_type == F_WRLCK) + lock_request->l_state |= QUERY_LOCK; + } + lock_request->l_flock = (*lckdat); + + /* + * We are ready for processing the request + */ + + if (fcmd != F_OFD_GETLK && lock_request->l_type != F_UNLCK && + nbl_need_check(vp)) { + nbl_start_crit(vp, RW_WRITER); + serialize = 1; + } + /* Get the lock graph for a particular vnode */ + gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH); + + mutex_enter(&gp->gp_mutex); + + lock_request->l_state |= REFERENCED_LOCK; + lock_request->l_graph = gp; + + switch (lock_request->l_type) { + case F_RDLCK: + case F_WRLCK: + if (IS_QUERY_LOCK(lock_request)) { + flk_get_first_blocking_lock(lock_request); + if (lock_request->l_ofd != NULL) + lock_request->l_flock.l_pid = -1; + (*lckdat) = lock_request->l_flock; + } else { + /* process the request now */ + error = flk_process_request(lock_request); + } + break; + + case F_UNLCK: + /* unlock request will not block so execute it immediately */ + error = flk_execute_request(lock_request); + break; + + default: + error = EINVAL; + break; + } + + if (lock_request == &stack_lock_request) { + flk_set_state(lock_request, FLK_DEAD_STATE); + } else { + lock_request->l_state &= ~REFERENCED_LOCK; + if ((error != 0) || IS_DELETED(lock_request)) { + flk_set_state(lock_request, FLK_DEAD_STATE); + flk_free_lock(lock_request); + } + } + + mutex_exit(&gp->gp_mutex); + if (serialize) + nbl_end_crit(vp); + + return (error); + +done: + flk_set_state(lock_request, FLK_DEAD_STATE); + if (lock_request != 
&stack_lock_request) + flk_free_lock(lock_request); + return (error); +} + +/* + * Remove any lock on the vnode belonging to the given file_t. + * Called from closef on last close, file_t is locked. + * + * This is modeled on the cleanlocks() function but only removes the single + * lock associated with fp. + */ +void +ofdcleanlock(file_t *fp) +{ + lock_descriptor_t *fplock, *lock, *nlock; + vnode_t *vp; + graph_t *gp; + + ASSERT(MUTEX_HELD(&fp->f_tlock)); + + if ((fplock = (lock_descriptor_t *)fp->f_filock) == NULL) + return; + + fp->f_filock = NULL; + vp = fp->f_vnode; + + gp = flk_get_lock_graph(vp, FLK_USE_GRAPH); + + if (gp == NULL) + return; + mutex_enter(&gp->gp_mutex); + + CHECK_SLEEPING_LOCKS(gp); + CHECK_ACTIVE_LOCKS(gp); + + SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp); + + if (lock) { + do { + nlock = lock->l_next; + if (fplock == lock) { + CANCEL_WAKEUP(lock); + break; + } + lock = nlock; + } while (lock->l_vnode == vp); + } + + SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); + + if (lock) { + do { + nlock = lock->l_next; + if (fplock == lock) { + flk_delete_active_lock(lock, 0); + flk_wakeup(lock, 1); + flk_free_lock(lock); + break; + } + lock = nlock; + } while (lock->l_vnode == vp); + } + + CHECK_SLEEPING_LOCKS(gp); + CHECK_ACTIVE_LOCKS(gp); + mutex_exit(&gp->gp_mutex); +} + +/* + * Routine called from fs_frlock in fs/fs_subr.c + * + * This implements traditional POSIX style record locking. The two primary + * drawbacks to this style of locking are: + * 1) It is per-process, so any close of a file descriptor that refers to the + * file will drop the lock (e.g. lock /etc/passwd, call a library function + * which opens /etc/passwd to read the file, when the library closes its + * file descriptor the application loses its lock and does not know). + * 2) Locks are not preserved across fork(2). + * + * Because these locks are only associated with a pid they are per-process. 
+ * This is why any close will drop the lock and is also why once the process + * forks then the lock is no longer related to the new process. These locks can + * be considered as pid-ful. + * + * See ofdlock() for the implementation of a similar but improved locking + * scheme. + */ int reclock(vnode_t *vp, flock64_t *lckdat, @@ -424,6 +709,8 @@ reclock(vnode_t *vp, case F_WRLCK: if (IS_QUERY_LOCK(lock_request)) { flk_get_first_blocking_lock(lock_request); + if (lock_request->l_ofd != NULL) + lock_request->l_flock.l_pid = -1; (*lckdat) = lock_request->l_flock; break; } @@ -712,7 +999,13 @@ flk_get_lock(void) void flk_free_lock(lock_descriptor_t *lock) { + file_t *fp; + ASSERT(IS_DEAD(lock)); + + if ((fp = lock->l_ofd) != NULL) + fp->f_filock = NULL; + if (IS_REFERENCED(lock)) { lock->l_state |= DELETED_LOCK; return; @@ -1214,7 +1507,7 @@ flk_add_edge(lock_descriptor_t *from_lock, lock_descriptor_t *to_lock, from_lock->l_edge.edge_adj_next = edge; /* - * put in in list of to vertex + * put in list of to vertex */ to_lock->l_edge.edge_in_next->edge_in_prev = edge; @@ -2601,9 +2894,11 @@ flk_canceled(lock_descriptor_t *request) } /* - * Remove all the locks for the vnode belonging to the given pid and sysid. + * Remove all non-OFD locks for the vnode belonging to the given pid and sysid. + * That is, since OFD locks are pid-less we'll never match on the incoming + * pid. OFD locks are removed earlier in the close() path via closef() and + * ofdcleanlock(). */ - void cleanlocks(vnode_t *vp, pid_t pid, int sysid) { @@ -2770,6 +3065,14 @@ flk_check_deadlock(lock_descriptor_t *lock) edge_t *ep, *nep; proc_vertex_t *process_stack; + /* + * OFD style locks are not associated with any process so there is + * no proc graph for these. Thus we cannot, and do not, do deadlock + * detection. 
+ */ + if (lock->l_ofd != NULL) + return (0); + STACK_INIT(process_stack); mutex_enter(&flock_lock); @@ -3062,6 +3365,16 @@ flk_update_proc_graph(edge_t *ep, int delete) proc_edge_t *pep, *prevpep; mutex_enter(&flock_lock); + + /* + * OFD style locks are not associated with any process so there is + * no proc graph for these. + */ + if (ep->from_vertex->l_ofd != NULL) { + mutex_exit(&flock_lock); + return; + } + toproc = flk_get_proc_vertex(ep->to_vertex); fromproc = flk_get_proc_vertex(ep->from_vertex); @@ -3891,6 +4204,7 @@ report_blocker(lock_descriptor_t *blocker, lock_descriptor_t *request) flrp->l_type = blocker->l_type; flrp->l_pid = blocker->l_flock.l_pid; flrp->l_sysid = blocker->l_flock.l_sysid; + request->l_ofd = blocker->l_ofd; if (IS_LOCKMGR(request)) { flrp->l_start = blocker->l_start; @@ -4206,6 +4520,10 @@ check_owner_locks(graph_t *gp, pid_t pid, int sysid, vnode_t *vp) { lock_descriptor_t *lock; + /* Ignore OFD style locks since they're not process-wide. */ + if (pid == 0) + return; + SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); if (lock) { |