Diffstat (limited to 'usr/src/uts/common/os/flock.c')
-rw-r--r--  usr/src/uts/common/os/flock.c  326
1 file changed, 322 insertions(+), 4 deletions(-)
diff --git a/usr/src/uts/common/os/flock.c b/usr/src/uts/common/os/flock.c
index 5dad4abb61..a54d6028d5 100644
--- a/usr/src/uts/common/os/flock.c
+++ b/usr/src/uts/common/os/flock.c
@@ -29,6 +29,7 @@
/*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
#include <sys/flock_impl.h>
@@ -243,9 +244,293 @@ flk_get_lockmgr_status(void)
}
/*
- * Routine called from fs_frlock in fs/fs_subr.c
+ * This implements Open File Description (not descriptor) style record locking.
+ * These locks can also be thought of as pid-less since they are not tied to a
+ * specific process, thus they're preserved across fork.
+ *
+ * Called directly from fcntl.
+ *
+ * See reclock() for the implementation of the traditional POSIX style record
+ * locking scheme (pid-ful). This function is derived from reclock() but
+ * simplified and modified to work for OFD style locking.
+ *
+ * The two primary advantages of OFD-style locking are:
+ * 1) It is per-file description, so closing a file descriptor that refers to a
+ * different file description for the same file will not drop the lock (i.e.
+ * two opens of the same file get different descriptions, but a dup or fork
+ * will refer to the same description).
+ * 2) Locks are preserved across fork(2).
+ *
+ * Because these locks are per-description, a lock pointer lives at the f_filock
+ * member of the file_t and the lock_descriptor includes a file_t pointer
+ * to enable unique lock identification and management.
+ *
+ * Since these locks are pid-less we cannot do deadlock detection with the
+ * current process-oriented implementation. This is consistent with OFD locking
+ * behavior on other operating systems such as Linux. Since we don't do
+ * deadlock detection we never interact with the process graph that is
+ * maintained for deadlock detection on the traditional POSIX-style locks.
+ *
+ * Future Work:
+ *
+ * The current implementation does not support record locks. That is,
+ * currently the single lock must cover the entire file. This is validated in
+ * fcntl. To support record locks the f_filock pointer in the file_t needs to
+ * be changed to a list of pointers to the locks. That list needs to be
+ * managed independently of the lock list on the vnode itself and it needs to
+ * be maintained as record locks are created, split, coalesced and deleted.
+ *
+ * The current implementation does not support remote file systems (e.g.
+ * NFS or CIFS). This is handled in fs_frlock(). The design of how OFD locks
+ * interact with the NLM is not clear since the NLM protocol/implementation
+ * appears to be oriented around locks associated with a process. A further
+ * problem is that a design is needed for what nlm_send_siglost() should do and
+ * where it will send SIGLOST. More recent versions of Linux apparently try to
+ * emulate OFD locks on NFS by converting them to traditional POSIX style locks
+ * that work with the NLM. It is not clear that this provides the correct
+ * semantics in all cases.
*/
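As a concrete illustration of the per-description semantics described above, here is a minimal userland sketch (not part of this change; error handling elided, the /tmp path is hypothetical):

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	int
	main(void)
	{
		struct flock fl;
		int fd, fd2;

		fd = open("/tmp/ofd-demo", O_RDWR | O_CREAT, 0644);

		(void) memset(&fl, 0, sizeof (fl));
		fl.l_type = F_WRLCK;
		fl.l_whence = SEEK_SET;	/* whole-file lock: the only */
		fl.l_start = 0;		/* range this implementation */
		fl.l_len = 0;		/* currently supports */
		fl.l_pid = 0;		/* pid-less; the kernel uses 0 */
		(void) fcntl(fd, F_OFD_SETLK, &fl);

		fd2 = dup(fd);		/* same file description */
		(void) close(fd);	/* does NOT drop the lock ... */

		/* ... the lock is still held via fd2, and across fork(2) */

		(void) close(fd2);	/* last close drops the lock */
		return (0);
	}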
+int
+ofdlock(file_t *fp, int fcmd, flock64_t *lckdat, int flag, u_offset_t offset)
+{
+ int cmd = 0;
+ vnode_t *vp;
+ lock_descriptor_t stack_lock_request;
+ lock_descriptor_t *lock_request;
+ int error = 0;
+ graph_t *gp;
+ int serialize = 0;
+
+ if (fcmd != F_OFD_GETLK)
+ cmd = SETFLCK;
+
+ if (fcmd == F_OFD_SETLKW || fcmd == F_FLOCKW)
+ cmd |= SLPFLCK;
+
+ /* whole-file locks only (validated in fcntl); see block comment above */
+ VERIFY(lckdat->l_whence == 0);
+ VERIFY(lckdat->l_start == 0);
+ VERIFY(lckdat->l_len == 0);
+
+ vp = fp->f_vnode;
+
+ /*
+ * For reclock, fs_frlock() would normally have set these in a few
+ * places, but for us it's cleaner to centralize it here. Note that
+ * IGN_PID is -1; we use 0 for our pid-less locks.
+ */
+ lckdat->l_pid = 0;
+ lckdat->l_sysid = 0;
+
+ /*
+ * Check access permissions
+ */
+ if ((fcmd == F_OFD_SETLK || fcmd == F_OFD_SETLKW) &&
+ ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
+ (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
+ return (EBADF);
+
+ /*
+ * For query and unlock requests we use the stack-allocated
+ * stack_lock_request, since no lock needs to persist.
+ */
+ if (lckdat->l_type == F_UNLCK || !(cmd & SETFLCK)) {
+ lock_request = &stack_lock_request;
+ (void) bzero((caddr_t)lock_request,
+ sizeof (lock_descriptor_t));
+
+ /*
+ * The following initialization (self-referencing edge lists)
+ * is added to make the assertions in flk_execute_request() pass.
+ */
+ lock_request->l_edge.edge_in_next = &lock_request->l_edge;
+ lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
+ lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
+ lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
+ lock_request->l_status = FLK_INITIAL_STATE;
+ } else {
+ lock_request = flk_get_lock();
+ fp->f_filock = (struct filock *)lock_request;
+ }
+ lock_request->l_state = 0;
+ lock_request->l_vnode = vp;
+ lock_request->l_zoneid = getzoneid();
+ lock_request->l_ofd = fp;
+
+ /*
+ * Convert the request range into the canonical start and end
+ * values then check the validity of the lock range.
+ */
+ error = flk_convert_lock_data(vp, lckdat, &lock_request->l_start,
+ &lock_request->l_end, offset);
+ if (error)
+ goto done;
+
+ error = flk_check_lock_data(lock_request->l_start, lock_request->l_end,
+ MAXEND);
+ if (error)
+ goto done;
+
+ ASSERT(lock_request->l_end >= lock_request->l_start);
+
+ lock_request->l_type = lckdat->l_type;
+ if (cmd & SLPFLCK)
+ lock_request->l_state |= WILLING_TO_SLEEP_LOCK;
+
+ if (!(cmd & SETFLCK)) {
+ if (lock_request->l_type == F_RDLCK ||
+ lock_request->l_type == F_WRLCK)
+ lock_request->l_state |= QUERY_LOCK;
+ }
+ lock_request->l_flock = (*lckdat);
+
+ /*
+ * We are now ready to process the request.
+ */
+
+ if (fcmd != F_OFD_GETLK && lock_request->l_type != F_UNLCK &&
+ nbl_need_check(vp)) {
+ nbl_start_crit(vp, RW_WRITER);
+ serialize = 1;
+ }
+ /* Get the lock graph for a particular vnode */
+ gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);
+
+ mutex_enter(&gp->gp_mutex);
+
+ lock_request->l_state |= REFERENCED_LOCK;
+ lock_request->l_graph = gp;
+
+ switch (lock_request->l_type) {
+ case F_RDLCK:
+ case F_WRLCK:
+ if (IS_QUERY_LOCK(lock_request)) {
+ flk_get_first_blocking_lock(lock_request);
+ if (lock_request->l_ofd != NULL)
+ lock_request->l_flock.l_pid = -1;
+ (*lckdat) = lock_request->l_flock;
+ } else {
+ /* process the request now */
+ error = flk_process_request(lock_request);
+ }
+ break;
+
+ case F_UNLCK:
+ /* an unlock request will not block, so execute it immediately */
+ error = flk_execute_request(lock_request);
+ break;
+
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ if (lock_request == &stack_lock_request) {
+ flk_set_state(lock_request, FLK_DEAD_STATE);
+ } else {
+ lock_request->l_state &= ~REFERENCED_LOCK;
+ if ((error != 0) || IS_DELETED(lock_request)) {
+ flk_set_state(lock_request, FLK_DEAD_STATE);
+ flk_free_lock(lock_request);
+ }
+ }
+
+ mutex_exit(&gp->gp_mutex);
+ if (serialize)
+ nbl_end_crit(vp);
+
+ return (error);
+
+done:
+ flk_set_state(lock_request, FLK_DEAD_STATE);
+ if (lock_request != &stack_lock_request)
+ flk_free_lock(lock_request);
+ return (error);
+}
+
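For reference, the range canonicalization done by flk_convert_lock_data() maps (l_whence, l_start, l_len) onto an inclusive [start, end] pair; since OFD requests are validated to be whole-file, the result here is always [0, MAXEND]. Below is a simplified sketch of the positive-length cases only (a paraphrase for illustration, not the actual routine, which also resolves SEEK_END against the vnode's size and accepts negative l_len):

	#include <sys/types.h>
	#include <sys/errno.h>

	#define	SKETCH_MAXEND	((u_offset_t)-1)	/* stand-in for MAXEND */

	static int
	convert_range_sketch(u_offset_t base, offset_t l_start, offset_t l_len,
	    u_offset_t *start, u_offset_t *end)
	{
		/* base is 0 for SEEK_SET, the file offset for SEEK_CUR */
		*start = base + l_start;
		if (l_len == 0)
			*end = SKETCH_MAXEND;	/* "lock to end of file" */
		else if (l_len > 0)
			*end = *start + l_len - 1;	/* inclusive end */
		else
			return (EINVAL);	/* negative l_len elided here */
		return (0);
	}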
+/*
+ * Remove any lock on the vnode belonging to the given file_t.
+ * Called from closef() on last close; the file_t is locked.
+ *
+ * This is modeled on the cleanlocks() function but only removes the single
+ * lock associated with fp.
+ */
+void
+ofdcleanlock(file_t *fp)
+{
+ lock_descriptor_t *fplock, *lock, *nlock;
+ vnode_t *vp;
+ graph_t *gp;
+
+ ASSERT(MUTEX_HELD(&fp->f_tlock));
+
+ if ((fplock = (lock_descriptor_t *)fp->f_filock) == NULL)
+ return;
+
+ fp->f_filock = NULL;
+ vp = fp->f_vnode;
+
+ gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
+
+ if (gp == NULL)
+ return;
+ mutex_enter(&gp->gp_mutex);
+
+ CHECK_SLEEPING_LOCKS(gp);
+ CHECK_ACTIVE_LOCKS(gp);
+
+ SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
+
+ if (lock) {
+ do {
+ nlock = lock->l_next;
+ if (fplock == lock) {
+ CANCEL_WAKEUP(lock);
+ break;
+ }
+ lock = nlock;
+ } while (lock->l_vnode == vp);
+ }
+
+ SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
+
+ if (lock) {
+ do {
+ nlock = lock->l_next;
+ if (fplock == lock) {
+ flk_delete_active_lock(lock, 0);
+ flk_wakeup(lock, 1);
+ flk_free_lock(lock);
+ break;
+ }
+ lock = nlock;
+ } while (lock->l_vnode == vp);
+ }
+
+ CHECK_SLEEPING_LOCKS(gp);
+ CHECK_ACTIVE_LOCKS(gp);
+ mutex_exit(&gp->gp_mutex);
+}
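The caller's side of the ASSERT above, roughly: closef() is expected to invoke this under f_tlock when the last reference to the file_t goes away. A sketch of that interaction (an assumption for illustration only, not the actual closef() code):

	mutex_enter(&fp->f_tlock);
	if (--fp->f_count <= 0)
		ofdcleanlock(fp);	/* last close: drop any OFD lock */
	mutex_exit(&fp->f_tlock);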
+
+/*
+ * Routine called from fs_frlock in fs/fs_subr.c
+ *
+ * This implements traditional POSIX style record locking. The two primary
+ * drawbacks to this style of locking are:
+ * 1) It is per-process, so any close of a file descriptor that refers to the
+ * file will drop the lock (e.g. lock /etc/passwd, call a library function
+ * which opens /etc/passwd to read the file; when the library closes its
+ * file descriptor, the application silently loses its lock).
+ * 2) Locks are not preserved across fork(2).
+ *
+ * Because these locks are associated only with a pid, they are per-process.
+ * This is why any close will drop the lock, and why after the process
+ * forks the lock is no longer related to the new process. These locks can
+ * be considered pid-ful.
+ *
+ * See ofdlock() for the implementation of a similar but improved locking
+ * scheme.
+ */
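A minimal userland demonstration of drawback 1 above (not part of this change; error handling elided):

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	int
	main(void)
	{
		struct flock fl;
		int fd, fd2;

		fd = open("/etc/passwd", O_RDONLY);

		(void) memset(&fl, 0, sizeof (fl));
		fl.l_type = F_RDLCK;	/* whole-file read lock */
		fl.l_whence = SEEK_SET;
		fl.l_start = 0;
		fl.l_len = 0;
		(void) fcntl(fd, F_SETLK, &fl);

		/*
		 * Any other open/close of the same file by this process,
		 * e.g. inside a library routine, silently drops the lock,
		 * because POSIX locks belong to the (pid, file) pair.
		 */
		fd2 = open("/etc/passwd", O_RDONLY);
		(void) close(fd2);	/* the lock taken on fd is now gone */

		(void) close(fd);
		return (0);
	}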
int
reclock(vnode_t *vp,
flock64_t *lckdat,
@@ -424,6 +709,8 @@ reclock(vnode_t *vp,
case F_WRLCK:
if (IS_QUERY_LOCK(lock_request)) {
flk_get_first_blocking_lock(lock_request);
+ if (lock_request->l_ofd != NULL)
+ lock_request->l_flock.l_pid = -1;
(*lckdat) = lock_request->l_flock;
break;
}
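The caller-visible effect of the two added lines: when an F_GETLK (or F_OFD_GETLK) query finds that the blocking lock is an OFD-style lock, there is no owning pid to report, so l_pid comes back as -1. A sketch of how a caller can detect this:

	#include <fcntl.h>
	#include <string.h>

	/* Returns 1 if fd is blocked by a pid-less (OFD-style) lock. */
	static int
	blocked_by_ofd(int fd)
	{
		struct flock fl;

		(void) memset(&fl, 0, sizeof (fl));
		fl.l_type = F_WRLCK;	/* probe: would a write lock conflict? */
		fl.l_whence = SEEK_SET;
		fl.l_start = 0;
		fl.l_len = 0;
		if (fcntl(fd, F_GETLK, &fl) != 0)
			return (0);
		return (fl.l_type != F_UNLCK && fl.l_pid == -1);
	}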
@@ -712,7 +999,13 @@ flk_get_lock(void)
void
flk_free_lock(lock_descriptor_t *lock)
{
+ file_t *fp;
+
ASSERT(IS_DEAD(lock));
+
+ if ((fp = lock->l_ofd) != NULL)
+ fp->f_filock = NULL;
+
if (IS_REFERENCED(lock)) {
lock->l_state |= DELETED_LOCK;
return;
@@ -1214,7 +1507,7 @@ flk_add_edge(lock_descriptor_t *from_lock, lock_descriptor_t *to_lock,
from_lock->l_edge.edge_adj_next = edge;
/*
- * put in in list of to vertex
+ * put in list of to vertex
*/
to_lock->l_edge.edge_in_next->edge_in_prev = edge;
@@ -2601,9 +2894,11 @@ flk_canceled(lock_descriptor_t *request)
}
/*
- * Remove all the locks for the vnode belonging to the given pid and sysid.
+ * Remove all non-OFD locks for the vnode belonging to the given pid and sysid.
+ * That is, since OFD locks are pid-less we'll never match on the incoming
+ * pid. OFD locks are removed earlier in the close() path via closef() and
+ * ofdcleanlock().
*/
-
void
cleanlocks(vnode_t *vp, pid_t pid, int sysid)
{
@@ -2770,6 +3065,14 @@ flk_check_deadlock(lock_descriptor_t *lock)
edge_t *ep, *nep;
proc_vertex_t *process_stack;
+ /*
+ * OFD style locks are not associated with any process so there is
+ * no proc graph for these. Thus we cannot, and do not, do deadlock
+ * detection.
+ */
+ if (lock->l_ofd != NULL)
+ return (0);
+
STACK_INIT(process_stack);
mutex_enter(&flock_lock);
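A caller-visible consequence of this early return: with traditional F_SETLKW locks the detector can fail one of two cross-waiting requests with EDEADLK, but OFD requests never enter the proc graph, so both callers simply block. A sketch of the pattern to avoid (hypothetical helper; imagine two processes calling it with the fds in opposite order):

	#include <fcntl.h>
	#include <string.h>

	static void
	lock_both(int fd_first, int fd_second)
	{
		struct flock fl;

		(void) memset(&fl, 0, sizeof (fl));
		fl.l_type = F_WRLCK;
		fl.l_whence = SEEK_SET;
		fl.l_start = 0;
		fl.l_len = 0;
		fl.l_pid = 0;

		(void) fcntl(fd_first, F_OFD_SETLKW, &fl);
		/* with crossed orderings, this may block forever */
		(void) fcntl(fd_second, F_OFD_SETLKW, &fl);
	}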
@@ -3062,6 +3365,16 @@ flk_update_proc_graph(edge_t *ep, int delete)
proc_edge_t *pep, *prevpep;
mutex_enter(&flock_lock);
+
+ /*
+ * OFD style locks are not associated with any process so there is
+ * no proc graph for these.
+ */
+ if (ep->from_vertex->l_ofd != NULL) {
+ mutex_exit(&flock_lock);
+ return;
+ }
+
toproc = flk_get_proc_vertex(ep->to_vertex);
fromproc = flk_get_proc_vertex(ep->from_vertex);
@@ -3891,6 +4204,7 @@ report_blocker(lock_descriptor_t *blocker, lock_descriptor_t *request)
flrp->l_type = blocker->l_type;
flrp->l_pid = blocker->l_flock.l_pid;
flrp->l_sysid = blocker->l_flock.l_sysid;
+ request->l_ofd = blocker->l_ofd;
if (IS_LOCKMGR(request)) {
flrp->l_start = blocker->l_start;
@@ -4206,6 +4520,10 @@ check_owner_locks(graph_t *gp, pid_t pid, int sysid, vnode_t *vp)
{
lock_descriptor_t *lock;
+ /* Ignore OFD style locks since they're not process-wide. */
+ if (pid == 0)
+ return;
+
SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
if (lock) {