diff options
author | Bryan Cantrill <bryan@joyent.com> | 2014-04-29 08:03:54 +0000 |
---|---|---|
committer | Bryan Cantrill <bryan@joyent.com> | 2014-04-29 08:03:54 +0000 |
commit | 0471db62fb08f915e84fd167dbaa3026c08e8b65 (patch) | |
tree | 90d257c423fc1e3ec515f83275945cc6fe26c178 | |
parent | ed5eb5431dd90bfbd4f7846c63abf835255b336a (diff) | |
download | illumos-joyent-0471db62fb08f915e84fd167dbaa3026c08e8b65.tar.gz |
OS-2963 lx brand needs to support FUTEX_WAKE_OP
-rw-r--r-- | usr/src/uts/common/brand/lx/sys/lx_futex.h | 61 | ||||
-rw-r--r-- | usr/src/uts/common/brand/lx/syscall/lx_futex.c | 315 |
2 files changed, 347 insertions, 29 deletions
diff --git a/usr/src/uts/common/brand/lx/sys/lx_futex.h b/usr/src/uts/common/brand/lx/sys/lx_futex.h index b5c5334bff..4d81386057 100644 --- a/usr/src/uts/common/brand/lx/sys/lx_futex.h +++ b/usr/src/uts/common/brand/lx/sys/lx_futex.h @@ -23,11 +23,13 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ + #ifndef _SYS_LX_FUTEX_H #define _SYS_LX_FUTEX_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -37,7 +39,60 @@ extern "C" { #define FUTEX_FD 2 #define FUTEX_REQUEUE 3 #define FUTEX_CMP_REQUEUE 4 -#define FUTEX_MAX_CMD FUTEX_CMP_REQUEUE +#define FUTEX_WAKE_OP 5 +#define FUTEX_LOCK_PI 6 +#define FUTEX_UNLOCK_PI 7 +#define FUTEX_TRYLOCK_PI 8 +#define FUTEX_WAIT_BITSET 9 +#define FUTEX_WAKE_BITSET 10 +#define FUTEX_WAIT_REQUEUE_PI 11 +#define FUTEX_CMP_REQUEUE_PI 12 +#define FUTEX_MAX_CMD FUTEX_CMP_REQUEUE_PI + +/* + * Flags that can be OR'd into a futex operation. + */ +#define FUTEX_CMD_MASK 0x007f +#define FUTEX_PRIVATE_FLAG 0x0080 +#define FUTEX_CLOCK_REALTIME 0x0100 + +/* + * FUTEX_WAKE_OP operations + */ +#define FUTEX_OP_SET 0 /* *(int *)UADDR2 = OPARG; */ +#define FUTEX_OP_ADD 1 /* *(int *)UADDR2 += OPARG; */ +#define FUTEX_OP_OR 2 /* *(int *)UADDR2 |= OPARG; */ +#define FUTEX_OP_ANDN 3 /* *(int *)UADDR2 &= ~OPARG; */ +#define FUTEX_OP_XOR 4 /* *(int *)UADDR2 ^= OPARG; */ + +/* + * FUTEX_WAKE_OP comparison operations + */ +#define FUTEX_OP_CMP_EQ 0 /* if (oldval == CMPARG) wake */ +#define FUTEX_OP_CMP_NE 1 /* if (oldval != CMPARG) wake */ +#define FUTEX_OP_CMP_LT 2 /* if (oldval < CMPARG) wake */ +#define FUTEX_OP_CMP_LE 3 /* if (oldval <= CMPARG) wake */ +#define FUTEX_OP_CMP_GT 4 /* if (oldval > CMPARG) wake */ +#define FUTEX_OP_CMP_GE 5 /* if (oldval >= CMPARG) wake */ + +/* + * The encoding of the FUTEX_WAKE_OP operation in 32 bits: + * + * +--+-- - --+-- - --+-- - --+-- - --+ + * |S |OP |CMP |OPARG |CMPARG | + * +--+-- - --+-- - --+-- - --+-- - --+ + * |31|30 - 28|27 - 24|23 - 12|11 - 0| + * + * The S bit denotes that the OPARG should be (1 << OPARG) instead of OPARG. + * (Yes, this whole thing is entirely absurd -- see the block comment in + * lx_futex.c for an explanation of this nonsense.) Macros to extract the + * various components from the operation, given the above encoding: + */ +#define FUTEX_OP_OP(x) (((x) >> 28) & 7) +#define FUTEX_OP_CMP(x) (((x) >> 24) & 15) +#define FUTEX_OP_OPARG(x) (((x) >> 31) ? (1 << (((x) << 8) >> 20)) : \ + ((((x) << 8) >> 20))) +#define FUTEX_OP_CMPARG(x) (((x) << 20) >> 20) #ifdef _KERNEL extern long lx_futex(uintptr_t addr, int cmd, int val, uintptr_t lx_timeout, diff --git a/usr/src/uts/common/brand/lx/syscall/lx_futex.c b/usr/src/uts/common/brand/lx/syscall/lx_futex.c index ee5fa7993d..428f6b01f0 100644 --- a/usr/src/uts/common/brand/lx/syscall/lx_futex.c +++ b/usr/src/uts/common/brand/lx/syscall/lx_futex.c @@ -23,7 +23,9 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ #include <sys/types.h> #include <sys/systm.h> @@ -81,6 +83,64 @@ * Wake up val1 processes that are waiting on futex1. The call * returns the number of blocked threads that were woken up. * + * FUTEX_WAKE_OP + * + * The implementation of a conditional variable in terms of futexes + * actually uses two futexes: one to assure sequential access and one to + * represent the condition variable. This implementation gives rise to a + * particular performance problem whereby a thread is awoken on the futex + * that represents the condition variable only to have to (potentially) + * immediately wait on the futex that protects the condition variable. + * (Do not confuse the futex that serves to protect the condition variable + * with the pthread_mutex_t associated with pthread_cond_t -- which + * represents a third futex.) To (over)solve this problem, FUTEX_WAKE_OP + * was invented, which performs an atomic compare-and-exchange on a + * second address in a specified fashion (that is, with a specified + * operation). Here are the possible operations (OPARG is defined + * to be 12 bit value embedded in the operation): + * + * - FUTEX_OP_SET: Sets the value at the second address to OPARG + * - FUTEX_OP_ADD: Adds the value to OPARG + * - FUTEX_OP_OR: OR's the value with OPARG + * - FUTEX_OP_ANDN: Performs a negated AND of the value with OPARG + * - FUTEX_OP_XOR: XOR's the value with OPARG + * + * After this compare-and-exchange on the second address, a FUTEX_WAKE is + * performed on the first address and -- if the compare-and-exchange + * matches a specified result based on a specified comparison operation -- + * a FUTEX_WAKE is performed on the second address. Here are the possible + * comparison operations: + * + * - FUTEX_OP_CMP_EQ: If old value is CMPARG, wake + * - FUTEX_OP_CMP_NE: If old value is not equal to CMPARG, wake + * - FUTEX_OP_CMP_LT: If old value is less than CMPARG, wake + * - FUTEX_OP_CMP_LE: If old value is less than or equal to CMPARG, wake + * - FUTEX_OP_CMP_GT: If old value is greater than CMPARG, wake + * - FUTEX_OP_CMP_GE: If old value is greater than or equal to CMPARG, wake + * + * As a practical matter, the only way that this is used (or, some might + * argue, is usable) is by the implementation of pthread_cond_signal(), + * which uses FUTEX_WAKE_OP to -- in a single system call -- unlock the + * futex that protects the condition variable and wake the futex that + * represents the condition variable. The second wake-up is conditional + * because the futex that protects the condition variable (rather than the + * one that represents it) may or may not have waiters. Given that this + * is the use case, FUTEX_WAKE_OP is falsely generic: despite allowing for + * five different kinds of operations and six different kinds of + * comparision operations, in practice only one is used. (Namely, setting + * to 0 and waking if the old value is greater than 1 -- which denotes + * that waiters are present and the wakeup should be performed.) Moreover, + * because FUTEX_WAKE_OP does not (and cannot) optimize anything in the + * case that the pthread_mutex_t associated with the pthread_cond_t is + * held at the time of a pthread_cond_signal(), this entire mechanism is + * essentially for naught in this case. As one can imagine (and can + * verify on just about any source base that uses pthread_cond_signal()), + * it is overwhelmingly the common case that the lock associated with the + * pthread_cond_t is held at the time of pthread_cond_signal(), assuring + * that the problem that all of this complexity was designed to solve + * isn't, in fact, solved because the signalled thread simply wakes up + * only to block again on the held mutex. Cue a slow clap! + * * FUTEX_CMP_REQUEUE * * If the value stored in futex1 matches that passed in in val2, wake @@ -105,7 +165,9 @@ * FUTEX_FD * * Return a file descriptor, which can be used to refer to the futex. - * We don't support this operation. + * This operation was broken by design, and was blessedly removed in + * Linux 2.6.26 ("because it was inherently racy"); it should go without + * saying that we don't support this operation. */ /* @@ -261,6 +323,159 @@ futex_wake(memid_t *memid, int wake_threads) return (ret); } +static int +futex_wake_op_execute(int32_t *addr, int32_t val3) +{ + int32_t op = FUTEX_OP_OP(val3); + int32_t cmp = FUTEX_OP_CMP(val3); + int32_t cmparg = FUTEX_OP_CMPARG(val3); + int32_t oparg, oldval, newval; + label_t ljb; + int rval; + + if ((uintptr_t)addr >= KERNELBASE) + return (set_errno(EFAULT)); + + if (on_fault(&ljb)) + return (set_errno(EFAULT)); + + oparg = FUTEX_OP_OPARG(val3); + + do { + oldval = *addr; + newval = oparg; + + switch (op) { + case FUTEX_OP_SET: + break; + + case FUTEX_OP_ADD: + newval += oparg; + break; + + case FUTEX_OP_OR: + newval |= oparg; + break; + + case FUTEX_OP_ANDN: + newval &= ~oparg; + break; + + case FUTEX_OP_XOR: + newval ^= oparg; + break; + + default: + no_fault(); + return (set_errno(EINVAL)); + } + } while (cas32((uint32_t *)addr, oldval, newval) != oldval); + + no_fault(); + + switch (cmp) { + case FUTEX_OP_CMP_EQ: + rval = (oldval == cmparg); + break; + + case FUTEX_OP_CMP_NE: + rval = (oldval != cmparg); + break; + + case FUTEX_OP_CMP_LT: + rval = (oldval < cmparg); + break; + + case FUTEX_OP_CMP_LE: + rval = (oldval <= cmparg); + break; + + case FUTEX_OP_CMP_GT: + rval = (oldval > cmparg); + break; + + case FUTEX_OP_CMP_GE: + rval = (oldval >= cmparg); + break; + + default: + return (set_errno(EINVAL)); + } + + return (rval); +} + +static int +futex_wake_op(memid_t *memid, caddr_t addr2, memid_t *memid2, + int wake_threads, int wake_threads2, int val3) +{ + kmutex_t *l1, *l2; + int ret = 0, ret2 = 0, wake; + fwaiter_t *fwp, *next; + int index1, index2; + + index1 = HASH_FUNC(memid); + index2 = HASH_FUNC(memid2); + + if (index1 == index2) { + l1 = &futex_hash_lock[index1]; + l2 = NULL; + } else if (index1 < index2) { + l1 = &futex_hash_lock[index1]; + l2 = &futex_hash_lock[index2]; + } else { + l1 = &futex_hash_lock[index2]; + l2 = &futex_hash_lock[index1]; + } + + mutex_enter(l1); + if (l2 != NULL) + mutex_enter(l2); + + /* LINTED: alignment */ + if ((wake = futex_wake_op_execute((int32_t *)addr2, val3)) < 0) + goto out; + + for (fwp = futex_hash[index1]; fwp; fwp = next) { + next = fwp->fw_next; + if (!MEMID_EQUAL(&fwp->fw_memid, memid)) + continue; + + futex_hashout(fwp); + if (ret++ < wake_threads) { + fwp->fw_woken = 1; + cv_signal(&fwp->fw_cv); + } else { + break; + } + } + + if (!wake) + goto out; + + for (fwp = futex_hash[index2]; fwp; fwp = next) { + next = fwp->fw_next; + if (!MEMID_EQUAL(&fwp->fw_memid, memid2)) + continue; + + futex_hashout(fwp); + if (ret2++ < wake_threads2) { + fwp->fw_woken = 1; + cv_signal(&fwp->fw_cv); + } else { + break; + } + } + + ret += ret2; +out: + if (l2 != NULL) + mutex_exit(l2); + mutex_exit(l1); + + return (ret); +} + /* * Wake up to wake_threads waiting on the futex at memid. If there are * more than that many threads waiting, requeue the remaining threads on @@ -371,16 +586,17 @@ get_timeout(void *lx_timeout, timestruc_t *timeout) } long -lx_futex(uintptr_t addr, int cmd, int val, uintptr_t lx_timeout, - uintptr_t addr2, int val2) +lx_futex(uintptr_t addr, int op, int val, uintptr_t lx_timeout, + uintptr_t addr2, int val3) { struct as *as = curproc->p_as; - memid_t memid, requeue_memid; + memid_t memid, memid2; timestruc_t timeout; timestruc_t *tptr = NULL; - int requeue_threads = NULL; - int *requeue_cmp = NULL; + int val2 = NULL; int rval = 0; + int cmd = op & FUTEX_CMD_MASK; + int private = op & FUTEX_PRIVATE_FLAG; /* must be aligned on int boundary */ if (addr & 0x3) @@ -390,6 +606,32 @@ lx_futex(uintptr_t addr, int cmd, int val, uintptr_t lx_timeout, if (cmd < 0 || cmd > FUTEX_MAX_CMD) return (set_errno(EINVAL)); + if (cmd == FUTEX_FD) { + /* + * FUTEX_FD was sentenced to death for grievous crimes of + * semantics against humanity; it has been ripped out of Linux + * and will never be supported by us. + */ + return (set_errno(ENOSYS)); + } + + switch (cmd) { + case FUTEX_LOCK_PI: + case FUTEX_UNLOCK_PI: + case FUTEX_TRYLOCK_PI: + case FUTEX_WAIT_BITSET: + case FUTEX_WAKE_BITSET: + case FUTEX_WAIT_REQUEUE_PI: + case FUTEX_CMP_REQUEUE_PI: + /* + * These are operations that we don't currently support, but + * may well need to in the future. For now, callers need to + * deal with these being missing -- but if and as that changes, + * they may well need to be implemented. + */ + return (set_errno(ENOSYS)); + } + /* Copy in the timeout structure from userspace. */ if (cmd == FUTEX_WAIT && lx_timeout != NULL) { rval = get_timeout((timespec_t *)lx_timeout, &timeout); @@ -398,33 +640,49 @@ lx_futex(uintptr_t addr, int cmd, int val, uintptr_t lx_timeout, tptr = &timeout; } - if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE) { - if (cmd == FUTEX_CMP_REQUEUE) - requeue_cmp = &val2; - + switch (cmd) { + case FUTEX_REQUEUE: + case FUTEX_CMP_REQUEUE: + case FUTEX_WAKE_OP: /* - * lx_timeout is nominally a pointer to a userspace - * address. For these two commands, it actually contains - * an integer which indicates the maximum number of threads - * to requeue. This is horrible, and I'm sorry. + * lx_timeout is nominally a pointer to a userspace address. + * For several commands, however, it actually contains + * an additional interage parameter. This is horrible, and + * the people who did this to us should be sorry. */ - requeue_threads = (int)lx_timeout; + val2 = (int)lx_timeout; } /* * Translate the process-specific, user-space futex virtual - * address(es) to universal memid. + * address(es) to a universal memid. If the private bit is set, we + * can just use our as plus the virtual address, saving quite a bit + * of effort. */ - rval = as_getmemid(as, (void *)addr, &memid); - if (rval != 0) - return (set_errno(rval)); - - if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE) { - rval = as_getmemid(as, (void *)addr2, &requeue_memid); - if (rval) + if (private) { + memid.val[0] = (uintptr_t)as; + memid.val[1] = (uintptr_t)addr; + } else { + rval = as_getmemid(as, (void *)addr, &memid); + if (rval != 0) return (set_errno(rval)); } + if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || + cmd == FUTEX_WAKE_OP) { + if (addr2 & 0x3) + return (set_errno(EINVAL)); + + if (private) { + memid2.val[0] = (uintptr_t)as; + memid2.val[1] = (uintptr_t)addr2; + } else { + rval = as_getmemid(as, (void *)addr2, &memid2); + if (rval) + return (set_errno(rval)); + } + } + switch (cmd) { case FUTEX_WAIT: rval = futex_wait(&memid, (void *)addr, val, tptr); @@ -434,10 +692,15 @@ lx_futex(uintptr_t addr, int cmd, int val, uintptr_t lx_timeout, rval = futex_wake(&memid, val); break; + case FUTEX_WAKE_OP: + rval = futex_wake_op(&memid, (void *)addr2, &memid2, + val, val2, val3); + break; + case FUTEX_CMP_REQUEUE: case FUTEX_REQUEUE: - rval = futex_requeue(&memid, &requeue_memid, val, - requeue_threads, (void *)addr2, requeue_cmp); + rval = futex_requeue(&memid, &memid2, val, + val2, (void *)addr2, &val3); break; } |