Diffstat (limited to 'usr/src/lib/libc')
-rw-r--r--  usr/src/lib/libc/Makefile.targ                      22
-rw-r--r--  usr/src/lib/libc/README                             53
-rw-r--r--  usr/src/lib/libc/amd64/Makefile                     29
-rw-r--r--  usr/src/lib/libc/amd64/gen/siglongjmp.c             13
-rw-r--r--  usr/src/lib/libc/common/sys/__clock_timer.s         14
-rw-r--r--  usr/src/lib/libc/common/sys/__signotify.s           13
-rw-r--r--  usr/src/lib/libc/common/sys/__sigrt.s               16
-rw-r--r--  usr/src/lib/libc/common/sys/kaio.s                  21
-rw-r--r--  usr/src/lib/libc/i386/Makefile.com                  29
-rw-r--r--  usr/src/lib/libc/i386/gen/siglongjmp.c              13
-rw-r--r--  usr/src/lib/libc/inc/asyncio.h                     346
-rw-r--r--  usr/src/lib/libc/inc/mtlib.h                        11
-rw-r--r--  usr/src/lib/libc/inc/rtsched.h                      44
-rw-r--r--  usr/src/lib/libc/inc/synonyms.h                     47
-rw-r--r--  usr/src/lib/libc/inc/thr_debug.h                    44
-rw-r--r--  usr/src/lib/libc/inc/thr_uberdata.h                 69
-rw-r--r--  usr/src/lib/libc/inc/thread_pool.h                  74
-rw-r--r--  usr/src/lib/libc/port/aio/aio.c                   2202
-rw-r--r--  usr/src/lib/libc/port/aio/aio_alloc.c              435
-rw-r--r--  usr/src/lib/libc/port/aio/posix_aio.c             1758
-rw-r--r--  usr/src/lib/libc/port/gen/event_port.c              11
-rw-r--r--  usr/src/lib/libc/port/llib-lc                        5
-rw-r--r--  usr/src/lib/libc/port/rt/clock_timer.c             179
-rw-r--r--  usr/src/lib/libc/port/rt/fallocate.c                72
-rw-r--r--  usr/src/lib/libc/port/rt/mqueue.c                 1101
-rw-r--r--  usr/src/lib/libc/port/rt/pos4obj.c                 482
-rw-r--r--  usr/src/lib/libc/port/rt/pos4obj.h                  78
-rw-r--r--  usr/src/lib/libc/port/rt/sched.c                   552
-rw-r--r--  usr/src/lib/libc/port/rt/sem.c                     367
-rw-r--r--  usr/src/lib/libc/port/rt/shm.c                      95
-rw-r--r--  usr/src/lib/libc/port/rt/sigev_thread.c            715
-rw-r--r--  usr/src/lib/libc/port/rt/sigev_thread.h            117
-rw-r--r--  usr/src/lib/libc/port/sys/fsync.c                   19
-rw-r--r--  usr/src/lib/libc/port/sys/sigstack.c                 7
-rw-r--r--  usr/src/lib/libc/port/threads/assfail.c             43
-rw-r--r--  usr/src/lib/libc/port/threads/pthr_attr.c            5
-rw-r--r--  usr/src/lib/libc/port/threads/pthread.c              4
-rw-r--r--  usr/src/lib/libc/port/threads/rtsched.c            120
-rw-r--r--  usr/src/lib/libc/port/threads/scalls.c              48
-rw-r--r--  usr/src/lib/libc/port/threads/sigaction.c           57
-rw-r--r--  usr/src/lib/libc/port/threads/spawn.c               16
-rw-r--r--  usr/src/lib/libc/port/threads/synch.c               71
-rw-r--r--  usr/src/lib/libc/port/threads/thr.c                 30
-rw-r--r--  usr/src/lib/libc/port/tpool/thread_pool.c          560
-rw-r--r--  usr/src/lib/libc/port/tpool/thread_pool_impl.h      99
-rw-r--r--  usr/src/lib/libc/sparc/Makefile                     29
-rw-r--r--  usr/src/lib/libc/sparcv9/Makefile                   29
-rw-r--r--  usr/src/lib/libc/spec/Makefile.targ                 11
-rw-r--r--  usr/src/lib/libc/spec/aio.spec                      83
-rw-r--r--  usr/src/lib/libc/spec/gen.spec                      25
-rw-r--r--  usr/src/lib/libc/spec/private.spec                  99
-rw-r--r--  usr/src/lib/libc/spec/rt.spec                      641
-rw-r--r--  usr/src/lib/libc/spec/sys.spec                       7
-rw-r--r--  usr/src/lib/libc/spec/threads.spec                  14
54 files changed, 10626 insertions, 418 deletions
diff --git a/usr/src/lib/libc/Makefile.targ b/usr/src/lib/libc/Makefile.targ
index e3bb69581a..26e8812bd7 100644
--- a/usr/src/lib/libc/Makefile.targ
+++ b/usr/src/lib/libc/Makefile.targ
@@ -2,9 +2,8 @@
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License"). You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
# CDDL HEADER END
#
#
-# Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
# ident "%Z%%M% %I% %E% SMI"
@@ -200,6 +199,21 @@ pics/%_c89.o: $(LIBCBASE)/../port/stdio/%.c
$(COMPILE.c) -o $@ $<
$(POST_PROCESS_O)
+# aio rules
+pics/%.o: $(LIBCBASE)/../port/aio/%.c
+ $(COMPILE.c) -o $@ $<
+ $(POST_PROCESS_O)
+
+# rt rules
+pics/%.o: $(LIBCBASE)/../port/rt/%.c
+ $(COMPILE.c) -o $@ $<
+ $(POST_PROCESS_O)
+
+# tpool rules
+pics/%.o: $(LIBCBASE)/../port/tpool/%.c
+ $(COMPILE.c) -o $@ $<
+ $(POST_PROCESS_O)
+
# threads rules
pics/%.o: $(LIBCBASE)/../port/threads/%.c
$(COMPILE.c) -o $@ $<
diff --git a/usr/src/lib/libc/README b/usr/src/lib/libc/README
index f3c5ab146d..289f766aef 100644
--- a/usr/src/lib/libc/README
+++ b/usr/src/lib/libc/README
@@ -2,9 +2,8 @@
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License"). You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
# CDDL HEADER END
#
#
-# Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
# ident "%Z%%M% %I% %E% SMI"
@@ -64,9 +63,12 @@ fork-safe) and in which the calling thread has all signals deferred
However, certain rules apply to the code within these critical regions:
- - The code must be of guaranteed short duration; no
- calls to interfaces that might block indefinitely are
- allowed. This means no calls into stdio or syslog().
+ - The code must be of guaranteed short duration; no calls
+ to interfaces that might block indefinitely are allowed.
+ This means no calls into stdio or syslog() and no calls
+ to cond_wait() unless there is a guarantee of an almost-
+ immediate call to cond_signal() or cond_broadcast()
+ from elsewhere.
- The code cannot call any non-l* synchronization
primitives (mutex_lock(), _private_mutex_lock(),
@@ -197,3 +199,40 @@ conditions such as the setting of CFLAGS and CPPFLAGS for the libc_i18n
stuff need to be compatible with the ones for the libc stuff. Whenever
changes that affect the compilation conditions of libc occur, the changes
should be propagated to libc_i18n.
+
+-----
+
+The putback of the project:
+ 6416832 libaio and librt can and should be folded into libc
+introduced several libc-private locking interfaces:
+ void sig_mutex_lock(mutex_t *);
+ void sig_mutex_unlock(mutex_t *);
+ int sig_mutex_trylock(mutex_t *);
+ int sig_cond_wait(cond_t *, mutex_t *);
+ int sig_cond_reltimedwait(cond_t *, mutex_t *, const timespec_t *);
+which are declared in both "thr_uberdata.h" and "mtlib.h".
+
+They are used in specialized code in libc, like the asynchronous i/o code.
+Unlike the lmutex_lock() and lmutex_unlock() interfaces described above,
+these interfaces do not define critical regions, but signals are
+deferred while locks acquired by these functions are held, making
+their use async-signal-safe. Calls to malloc(), calloc(), realloc(),
+and free() are permissible while holding such locks.
+
+These interfaces were brought over from code in the former libaio
+and librt and are necessary because, where they are used, the code
+must execute potentially long-term waits and must be cancelable.
+sig_cond_wait() and sig_cond_reltimedwait() are cancellation points.
+
+These interfaces are available for other uses inside libc, as
+the need arises. (There is no need if the code does not perform
+long-term waits.) Just follow a few rules to be self-consistent:
+ - Don't mix calls to mutex_[un]lock(), lmutex_[un]lock() and
+ sig_mutex_[un]lock() on the same mutex.
+ - Don't call cond_wait() with a mutex acquired by sig_mutex_lock();
+ call sig_cond_wait() or sig_cond_reltimedwait().
+ - Use pthread_cleanup_push() and pthread_cleanup_pop() to make
+ your code cancellation-safe.
+ - The sig_*() interfaces are not in themselves fork-safe.
+ You have to employ other logic to make your code fork-safe.
+ See the tail of postfork1_child() for examples.
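
As a concrete illustration of the rules above: a minimal sketch of a
cancellation-safe long-term wait built on these interfaces (the work
queue and its fields are hypothetical, not part of the putback; it
assumes <synch.h>, <pthread.h> and the declarations from "mtlib.h"):

	static mutex_t workq_lock = DEFAULTMUTEX;
	static cond_t workq_cv = DEFAULTCV;
	static int workq_len;

	static void
	workq_cleanup(void *arg)
	{
		sig_mutex_unlock((mutex_t *)arg);
	}

	void
	workq_take_one(void)
	{
		/* signals are deferred while workq_lock is held */
		sig_mutex_lock(&workq_lock);
		pthread_cleanup_push(workq_cleanup, &workq_lock);
		/* sig_cond_wait() is a cancellation point */
		while (workq_len == 0)
			(void) sig_cond_wait(&workq_cv, &workq_lock);
		workq_len--;
		pthread_cleanup_pop(1);	/* runs workq_cleanup(): drops the lock */
	}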
diff --git a/usr/src/lib/libc/amd64/Makefile b/usr/src/lib/libc/amd64/Makefile
index 0238a550ed..4db5f28fcb 100644
--- a/usr/src/lib/libc/amd64/Makefile
+++ b/usr/src/lib/libc/amd64/Makefile
@@ -711,6 +711,24 @@ PORTI18N_COND= \
wcstol_longlong.o \
wcstoul_longlong.o
+AIOOBJS= \
+ aio.o \
+ aio_alloc.o \
+	posix_aio.o
+
+RTOBJS= \
+ clock_timer.o \
+ fallocate.o \
+ mqueue.o \
+ pos4obj.o \
+ sched.o \
+ sem.o \
+ shm.o \
+ sigev_thread.o
+
+TPOOLOBJS= \
+ thread_pool.o
+
THREADSOBJS= \
alloc.o \
assfail.o \
@@ -836,6 +854,9 @@ MOSTOBJS= \
$(PORTSTDIO_W) \
$(PORTSYS) \
$(PORTSYS64) \
+ $(AIOOBJS) \
+ $(RTOBJS) \
+ $(TPOOLOBJS) \
$(THREADSOBJS) \
$(THREADSMACHOBJS) \
$(THREADSASMOBJS) \
@@ -939,6 +960,9 @@ SRCS= \
$(PORTREGEX:%.o=../port/regex/%.c) \
$(PORTSTDIO:%.o=../port/stdio/%.c) \
$(PORTSYS:%.o=../port/sys/%.c) \
+ $(AIOOBJS:%.o=../port/aio/%.c) \
+ $(RTOBJS:%.o=../port/rt/%.c) \
+ $(TPOOLOBJS:%.o=../port/tpool/%.c) \
$(THREADSOBJS:%.o=../port/threads/%.c) \
$(THREADSMACHOBJS:%.o=threads/%.c) \
$(UNWINDMACHOBJS:%.o=unwind/%.c) \
@@ -966,6 +990,7 @@ $(MAPFILE):
# Files which need the threads .il inline template
TIL= \
+ aio.o \
alloc.o \
assfail.o \
atexit.o \
@@ -974,7 +999,9 @@ TIL= \
door_calls.o \
errno.o \
lwp.o \
+ ma.o \
machdep.o \
+ posix_aio.o \
pthr_attr.o \
pthr_barrier.o \
pthr_cond.o \
@@ -987,12 +1014,14 @@ TIL= \
scalls.o \
sema.o \
sigaction.o \
+ sigev_thread.o \
spawn.o \
stack.o \
synch.o \
tdb_agent.o \
thr.o \
thread_interface.o \
+ thread_pool.o \
thrp_unwind.o \
tls.o \
tsd.o
diff --git a/usr/src/lib/libc/amd64/gen/siglongjmp.c b/usr/src/lib/libc/amd64/gen/siglongjmp.c
index fd9860aad8..4bc4c579a4 100644
--- a/usr/src/lib/libc/amd64/gen/siglongjmp.c
+++ b/usr/src/lib/libc/amd64/gen/siglongjmp.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -19,13 +18,17 @@
*
* CDDL HEADER END
*/
+
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
+#pragma weak siglongjmp = _siglongjmp
+
+#include "synonyms.h"
#include <sys/types.h>
#include <sys/ucontext.h>
#include <setjmp.h>
@@ -33,8 +36,6 @@
extern int _setcontext(const ucontext_t *);
-#pragma weak siglongjmp = _siglongjmp
-
void
_siglongjmp(sigjmp_buf env, int val)
{
diff --git a/usr/src/lib/libc/common/sys/__clock_timer.s b/usr/src/lib/libc/common/sys/__clock_timer.s
index 4c4e917836..5188262570 100644
--- a/usr/src/lib/libc/common/sys/__clock_timer.s
+++ b/usr/src/lib/libc/common/sys/__clock_timer.s
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -19,8 +18,9 @@
*
* CDDL HEADER END
*/
+
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -110,11 +110,11 @@
/*
* int
- * ___nanosleep(const timespec_t *rqtp, timespec_t *rmtp)
+ * __nanosleep(const timespec_t *rqtp, timespec_t *rmtp)
*/
- ENTRY(___nanosleep)
+ ENTRY(__nanosleep)
SYSTRAP_RVAL1(nanosleep)
SYSLWPERR
RET
- SET_SIZE(___nanosleep)
+ SET_SIZE(__nanosleep)
diff --git a/usr/src/lib/libc/common/sys/__signotify.s b/usr/src/lib/libc/common/sys/__signotify.s
index f49d5eb297..057a00ad45 100644
--- a/usr/src/lib/libc/common/sys/__signotify.s
+++ b/usr/src/lib/libc/common/sys/__signotify.s
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -19,8 +18,9 @@
*
* CDDL HEADER END
*/
+
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -28,9 +28,8 @@
.file "%M%"
-/* unpublished system call for librt -- __signotify */
-/* int _signotify (int cmd, siginfo_t *siginfo, */
-/* signotify_id_t *sn_id); */
+/* unpublished system call for POSIX message queues -- __signotify */
+/* int __signotify (int cmd, siginfo_t *siginfo, signotify_id_t *sn_id); */
#include "SYS.h"
diff --git a/usr/src/lib/libc/common/sys/__sigrt.s b/usr/src/lib/libc/common/sys/__sigrt.s
index df1154abd0..0ce63adb4e 100644
--- a/usr/src/lib/libc/common/sys/__sigrt.s
+++ b/usr/src/lib/libc/common/sys/__sigrt.s
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -19,8 +18,9 @@
*
* CDDL HEADER END
*/
+
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -32,7 +32,7 @@
/*
* int
- * __sigqueue(pid_t pid, int signo, void *value, int si_code)
+ * __sigqueue(pid_t pid, int signo, void *value, int si_code, int block)
*/
SYSCALL2_RVAL1(__sigqueue,sigqueue)
RETC
@@ -40,9 +40,9 @@
/*
* int
- * ___sigtimedwait(const sigset_t *set, siginfo_t *info,
+ * __sigtimedwait(const sigset_t *set, siginfo_t *info,
* const timespec_t *timeout)
*/
- SYSCALL2_RVAL1(___sigtimedwait,sigtimedwait)
+ SYSCALL2_RVAL1(__sigtimedwait,sigtimedwait)
RET
- SET_SIZE(___sigtimedwait)
+ SET_SIZE(__sigtimedwait)
diff --git a/usr/src/lib/libc/common/sys/kaio.s b/usr/src/lib/libc/common/sys/kaio.s
index cb75d3e2d5..1cd3810403 100644
--- a/usr/src/lib/libc/common/sys/kaio.s
+++ b/usr/src/lib/libc/common/sys/kaio.s
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -19,12 +18,12 @@
*
* CDDL HEADER END
*/
+
/* Copyright (c) 1988 AT&T */
/* All Rights Reserved */
-
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -33,14 +32,12 @@
.file "%M%"
/* C library -- kaio */
-/* intptr_t kaio (); */
-
-#include <sys/asm_linkage.h>
-
- ANSI_PRAGMA_WEAK(kaio,function)
+/* intptr_t _kaio (); */
#include "SYS.h"
- SYSCALL_RVAL1(kaio)
+ ENTRY(_kaio)
+ SYSTRAP_RVAL1(kaio)
+ SYSCERROR
RET
- SET_SIZE(kaio)
+ SET_SIZE(_kaio)
diff --git a/usr/src/lib/libc/i386/Makefile.com b/usr/src/lib/libc/i386/Makefile.com
index 4c40fc780e..8613ab3972 100644
--- a/usr/src/lib/libc/i386/Makefile.com
+++ b/usr/src/lib/libc/i386/Makefile.com
@@ -751,6 +751,24 @@ PORTI18N_COND= \
wcstol_longlong.o \
wcstoul_longlong.o
+AIOOBJS= \
+ aio.o \
+ aio_alloc.o \
+	posix_aio.o
+
+RTOBJS= \
+ clock_timer.o \
+ fallocate.o \
+ mqueue.o \
+ pos4obj.o \
+ sched.o \
+ sem.o \
+ shm.o \
+ sigev_thread.o
+
+TPOOLOBJS= \
+ thread_pool.o
+
THREADSOBJS= \
alloc.o \
assfail.o \
@@ -871,6 +889,9 @@ MOSTOBJS= \
$(PORTSTDIO_W) \
$(PORTSYS) \
$(PORTSYS64) \
+ $(AIOOBJS) \
+ $(RTOBJS) \
+ $(TPOOLOBJS) \
$(THREADSOBJS) \
$(THREADSMACHOBJS) \
$(THREADSASMOBJS) \
@@ -988,6 +1009,9 @@ SRCS= \
$(PORTREGEX:%.o=../port/regex/%.c) \
$(PORTSTDIO:%.o=../port/stdio/%.c) \
$(PORTSYS:%.o=../port/sys/%.c) \
+ $(AIOOBJS:%.o=../port/aio/%.c) \
+ $(RTOBJS:%.o=../port/rt/%.c) \
+ $(TPOOLOBJS:%.o=../port/tpool/%.c) \
$(THREADSOBJS:%.o=../port/threads/%.c) \
$(THREADSMACHOBJS:%.o=../$(MACH)/threads/%.c) \
$(UNWINDMACHOBJS:%.o=../port/unwind/%.c) \
@@ -1016,6 +1040,7 @@ $(MAPFILE):
# Files which need the threads .il inline template
TIL= \
+ aio.o \
alloc.o \
assfail.o \
atexit.o \
@@ -1024,7 +1049,9 @@ TIL= \
door_calls.o \
errno.o \
lwp.o \
+ ma.o \
machdep.o \
+ posix_aio.o \
pthr_attr.o \
pthr_barrier.o \
pthr_cond.o \
@@ -1037,12 +1064,14 @@ TIL= \
scalls.o \
sema.o \
sigaction.o \
+ sigev_thread.o \
spawn.o \
stack.o \
synch.o \
tdb_agent.o \
thr.o \
thread_interface.o \
+ thread_pool.o \
tls.o \
tsd.o \
unwind.o
diff --git a/usr/src/lib/libc/i386/gen/siglongjmp.c b/usr/src/lib/libc/i386/gen/siglongjmp.c
index ff40ea8f98..1b3296d93d 100644
--- a/usr/src/lib/libc/i386/gen/siglongjmp.c
+++ b/usr/src/lib/libc/i386/gen/siglongjmp.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -19,8 +18,9 @@
*
* CDDL HEADER END
*/
+
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -30,6 +30,9 @@
#pragma ident "%Z%%M% %I% %E% SMI"
+#pragma weak siglongjmp = _siglongjmp
+
+#include "synonyms.h"
#include <sys/types.h>
#include <sys/ucontext.h>
#include <setjmp.h>
@@ -37,8 +40,6 @@
extern int _setcontext(const ucontext_t *);
-#pragma weak siglongjmp = _siglongjmp
-
void
_siglongjmp(sigjmp_buf env, int val)
{
diff --git a/usr/src/lib/libc/inc/asyncio.h b/usr/src/lib/libc/inc/asyncio.h
new file mode 100644
index 0000000000..02d33cd700
--- /dev/null
+++ b/usr/src/lib/libc/inc/asyncio.h
@@ -0,0 +1,346 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ASYNCIO_H
+#define _ASYNCIO_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <thread.h>
+#include <pthread.h>
+#include <setjmp.h>
+#include <signal.h>
+#include <siginfo.h>
+#include <aio.h>
+#include <limits.h>
+#include <ucontext.h>
+#include <sys/asynch.h>
+#include <sys/mman.h>
+
+#if !defined(_LP64)
+#define AIOSTKSIZE (64 * 1024)
+#else
+#define AIOSTKSIZE (128 * 1024)
+#endif
+
+#define SIGAIOCANCEL SIGLWP /* special aio cancelation signal */
+
+#define AIO_WAITN_MAXIOCBS 32768 /* max. iocbs per system call */
+
+/*
+ * Declare structure types. The structures themselves are defined below.
+ */
+typedef struct aio_args aio_args_t;
+typedef struct aio_lio aio_lio_t;
+typedef struct notif_param notif_param_t;
+typedef struct aio_req aio_req_t;
+typedef struct aio_worker aio_worker_t;
+typedef struct aio_hash aio_hash_t;
+
+struct aio_args {
+ int fd;
+ caddr_t buf;
+ size_t bufsz;
+ offset_t offset;
+};
+
+/*
+ * list head for UFS list I/O
+ */
+struct aio_lio {
+ mutex_t lio_mutex; /* list mutex */
+ cond_t lio_cond_cv; /* list notification for I/O done */
+ aio_lio_t *lio_next; /* pointer to next on freelist */
+ char lio_mode; /* LIO_WAIT/LIO_NOWAIT */
+ char lio_canned; /* lio was canceled */
+ char lio_largefile; /* largefile operation */
+ char lio_waiting; /* waiting in __lio_listio() */
+ int lio_nent; /* Number of list I/O's */
+ int lio_refcnt; /* outstanding I/O's */
+ int lio_event; /* Event number for notification */
+ int lio_port; /* Port number for notification */
+ int lio_signo; /* Signal number for notification */
+ union sigval lio_sigval; /* Signal parameter */
+ uintptr_t lio_object; /* for SIGEV_THREAD or SIGEV_PORT */
+ struct sigevent *lio_sigevent; /* Notification function and attr. */
+};
+
+/*
+ * Notification parameters
+ */
+struct notif_param {
+ int np_signo; /* SIGEV_SIGNAL */
+ int np_port; /* SIGEV_THREAD or SIGEV_PORT */
+ void *np_user;
+ int np_event;
+ uintptr_t np_object;
+ int np_lio_signo; /* listio: SIGEV_SIGNAL */
+ int np_lio_port; /* listio: SIGEV_THREAD or SIGEV_PORT */
+ void *np_lio_user;
+ int np_lio_event;
+ uintptr_t np_lio_object;
+};
+
+struct aio_req {
+ /*
+ * fields protected by _aio_mutex lock.
+ */
+ aio_req_t *req_link; /* hash/freelist chain link */
+ /*
+ * when req is on the doneq, then req_next is protected by
+ * the _aio_mutex lock. when the req is on a work q, then
+ * req_next is protected by a worker's work_qlock1 lock.
+ */
+ aio_req_t *req_next; /* request/done queue link */
+ aio_req_t *req_prev; /* double linked list */
+ /*
+ * fields protected by a worker's work_qlock1 lock.
+ */
+ char req_state; /* AIO_REQ_QUEUED, ... */
+ /*
+ * fields require no locking.
+ */
+ char req_type; /* AIO_POSIX_REQ or not */
+ char req_largefile; /* largefile operation */
+ char req_op; /* AIOREAD, etc. */
+ aio_worker_t *req_worker; /* associate request with worker */
+ aio_result_t *req_resultp; /* address of result buffer */
+ aio_args_t req_args; /* arglist */
+ aio_lio_t *req_head; /* list head for LIO */
+ struct sigevent req_sigevent;
+ void *req_aiocbp; /* ptr to aiocb or aiocb64 */
+ notif_param_t req_notify; /* notification parameters */
+};
+
+/* special lio type that destroys itself when lio refcnt becomes zero */
+#define LIO_FSYNC LIO_WAIT+1
+#define LIO_DESTROY LIO_FSYNC+1
+
+/* lio flags */
+#define LIO_FSYNC_CANCELED 0x1
+
+/* values for aio_state */
+
+#define AIO_REQ_QUEUED 1
+#define AIO_REQ_INPROGRESS 2
+#define AIO_REQ_CANCELED 3
+#define AIO_REQ_DONE 4
+#define AIO_REQ_FREE 5
+#define AIO_REQ_DONEQ 6
+
+/* use KAIO in _aio_rw() */
+#define AIO_NO_KAIO 0x0
+#define AIO_KAIO 0x1
+#define AIO_NO_DUPS 0x2
+
+#define AIO_POSIX_REQ 0x1
+
+#define CHECK 1
+#define NOCHECK 2
+#define CHECKED 3
+#define USERAIO 4
+#define USERAIO_DONE 5
+
+/* values for _aio_flags */
+
+/* if set, _aiodone() notifies aio_waitn about done requests */
+#define AIO_WAIT_INPROGRESS 0x1
+/* if set, _aiodone() wakes up functions waiting for completed I/Os */
+#define AIO_IO_WAITING 0x2
+#define AIO_LIB_WAITN 0x4 /* aio_waitn in progress */
+#define AIO_LIB_WAITN_PENDING 0x8 /* aio_waitn requests pending */
+
+/*
+ * Before a kaio() system call, the fd will be checked
+ * to ensure that kernel async. I/O is supported for this file.
+ * The only way to find out is if a kaio() call returns ENOTSUP,
+ * so the default will always be to try the kaio() call. Only in
+ * the specific instance of a kaio() call returning ENOTSUP
+ * will we stop submitting kaio() calls for that fd.
+ * If the fd is outside the array bounds, we will allow the kaio()
+ * call.
+ *
+ * The only way that an fd entry can go from ENOTSUP to supported
+ * is if that fd is freed up by a close(), and close will clear
+ * the entry for that fd.
+ *
+ * Each fd gets a bit in the array _kaio_supported[].
+ *
+ * uint32_t _kaio_supported[MAX_KAIO_FDARRAY_SIZE];
+ *
+ * The array is MAX_KAIO_FDARRAY_SIZE 32-bit elements, i.e. 8KB.
+ * If more than (MAX_KAIO_FDARRAY_SIZE * KAIO_FDARRAY_ELEM_SIZE)
+ * files are open, this can be expanded.
+ */
+
+#define MAX_KAIO_FDARRAY_SIZE 2048
+#define KAIO_FDARRAY_ELEM_SIZE WORD_BIT /* uint32_t */
+
+#define MAX_KAIO_FDS (MAX_KAIO_FDARRAY_SIZE * KAIO_FDARRAY_ELEM_SIZE)
+
+#define VALID_FD(fdes) ((fdes) >= 0 && (fdes) < MAX_KAIO_FDS)
+
+#define KAIO_SUPPORTED(fdes) \
+ (!VALID_FD(fdes) || \
+ ((_kaio_supported[(fdes) / KAIO_FDARRAY_ELEM_SIZE] & \
+ (uint32_t)(1 << ((fdes) % KAIO_FDARRAY_ELEM_SIZE))) == 0))
+
+#define SET_KAIO_NOT_SUPPORTED(fdes) \
+ if (VALID_FD(fdes)) \
+ _kaio_supported[(fdes) / KAIO_FDARRAY_ELEM_SIZE] |= \
+ (uint32_t)(1 << ((fdes) % KAIO_FDARRAY_ELEM_SIZE))
+
+#define CLEAR_KAIO_SUPPORTED(fdes) \
+ if (VALID_FD(fdes)) \
+ _kaio_supported[(fdes) / KAIO_FDARRAY_ELEM_SIZE] &= \
+ ~(uint32_t)(1 << ((fdes) % KAIO_FDARRAY_ELEM_SIZE))
+
+struct aio_worker {
+ aio_worker_t *work_forw; /* forward link in list of workers */
+ aio_worker_t *work_backw; /* backwards link in list of workers */
+ mutex_t work_qlock1; /* lock for work queue 1 */
+ cond_t work_idle_cv; /* place to sleep when idle */
+ aio_req_t *work_head1; /* head of work request queue 1 */
+ aio_req_t *work_tail1; /* tail of work request queue 1 */
+ aio_req_t *work_next1; /* work queue one's next pointer */
+ aio_req_t *work_prev1; /* last request done from queue 1 */
+ aio_req_t *work_req; /* active work request */
+ thread_t work_tid; /* worker's thread-id */
+ int work_count1; /* length of work queue one */
+ int work_done1; /* number of requests done */
+ int work_minload1; /* min length of queue */
+ int work_idleflg; /* when set, worker is idle */
+ sigjmp_buf work_jmp_buf; /* cancellation point */
+};
+
+struct aio_hash { /* resultp hash table */
+ mutex_t hash_lock;
+ aio_req_t *hash_ptr;
+#if !defined(_LP64)
+ void *hash_pad; /* ensure sizeof (aio_hash_t) == 32 */
+#endif
+};
+
+extern aio_hash_t *_aio_hash;
+
+#define HASHSZ 2048 /* power of 2 */
+#define AIOHASH(resultp) ((((uintptr_t)(resultp) >> 17) ^ \
+ ((uintptr_t)(resultp) >> 2)) & (HASHSZ - 1))
+#define POSIX_AIO(x) ((x)->req_type == AIO_POSIX_REQ)
+
+extern int __uaio_init(void);
+extern void _kaio_init(void);
+extern intptr_t _kaio(int, ...);
+extern int _aiorw(int, caddr_t, int, offset_t, int, aio_result_t *, int);
+extern int _aio_rw(aiocb_t *, aio_lio_t *, aio_worker_t **, int, int);
+#if !defined(_LP64)
+extern int _aio_rw64(aiocb64_t *, aio_lio_t *, aio_worker_t **, int, int);
+#endif
+extern int _aio_create_worker(aio_req_t *, int);
+extern int _aio_cancel_req(aio_worker_t *, aio_req_t *, int *, int *);
+extern int aiocancel_all(int);
+extern void aio_panic(const char *);
+extern aio_req_t *_aio_hash_find(aio_result_t *);
+extern aio_req_t *_aio_hash_del(aio_result_t *);
+extern void _aio_req_mark_done(aio_req_t *);
+extern void _aio_waitn_wakeup(void);
+extern aio_worker_t *_aio_worker_alloc(void);
+extern void _aio_worker_free(void *);
+extern aio_req_t *_aio_req_alloc(void);
+extern void _aio_req_free(aio_req_t *);
+extern aio_lio_t *_aio_lio_alloc(void);
+extern void _aio_lio_free(aio_lio_t *);
+extern int _aio_idle(aio_worker_t *);
+extern void *_aio_do_request(void *);
+extern void *_aio_do_notify(void *);
+extern void _lio_remove(aio_req_t *);
+extern aio_req_t *_aio_req_remove(aio_req_t *);
+extern int _aio_get_timedelta(timespec_t *, timespec_t *);
+extern aio_result_t *_aio_req_done(void);
+extern void _aio_set_result(aio_req_t *, ssize_t, int);
+extern int _aio_sigev_thread_init(struct sigevent *);
+extern int _aio_sigev_thread(aiocb_t *);
+#if !defined(_LP64)
+extern int _aio_sigev_thread64(aiocb64_t *);
+#endif
+
+extern aio_worker_t *_kaiowp; /* points to kaio cleanup thread */
+extern aio_worker_t *__workers_rw; /* list of all rw workers */
+extern aio_worker_t *__nextworker_rw; /* worker chosen for next rw request */
+extern int __rw_workerscnt; /* number of rw workers */
+extern aio_worker_t *__workers_no; /* list of all notification workers */
+extern aio_worker_t *__nextworker_no; /* worker chosen, next notification */
+extern int __no_workerscnt; /* number of notification workers */
+extern mutex_t __aio_initlock; /* makes aio initialization atomic */
+extern cond_t __aio_initcv;
+extern int __aio_initbusy;
+extern mutex_t __aio_mutex; /* global aio lock */
+extern cond_t _aio_iowait_cv; /* wait for userland I/Os */
+extern cond_t _aio_waitn_cv; /* wait for end of aio_waitn */
+extern int _max_workers; /* max number of workers permitted */
+extern int _min_workers; /* min number of workers */
+extern sigset_t _worker_set; /* worker's signal mask */
+extern int _aio_worker_cnt; /* number of AIO workers */
+extern int _sigio_enabled; /* when set, send SIGIO signal */
+extern pid_t __pid; /* process's PID */
+extern int __uaio_ok; /* indicates if aio is initialized */
+extern int _kaio_ok; /* indicates if kaio is initialized */
+extern pthread_key_t _aio_key; /* for thread-specific data */
+extern aio_req_t *_aio_done_tail; /* list of done requests */
+extern aio_req_t *_aio_done_head;
+extern aio_req_t *_aio_doneq;
+extern int _aio_freelist_cnt;
+extern int _aio_allocated_cnt;
+extern int _aio_donecnt;
+extern int _aio_doneq_cnt;
+extern int _aio_waitncnt; /* # of requests for aio_waitn */
+extern int _aio_outstand_cnt; /* # of outstanding requests */
+extern int _kaio_outstand_cnt; /* # of outstanding kaio requests */
+extern int _aio_req_done_cnt; /* req. done but not in "done queue" */
+extern int _aio_kernel_suspend; /* active kernel kaio calls */
+extern int _aio_suscv_cnt; /* aio_suspend calls waiting on cv's */
+extern int _aiowait_flag; /* when set, aiowait() is inprogress */
+extern int _aio_flags; /* see defines, above */
+extern uint32_t *_kaio_supported;
+
+extern const sigset_t maskset; /* all maskable signals */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ASYNCIO_H */
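
A hedged sketch of how the _kaio_supported[] machinery above is meant
to be used, mirroring the fallback logic of _aiorw() later in this
diff (fd, buf, bufsz, offset and resultp are illustrative locals):

	/* try kernel aio first; fall back to the user-level workers */
	if (_kaio_ok > 0 && KAIO_SUPPORTED(fd)) {
		if (_kaio(AIOREAD, fd, buf, bufsz, offset, resultp) == 0)
			return (0);
		if (errno != ENOTSUP && errno != EBADFD)
			return (-1);
		if (errno == EBADFD)	/* sets bit fd%32 of word fd/32 */
			SET_KAIO_NOT_SUPPORTED(fd);
	}
	/* ... queue the request to a user-level worker ... */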
diff --git a/usr/src/lib/libc/inc/mtlib.h b/usr/src/lib/libc/inc/mtlib.h
index 89c2376949..d864e8e75a 100644
--- a/usr/src/lib/libc/inc/mtlib.h
+++ b/usr/src/lib/libc/inc/mtlib.h
@@ -57,6 +57,15 @@ extern int __rw_unlock(rwlock_t *);
extern void lrw_rdlock(rwlock_t *);
extern void lrw_wrlock(rwlock_t *);
extern void lrw_unlock(rwlock_t *);
+extern void sig_mutex_lock(mutex_t *);
+extern void sig_mutex_unlock(mutex_t *);
+extern int sig_mutex_trylock(mutex_t *);
+extern int sig_cond_wait(cond_t *, mutex_t *);
+extern int sig_cond_reltimedwait(cond_t *, mutex_t *, const timespec_t *);
+
+/* the private libc thread-safe allocator */
+extern void *lmalloc(size_t);
+extern void lfree(void *, size_t);
/* the rest are public functions */
extern int _mutex_init(mutex_t *, int, void *);
@@ -91,6 +100,8 @@ extern thread_t _thr_self(void);
extern void _thr_exit(void *);
extern size_t _thr_min_stack(void);
extern int _thr_kill(thread_t, int);
+extern int _thr_create(void *, size_t, void *(*)(void *), void *, long,
+ thread_t *);
extern int _thr_keycreate(thread_key_t *, void (*)(void *));
extern int _thr_setspecific(thread_key_t, void *);
extern int _thr_getspecific(thread_key_t, void **);
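
Note that the private allocator declared above is sized on both sides:
lfree() must be passed the same size that was given to lmalloc(). A
minimal sketch (the structure type is illustrative):

	aio_lio_t *head;

	if ((head = lmalloc(sizeof (aio_lio_t))) != NULL) {
		/* ... use head ... */
		lfree(head, sizeof (aio_lio_t));	/* same size as the lmalloc() */
	}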
diff --git a/usr/src/lib/libc/inc/rtsched.h b/usr/src/lib/libc/inc/rtsched.h
new file mode 100644
index 0000000000..90ae11c3b2
--- /dev/null
+++ b/usr/src/lib/libc/inc/rtsched.h
@@ -0,0 +1,44 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _RTSCHED_H
+#define _RTSCHED_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/priocntl.h>
+
+/*
+ * This definition is private to libc but is used in more than one subsystem.
+ */
+struct pcclass {
+ short pcc_state;
+ pri_t pcc_primin;
+ pri_t pcc_primax;
+ pcinfo_t pcc_info;
+};
+
+#endif /* _RTSCHED_H */
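
A sketch of how a struct pcclass might be filled in for the RT class
via priocntl(PC_GETCID); this is assumed usage for illustration, not
code from the putback (the real consumers are rtsched.c and sched.c
in this changeset):

	#include <sys/types.h>
	#include <sys/priocntl.h>
	#include <sys/rtpriocntl.h>
	#include <string.h>
	#include "rtsched.h"

	static struct pcclass rt_class;

	int
	get_rt_class(void)
	{
		rtinfo_t *rtinfop;

		(void) strcpy(rt_class.pcc_info.pc_clname, "RT");
		if (priocntl(0, 0, PC_GETCID, (caddr_t)&rt_class.pcc_info) == -1L)
			return (-1);
		rtinfop = (rtinfo_t *)rt_class.pcc_info.pc_clinfo;
		rt_class.pcc_primin = 0;
		rt_class.pcc_primax = rtinfop->rt_maxpri;
		rt_class.pcc_state = 1;	/* hypothetical "configured" flag */
		return (0);
	}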
diff --git a/usr/src/lib/libc/inc/synonyms.h b/usr/src/lib/libc/inc/synonyms.h
index 179f25f627..4de926dc9f 100644
--- a/usr/src/lib/libc/inc/synonyms.h
+++ b/usr/src/lib/libc/inc/synonyms.h
@@ -223,6 +223,10 @@ extern "C" {
#define chown _chown
#define chroot _chroot
#define _cladm __cladm
+#define clock_getres _clock_getres
+#define clock_gettime _clock_gettime
+#define clock_nanosleep _clock_nanosleep
+#define clock_settime _clock_settime
#define close _close
#define closedir _closedir
#define closefrom _closefrom
@@ -264,8 +268,8 @@ extern "C" {
#define decimal_to_single _decimal_to_single
#define dgettext _dgettext
#define dirname _dirname
-#define dladdr _dladdr
#define dladdr1 _dladdr1
+#define dladdr _dladdr
#define dlamd64getunwind _dlamd64getunwind
#define dlclose _dlclose
#define dldump _dldump
@@ -495,7 +499,6 @@ extern "C" {
#define iswupper _iswupper
#define iswxdigit _iswxdigit
#define jrand48 _jrand48
-#define kaio _kaio
#define kill _kill
#define l64a _l64a
#define ladd _ladd
@@ -590,12 +593,19 @@ extern "C" {
#define munlockall _munlockall
#define munlock _munlock
#define munmap _munmap
-#define mutex_destroy _mutex_destroy
-#define mutex_held _mutex_held
-#define mutex_init _mutex_init
-#define mutex_lock _mutex_lock
-#define mutex_trylock _mutex_trylock
-#define mutex_unlock _mutex_unlock
+#define _mutex_destroy __mutex_destroy
+#define mutex_destroy __mutex_destroy
+#define _mutex_held __mutex_held
+#define mutex_held __mutex_held
+#define _mutex_init __mutex_init
+#define mutex_init __mutex_init
+#define _mutex_lock __mutex_lock
+#define mutex_lock __mutex_lock
+#define _mutex_trylock __mutex_trylock
+#define mutex_trylock __mutex_trylock
+#define _mutex_unlock __mutex_unlock
+#define mutex_unlock __mutex_unlock
+#define nanosleep _nanosleep
#define nfs_getfh _nfs_getfh
#define nfssvc _nfssvc
#define nftw64 _nftw64
@@ -627,7 +637,6 @@ extern "C" {
#define port_alert _port_alert
#define port_associate _port_associate
#define port_create _port_create
-#define port_dispatch _port_dispatch
#define port_dissociate _port_dissociate
#define port_getn _port_getn
#define port_get _port_get
@@ -865,12 +874,23 @@ extern "C" {
#define sema_timedwait _sema_timedwait
#define sema_trywait _sema_trywait
#define sema_wait _sema_wait
+#define sem_close _sem_close
#define semctl64 _semctl64
#define semctl _semctl
+#define sem_destroy _sem_destroy
#define semget _semget
+#define sem_getvalue _sem_getvalue
#define semids _semids
+#define sem_init _sem_init
+#define sem_open _sem_open
#define semop _semop
+#define sem_post _sem_post
+#define sem_reltimedwait_np _sem_reltimedwait_np
#define semtimedop _semtimedop
+#define sem_timedwait _sem_timedwait
+#define sem_trywait _sem_trywait
+#define sem_unlink _sem_unlink
+#define sem_wait _sem_wait
#define setcontext _setcontext
#define setegid _setegid
#define setenv _setenv
@@ -927,12 +947,16 @@ extern "C" {
#define sigpause _sigpause
#define sigpending _sigpending
#define sigprocmask _sigprocmask
+#define sigqueue _sigqueue
#define sigrelse _sigrelse
#define sigsendset _sigsendset
#define sigsend _sigsend
#define sigsetjmp _sigsetjmp
#define sigset _sigset
+#define sigstack _sigstack
#define sigsuspend _sigsuspend
+#define sigtimedwait _sigtimedwait
+#define sigwaitinfo _sigwaitinfo
#define sigwait _sigwait
#define single_to_decimal _single_to_decimal
#define s_ioctl _s_ioctl
@@ -1018,6 +1042,11 @@ extern "C" {
#define thr_suspend _thr_suspend
#define thr_wait_mutator _thr_wait_mutator
#define thr_yield _thr_yield
+#define timer_create _timer_create
+#define timer_delete _timer_delete
+#define timer_getoverrun _timer_getoverrun
+#define timer_gettime _timer_gettime
+#define timer_settime _timer_settime
#define times _times
#define time _time
#define tmpnam_r _tmpnam_r
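
The pattern behind all of the new entries is the usual synonyms
mechanism: a libc source file includes "synonyms.h" before any system
header, so references to the public name compile into the
leading-underscore internal symbol and cannot be interposed on by
applications. A minimal sketch (the wrapper function is illustrative):

	#include "synonyms.h"	/* must precede the system headers */
	#include <semaphore.h>

	int
	wait_for(sem_t *sp)
	{
		/* preprocesses to _sem_wait(sp), binding to libc's own symbol */
		return (sem_wait(sp));
	}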
diff --git a/usr/src/lib/libc/inc/thr_debug.h b/usr/src/lib/libc/inc/thr_debug.h
new file mode 100644
index 0000000000..5e8de4ef0a
--- /dev/null
+++ b/usr/src/lib/libc/inc/thr_debug.h
@@ -0,0 +1,44 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _THR_DEBUG_H
+#define _THR_DEBUG_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#if defined(THREAD_DEBUG)
+
+extern void __assfail(const char *, const char *, int);
+#pragma rarely_called(__assfail)
+#define ASSERT(EX) (void)((EX) || (__assfail(#EX, __FILE__, __LINE__), 0))
+
+#else
+
+#define ASSERT(EX) ((void)0)
+
+#endif
+
+#endif /* _THR_DEBUG_H */
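
Usage follows the familiar assert pattern; ASSERT() compiles to
nothing unless libc is built with THREAD_DEBUG. A minimal sketch (the
lock and function are illustrative):

	#include <synch.h>
	#include "thr_debug.h"

	static mutex_t qlock;

	static void
	queue_verify(void)
	{
		/* on failure, __assfail() reports expression, file and line */
		ASSERT(MUTEX_HELD(&qlock));
	}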
diff --git a/usr/src/lib/libc/inc/thr_uberdata.h b/usr/src/lib/libc/inc/thr_uberdata.h
index c7b6001926..2671ac0a69 100644
--- a/usr/src/lib/libc/inc/thr_uberdata.h
+++ b/usr/src/lib/libc/inc/thr_uberdata.h
@@ -53,12 +53,10 @@
#include <schedctl.h>
#include <sys/priocntl.h>
#include <thread_db.h>
+#include <setjmp.h>
#include "libc_int.h"
#include "tdb_agent.h"
-
-/* belongs in <pthread.h> */
-#define PTHREAD_CREATE_DAEMON_NP 0x100 /* = THR_DAEMON */
-#define PTHREAD_CREATE_NONDAEMON_NP 0
+#include "thr_debug.h"
/*
* This is an implementation-specific include file for threading support.
@@ -208,14 +206,6 @@ typedef union {
#define PRIO_INHERIT 2
#define PRIO_DISINHERIT 3
-struct pcclass {
- short pcc_state;
- pri_t pcc_primin;
- pri_t pcc_primax;
- pcinfo_t pcc_info;
-};
-extern struct pcclass ts_class, rt_class;
-
#define MUTEX_TRY 0
#define MUTEX_LOCK 1
@@ -608,7 +598,7 @@ typedef struct ulwp {
#define MASKSET0 (FILLSET0 & ~CANTMASK0)
#define MASKSET1 (FILLSET1 & ~CANTMASK1)
-extern const sigset_t maskset; /* set of all maskable signals */
+extern const sigset_t maskset; /* set of all maskable signals */
extern int thread_adaptive_spin;
extern uint_t thread_max_spinners;
@@ -1048,7 +1038,7 @@ extern greg_t stkptr(void);
/*
* Implementation functions. Not visible outside of the library itself.
*/
-extern int ___nanosleep(const timespec_t *, timespec_t *);
+extern int __nanosleep(const timespec_t *, timespec_t *);
extern void getgregs(ulwp_t *, gregset_t);
extern void setgregs(ulwp_t *, gregset_t);
extern void thr_panic(const char *);
@@ -1092,18 +1082,6 @@ extern void _flush_windows(void);
#endif
extern void set_curthread(void *);
-#if defined(THREAD_DEBUG)
-
-extern void __assfail(const char *, const char *, int);
-#pragma rarely_called(__assfail)
-#define ASSERT(EX) (void)((EX) || (__assfail(#EX, __FILE__, __LINE__), 0))
-
-#else /* THREAD_DEBUG */
-
-#define ASSERT(EX) ((void)0)
-
-#endif /* THREAD_DEBUG */
-
/* enter a critical section */
#define enter_critical(self) (self->ul_critical++)
@@ -1174,21 +1152,35 @@ extern void *_thr_setup(ulwp_t *);
extern void _fpinherit(ulwp_t *);
extern void _lwp_start(void);
extern void _lwp_terminate(void);
-extern void lmutex_unlock(mutex_t *);
extern void lmutex_lock(mutex_t *);
+extern void lmutex_unlock(mutex_t *);
+extern void sig_mutex_lock(mutex_t *);
+extern void sig_mutex_unlock(mutex_t *);
+extern int sig_mutex_trylock(mutex_t *);
+extern int sig_cond_wait(cond_t *, mutex_t *);
+extern int sig_cond_reltimedwait(cond_t *, mutex_t *, const timespec_t *);
extern void _prefork_handler(void);
extern void _postfork_parent_handler(void);
extern void _postfork_child_handler(void);
-extern void _postfork1_child(void);
+extern void postfork1_child(void);
+extern void postfork1_child_aio(void);
+extern void postfork1_child_sigev_aio(void);
+extern void postfork1_child_sigev_mq(void);
+extern void postfork1_child_sigev_timer(void);
+extern void postfork1_child_tpool(void);
extern int fork_lock_enter(const char *);
extern void fork_lock_exit(void);
extern void suspend_fork(void);
extern void continue_fork(int);
extern void do_sigcancel(void);
-extern void init_sigcancel(void);
+extern void setup_cancelsig(int);
+extern void init_sigev_thread(void);
+extern void init_aio(void);
extern void _cancelon(void);
extern void _canceloff(void);
extern void _canceloff_nocancel(void);
+extern void _cancel_prologue(void);
+extern void _cancel_epilogue(void);
extern void no_preempt(ulwp_t *);
extern void preempt(ulwp_t *);
extern void _thrp_unwind(void *);
@@ -1249,8 +1241,18 @@ extern int __lwp_sigmask(int, const sigset_t *, sigset_t *);
extern void __sighndlr(int, siginfo_t *, ucontext_t *, void (*)());
extern caddr_t __sighndlrend;
#pragma unknown_control_flow(__sighndlr)
+extern void _siglongjmp(sigjmp_buf, int);
+extern int _pthread_setspecific(pthread_key_t, const void *);
+extern void *_pthread_getspecific(pthread_key_t);
extern void _pthread_exit(void *);
+extern void _private_testcancel(void);
+
+/* belongs in <pthread.h> */
+#define PTHREAD_CREATE_DAEMON_NP 0x100 /* = THR_DAEMON */
+#define PTHREAD_CREATE_NONDAEMON_NP 0
+extern int _pthread_attr_setdaemonstate_np(pthread_attr_t *, int);
+extern int _pthread_attr_getdaemonstate_np(const pthread_attr_t *, int *);
/* these are private to the library */
extern int _private_mutex_init(mutex_t *, int, void *);
@@ -1293,8 +1295,10 @@ extern int rw_read_is_held(rwlock_t *);
extern int rw_write_is_held(rwlock_t *);
extern int _thr_continue(thread_t);
-extern int _thrp_create(void *, size_t, void *(*func)(void *), void *,
- long, thread_t *, pri_t, int, size_t);
+extern int _thr_create(void *, size_t, void *(*)(void *), void *, long,
+ thread_t *);
+extern int _thrp_create(void *, size_t, void *(*)(void *), void *, long,
+ thread_t *, pri_t, int, size_t);
extern int _thr_getprio(thread_t, int *);
extern int _thr_getspecific(thread_key_t, void **);
extern int _thr_join(thread_t, thread_t *, void **);
@@ -1320,7 +1324,8 @@ extern int _thread_setschedparam_main(pthread_t, int,
const struct sched_param *, int);
extern int _validate_rt_prio(int, int);
extern int _thrp_setlwpprio(lwpid_t, int, int);
-extern pri_t _map_rtpri_to_gp(pri_t);
+extern pri_t map_rtpri_to_gp(pri_t);
+extern int get_info_by_policy(int);
/*
* System call wrappers (direct interfaces to the kernel)
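
The renaming of _postfork1_child() to postfork1_child() together with
the new per-subsystem handlers suggests the fork-safety structure; a
hedged sketch of the delegation (the ordering here is illustrative,
not taken from this diff):

	void
	postfork1_child(void)
	{
		/* ... reinitialize core thread/libc state first ... */
		postfork1_child_tpool();
		postfork1_child_sigev_aio();
		postfork1_child_sigev_mq();
		postfork1_child_sigev_timer();
		postfork1_child_aio();
	}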
diff --git a/usr/src/lib/libc/inc/thread_pool.h b/usr/src/lib/libc/inc/thread_pool.h
new file mode 100644
index 0000000000..200323703c
--- /dev/null
+++ b/usr/src/lib/libc/inc/thread_pool.h
@@ -0,0 +1,74 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _THREAD_POOL_H_
+#define _THREAD_POOL_H_
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <thread.h>
+#include <pthread.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct tpool tpool_t; /* opaque thread pool descriptor */
+
+#if defined(__STDC__)
+
+extern tpool_t *tpool_create(uint_t min_threads, uint_t max_threads,
+ uint_t linger, pthread_attr_t *attr);
+extern int tpool_dispatch(tpool_t *tpool,
+ void (*func)(void *), void *arg);
+extern void tpool_destroy(tpool_t *tpool);
+extern void tpool_abandon(tpool_t *tpool);
+extern void tpool_wait(tpool_t *tpool);
+extern void tpool_suspend(tpool_t *tpool);
+extern int tpool_suspended(tpool_t *tpool);
+extern void tpool_resume(tpool_t *tpool);
+extern int tpool_member(tpool_t *tpool);
+
+#else /* Non ANSI */
+
+extern tpool_t *tpool_create();
+extern int tpool_dispatch();
+extern void tpool_destroy();
+extern void tpool_abandon();
+extern void tpool_wait();
+extern void tpool_suspend();
+extern int tpool_suspended();
+extern void tpool_resume();
+extern int tpool_member();
+
+#endif /* __STDC__ */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _THREAD_POOL_H_ */
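
A minimal usage sketch of the private thread-pool API declared above
(the worker function, counts and linger time are illustrative):

	static void
	do_one(void *arg)
	{
		/* ... process one unit of work ... */
	}

	int
	run_all(void **jobs, uint_t njobs)
	{
		tpool_t *tp;
		uint_t i;

		/* 1 to 8 worker threads; idle workers linger for 60 seconds */
		if ((tp = tpool_create(1, 8, 60, NULL)) == NULL)
			return (-1);
		for (i = 0; i < njobs; i++)
			(void) tpool_dispatch(tp, do_one, jobs[i]);
		tpool_wait(tp);		/* block until all dispatched work is done */
		tpool_destroy(tp);
		return (0);
	}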
diff --git a/usr/src/lib/libc/port/aio/aio.c b/usr/src/lib/libc/port/aio/aio.c
new file mode 100644
index 0000000000..28d425d702
--- /dev/null
+++ b/usr/src/lib/libc/port/aio/aio.c
@@ -0,0 +1,2202 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include "synonyms.h"
+#include "thr_uberdata.h"
+#include "asyncio.h"
+#include <atomic.h>
+#include <sys/param.h>
+#include <sys/file.h>
+#include <sys/port.h>
+
+static int _aio_hash_insert(aio_result_t *, aio_req_t *);
+static aio_req_t *_aio_req_get(aio_worker_t *);
+static void _aio_req_add(aio_req_t *, aio_worker_t **, int);
+static void _aio_req_del(aio_worker_t *, aio_req_t *, int);
+static void _aio_work_done(aio_worker_t *);
+static void _aio_enq_doneq(aio_req_t *);
+
+extern void _aio_lio_free(aio_lio_t *);
+
+extern int __fdsync(int, int);
+extern int _port_dispatch(int, int, int, int, uintptr_t, void *);
+
+static int _aio_fsync_del(aio_worker_t *, aio_req_t *);
+static void _aiodone(aio_req_t *, ssize_t, int);
+static void _aio_cancel_work(aio_worker_t *, int, int *, int *);
+static void _aio_finish_request(aio_worker_t *, ssize_t, int);
+
+/*
+ * switch for kernel async I/O
+ */
+int _kaio_ok = 0; /* 0 = disabled, 1 = on, -1 = error */
+
+/*
+ * Key for thread-specific data
+ */
+pthread_key_t _aio_key;
+
+/*
+ * Array for determining whether or not a file supports kaio.
+ * Initialized in _kaio_init().
+ */
+uint32_t *_kaio_supported = NULL;
+
+/*
+ * workers for read/write requests
+ * (__aio_mutex lock protects circular linked list of workers)
+ */
+aio_worker_t *__workers_rw; /* circular list of AIO workers */
+aio_worker_t *__nextworker_rw; /* next worker in list of workers */
+int __rw_workerscnt; /* number of read/write workers */
+
+/*
+ * worker for notification requests.
+ */
+aio_worker_t *__workers_no; /* circular list of AIO workers */
+aio_worker_t *__nextworker_no; /* next worker in list of workers */
+int __no_workerscnt;		/* number of notification workers */
+
+aio_req_t *_aio_done_tail; /* list of done requests */
+aio_req_t *_aio_done_head;
+
+mutex_t __aio_initlock = DEFAULTMUTEX; /* makes aio initialization atomic */
+cond_t __aio_initcv = DEFAULTCV;
+int __aio_initbusy = 0;
+
+mutex_t __aio_mutex = DEFAULTMUTEX; /* protects counts, and linked lists */
+cond_t _aio_iowait_cv = DEFAULTCV; /* wait for userland I/Os */
+
+pid_t __pid = (pid_t)-1; /* initialize as invalid pid */
+int _sigio_enabled = 0; /* when set, send SIGIO signal */
+
+aio_hash_t *_aio_hash;
+
+aio_req_t *_aio_doneq; /* double linked done queue list */
+
+int _aio_donecnt = 0;
+int _aio_waitncnt = 0; /* # of requests for aio_waitn */
+int _aio_doneq_cnt = 0;
+int _aio_outstand_cnt = 0; /* # of outstanding requests */
+int _kaio_outstand_cnt = 0; /* # of outstanding kaio requests */
+int _aio_req_done_cnt = 0; /* req. done but not in "done queue" */
+int _aio_kernel_suspend = 0; /* active kernel kaio calls */
+int _aio_suscv_cnt = 0; /* aio_suspend calls waiting on cv's */
+
+int _max_workers = 256; /* max number of workers permitted */
+int _min_workers = 4; /* min number of workers */
+int _minworkload = 2;		/* min number of requests in q */
+int _aio_worker_cnt = 0; /* number of workers to do requests */
+int __uaio_ok = 0; /* AIO has been enabled */
+sigset_t _worker_set; /* worker's signal mask */
+
+int _aiowait_flag = 0; /* when set, aiowait() is inprogress */
+int _aio_flags = 0;		/* see the _aio_flags defines in asyncio.h */
+
+aio_worker_t *_kaiowp = NULL; /* points to kaio cleanup thread */
+
+int hz; /* clock ticks per second */
+
+static int
+_kaio_supported_init(void)
+{
+ void *ptr;
+ size_t size;
+
+ if (_kaio_supported != NULL) /* already initialized */
+ return (0);
+
+ size = MAX_KAIO_FDARRAY_SIZE * sizeof (uint32_t);
+ ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANON, -1, (off_t)0);
+ if (ptr == MAP_FAILED)
+ return (-1);
+ _kaio_supported = ptr;
+ return (0);
+}
+
+/*
+ * The aio subsystem is initialized when an AIO request is made.
+ * Tunables such as the maximum number of workers that the
+ * subsystem may create and the minimum number of workers
+ * permitted are initialized here, and the initial set of
+ * workers is created.
+ */
+int
+__uaio_init(void)
+{
+ int ret = -1;
+ int i;
+
+ lmutex_lock(&__aio_initlock);
+ while (__aio_initbusy)
+ (void) _cond_wait(&__aio_initcv, &__aio_initlock);
+ if (__uaio_ok) { /* already initialized */
+ lmutex_unlock(&__aio_initlock);
+ return (0);
+ }
+ __aio_initbusy = 1;
+ lmutex_unlock(&__aio_initlock);
+
+ hz = (int)sysconf(_SC_CLK_TCK);
+ __pid = getpid();
+
+ setup_cancelsig(SIGAIOCANCEL);
+
+ if (_kaio_supported_init() != 0)
+ goto out;
+
+ /*
+ * Allocate and initialize the hash table.
+ */
+ /* LINTED pointer cast */
+ _aio_hash = (aio_hash_t *)mmap(NULL,
+ HASHSZ * sizeof (aio_hash_t), PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANON, -1, (off_t)0);
+ if ((void *)_aio_hash == MAP_FAILED) {
+ _aio_hash = NULL;
+ goto out;
+ }
+ for (i = 0; i < HASHSZ; i++)
+ (void) mutex_init(&_aio_hash[i].hash_lock, USYNC_THREAD, NULL);
+
+ /*
+ * Initialize worker's signal mask to only catch SIGAIOCANCEL.
+ */
+ (void) sigfillset(&_worker_set);
+ (void) sigdelset(&_worker_set, SIGAIOCANCEL);
+
+ /*
+ * Create the minimum number of read/write workers.
+ */
+ for (i = 0; i < _min_workers; i++)
+ (void) _aio_create_worker(NULL, AIOREAD);
+
+ /*
+ * Create one worker to send asynchronous notifications.
+ */
+ (void) _aio_create_worker(NULL, AIONOTIFY);
+
+ ret = 0;
+out:
+ lmutex_lock(&__aio_initlock);
+ if (ret == 0)
+ __uaio_ok = 1;
+ __aio_initbusy = 0;
+ (void) cond_broadcast(&__aio_initcv);
+ lmutex_unlock(&__aio_initlock);
+ return (ret);
+}
+
+/*
+ * Called from close() before actually performing the real _close().
+ */
+void
+_aio_close(int fd)
+{
+ if (fd < 0) /* avoid cancelling everything */
+ return;
+ /*
+ * Cancel all outstanding aio requests for this file descriptor.
+ */
+ if (__uaio_ok)
+ (void) aiocancel_all(fd);
+ /*
+ * If we have allocated the bit array, clear the bit for this file.
+ * The next open may re-use this file descriptor and the new file
+ * may have different kaio() behaviour.
+ */
+ if (_kaio_supported != NULL)
+ CLEAR_KAIO_SUPPORTED(fd);
+}
+
+/*
+ * special kaio cleanup thread sits in a loop in the
+ * kernel waiting for pending kaio requests to complete.
+ */
+void *
+_kaio_cleanup_thread(void *arg)
+{
+ if (pthread_setspecific(_aio_key, arg) != 0)
+ aio_panic("_kaio_cleanup_thread, pthread_setspecific()");
+ (void) _kaio(AIOSTART);
+ return (arg);
+}
+
+/*
+ * initialize kaio.
+ */
+void
+_kaio_init()
+{
+ int error;
+ sigset_t oset;
+
+ lmutex_lock(&__aio_initlock);
+ while (__aio_initbusy)
+ (void) _cond_wait(&__aio_initcv, &__aio_initlock);
+ if (_kaio_ok) { /* already initialized */
+ lmutex_unlock(&__aio_initlock);
+ return;
+ }
+ __aio_initbusy = 1;
+ lmutex_unlock(&__aio_initlock);
+
+ if (_kaio_supported_init() != 0)
+ error = ENOMEM;
+ else if ((_kaiowp = _aio_worker_alloc()) == NULL)
+ error = ENOMEM;
+ else if ((error = (int)_kaio(AIOINIT)) == 0) {
+ (void) pthread_sigmask(SIG_SETMASK, &maskset, &oset);
+ error = thr_create(NULL, AIOSTKSIZE, _kaio_cleanup_thread,
+ _kaiowp, THR_DAEMON, &_kaiowp->work_tid);
+ (void) pthread_sigmask(SIG_SETMASK, &oset, NULL);
+ }
+ if (error && _kaiowp != NULL) {
+ _aio_worker_free(_kaiowp);
+ _kaiowp = NULL;
+ }
+
+ lmutex_lock(&__aio_initlock);
+ if (error)
+ _kaio_ok = -1;
+ else
+ _kaio_ok = 1;
+ __aio_initbusy = 0;
+ (void) cond_broadcast(&__aio_initcv);
+ lmutex_unlock(&__aio_initlock);
+}
+
+int
+aioread(int fd, caddr_t buf, int bufsz, off_t offset, int whence,
+ aio_result_t *resultp)
+{
+ return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOREAD));
+}
+
+int
+aiowrite(int fd, caddr_t buf, int bufsz, off_t offset, int whence,
+ aio_result_t *resultp)
+{
+ return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOWRITE));
+}
+
+#if !defined(_LP64)
+int
+aioread64(int fd, caddr_t buf, int bufsz, off64_t offset, int whence,
+ aio_result_t *resultp)
+{
+ return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOAREAD64));
+}
+
+int
+aiowrite64(int fd, caddr_t buf, int bufsz, off64_t offset, int whence,
+ aio_result_t *resultp)
+{
+ return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOAWRITE64));
+}
+#endif /* !defined(_LP64) */
+
+int
+_aiorw(int fd, caddr_t buf, int bufsz, offset_t offset, int whence,
+ aio_result_t *resultp, int mode)
+{
+ aio_req_t *reqp;
+ aio_args_t *ap;
+ offset_t loffset;
+ struct stat stat;
+ int error = 0;
+ int kerr;
+ int umode;
+
+ switch (whence) {
+
+ case SEEK_SET:
+ loffset = offset;
+ break;
+ case SEEK_CUR:
+ if ((loffset = llseek(fd, 0, SEEK_CUR)) == -1)
+ error = -1;
+ else
+ loffset += offset;
+ break;
+ case SEEK_END:
+ if (fstat(fd, &stat) == -1)
+ error = -1;
+ else
+ loffset = offset + stat.st_size;
+ break;
+ default:
+ errno = EINVAL;
+ error = -1;
+ }
+
+ if (error)
+ return (error);
+
+ /* initialize kaio */
+ if (!_kaio_ok)
+ _kaio_init();
+
+ /*
+ * _aio_do_request() needs the original request code (mode) to be able
+ * to choose the appropriate 32/64 bit function. All other functions
+ * only require the difference between READ and WRITE (umode).
+ */
+ if (mode == AIOAREAD64 || mode == AIOAWRITE64)
+ umode = mode - AIOAREAD64;
+ else
+ umode = mode;
+
+ /*
+ * Try kernel aio first.
+ * If errno is ENOTSUP/EBADFD, fall back to the thread implementation.
+ */
+ if (_kaio_ok > 0 && KAIO_SUPPORTED(fd)) {
+ resultp->aio_errno = 0;
+ sig_mutex_lock(&__aio_mutex);
+ _kaio_outstand_cnt++;
+ kerr = (int)_kaio(((resultp->aio_return == AIO_INPROGRESS) ?
+ (umode | AIO_POLL_BIT) : umode),
+ fd, buf, bufsz, loffset, resultp);
+ if (kerr == 0) {
+ sig_mutex_unlock(&__aio_mutex);
+ return (0);
+ }
+ _kaio_outstand_cnt--;
+ sig_mutex_unlock(&__aio_mutex);
+ if (errno != ENOTSUP && errno != EBADFD)
+ return (-1);
+ if (errno == EBADFD)
+ SET_KAIO_NOT_SUPPORTED(fd);
+ }
+
+ if (!__uaio_ok && __uaio_init() == -1)
+ return (-1);
+
+ if ((reqp = _aio_req_alloc()) == NULL) {
+ errno = EAGAIN;
+ return (-1);
+ }
+
+ /*
+ * _aio_do_request() checks reqp->req_op to differentiate
+ * between 32 and 64 bit access.
+ */
+ reqp->req_op = mode;
+ reqp->req_resultp = resultp;
+ ap = &reqp->req_args;
+ ap->fd = fd;
+ ap->buf = buf;
+ ap->bufsz = bufsz;
+ ap->offset = loffset;
+
+ if (_aio_hash_insert(resultp, reqp) != 0) {
+ _aio_req_free(reqp);
+ errno = EINVAL;
+ return (-1);
+ }
+ /*
+ * _aio_req_add() only needs the difference between READ and
+ * WRITE to choose the right worker queue.
+ */
+ _aio_req_add(reqp, &__nextworker_rw, umode);
+ return (0);
+}
+
+int
+aiocancel(aio_result_t *resultp)
+{
+ aio_req_t *reqp;
+ aio_worker_t *aiowp;
+ int ret;
+ int done = 0;
+ int canceled = 0;
+
+ if (!__uaio_ok) {
+ errno = EINVAL;
+ return (-1);
+ }
+
+ sig_mutex_lock(&__aio_mutex);
+ reqp = _aio_hash_find(resultp);
+ if (reqp == NULL) {
+ if (_aio_outstand_cnt == _aio_req_done_cnt)
+ errno = EINVAL;
+ else
+ errno = EACCES;
+ ret = -1;
+ } else {
+ aiowp = reqp->req_worker;
+ sig_mutex_lock(&aiowp->work_qlock1);
+ (void) _aio_cancel_req(aiowp, reqp, &canceled, &done);
+ sig_mutex_unlock(&aiowp->work_qlock1);
+
+ if (canceled) {
+ ret = 0;
+ } else {
+ if (_aio_outstand_cnt == 0 ||
+ _aio_outstand_cnt == _aio_req_done_cnt)
+ errno = EINVAL;
+ else
+ errno = EACCES;
+ ret = -1;
+ }
+ }
+ sig_mutex_unlock(&__aio_mutex);
+ return (ret);
+}
+
+/*
+ * This must be async-signal safe.
+ */
+aio_result_t *
+aiowait(struct timeval *uwait)
+{
+ aio_result_t *uresultp;
+ aio_result_t *kresultp;
+ aio_result_t *resultp;
+ int dontblock;
+ int timedwait = 0;
+ int kaio_errno = 0;
+ struct timeval twait;
+ struct timeval *wait = NULL;
+ hrtime_t hrtend;
+ hrtime_t hres;
+
+ if (uwait) {
+ /*
+ * Check for a valid specified wait time.
+ * If it is invalid, fail the call right away.
+ */
+ if (uwait->tv_sec < 0 || uwait->tv_usec < 0 ||
+ uwait->tv_usec >= MICROSEC) {
+ errno = EINVAL;
+ return ((aio_result_t *)-1);
+ }
+
+ if (uwait->tv_sec > 0 || uwait->tv_usec > 0) {
+ hrtend = gethrtime() +
+ (hrtime_t)uwait->tv_sec * NANOSEC +
+ (hrtime_t)uwait->tv_usec * (NANOSEC / MICROSEC);
+ twait = *uwait;
+ wait = &twait;
+ timedwait++;
+ } else {
+ /* polling */
+ sig_mutex_lock(&__aio_mutex);
+ if (_kaio_outstand_cnt == 0) {
+ kresultp = (aio_result_t *)-1;
+ } else {
+ kresultp = (aio_result_t *)_kaio(AIOWAIT,
+ (struct timeval *)-1, 1);
+ if (kresultp != (aio_result_t *)-1 &&
+ kresultp != NULL &&
+ kresultp != (aio_result_t *)1) {
+ _kaio_outstand_cnt--;
+ sig_mutex_unlock(&__aio_mutex);
+ return (kresultp);
+ }
+ }
+ uresultp = _aio_req_done();
+ sig_mutex_unlock(&__aio_mutex);
+ if (uresultp != NULL &&
+ uresultp != (aio_result_t *)-1) {
+ return (uresultp);
+ }
+ if (uresultp == (aio_result_t *)-1 &&
+ kresultp == (aio_result_t *)-1) {
+ errno = EINVAL;
+ return ((aio_result_t *)-1);
+ } else {
+ return (NULL);
+ }
+ }
+ }
+
+ for (;;) {
+ sig_mutex_lock(&__aio_mutex);
+ uresultp = _aio_req_done();
+ if (uresultp != NULL && uresultp != (aio_result_t *)-1) {
+ sig_mutex_unlock(&__aio_mutex);
+ resultp = uresultp;
+ break;
+ }
+ _aiowait_flag++;
+ dontblock = (uresultp == (aio_result_t *)-1);
+ if (dontblock && _kaio_outstand_cnt == 0) {
+ kresultp = (aio_result_t *)-1;
+ kaio_errno = EINVAL;
+ } else {
+ sig_mutex_unlock(&__aio_mutex);
+ kresultp = (aio_result_t *)_kaio(AIOWAIT,
+ wait, dontblock);
+ sig_mutex_lock(&__aio_mutex);
+ kaio_errno = errno;
+ }
+ _aiowait_flag--;
+ sig_mutex_unlock(&__aio_mutex);
+ if (kresultp == (aio_result_t *)1) {
+ /* aiowait() awakened by an aionotify() */
+ continue;
+ } else if (kresultp != NULL &&
+ kresultp != (aio_result_t *)-1) {
+ resultp = kresultp;
+ sig_mutex_lock(&__aio_mutex);
+ _kaio_outstand_cnt--;
+ sig_mutex_unlock(&__aio_mutex);
+ break;
+ } else if (kresultp == (aio_result_t *)-1 &&
+ kaio_errno == EINVAL &&
+ uresultp == (aio_result_t *)-1) {
+ errno = kaio_errno;
+ resultp = (aio_result_t *)-1;
+ break;
+ } else if (kresultp == (aio_result_t *)-1 &&
+ kaio_errno == EINTR) {
+ errno = kaio_errno;
+ resultp = (aio_result_t *)-1;
+ break;
+ } else if (timedwait) {
+ hres = hrtend - gethrtime();
+ if (hres <= 0) {
+ /* time is up; return */
+ resultp = NULL;
+ break;
+ } else {
+ /*
+				 * Some time left. Round up the remaining time
+				 * in nanoseconds to microseconds and retry the call.
+ */
+ hres += (NANOSEC / MICROSEC) - 1;
+ wait->tv_sec = hres / NANOSEC;
+ wait->tv_usec =
+ (hres % NANOSEC) / (NANOSEC / MICROSEC);
+ }
+ } else {
+ ASSERT(kresultp == NULL && uresultp == NULL);
+ resultp = NULL;
+ continue;
+ }
+ }
+ return (resultp);
+}
+
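+/*
+ * Illustrative timeout handling for aiowait() (a sketch only): a NULL
+ * argument blocks indefinitely, a zeroed timeval polls without
+ * blocking, and a non-zero timeval bounds the wait:
+ *
+ *	struct timeval tv;
+ *	tv.tv_sec = 0;
+ *	tv.tv_usec = 500000;	(wait at most half a second)
+ *	aio_result_t *resp = aiowait(&tv);
+ *
+ * A NULL return means no request completed in time; (aio_result_t *)-1
+ * with errno == EINVAL means there were no outstanding requests (or
+ * that the timeval itself was invalid).
+ */
+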
+/*
+ * _aio_get_timedelta calculates the remaining time and stores the result
+ * into timespec_t *wait.
+ */
+
+int
+_aio_get_timedelta(timespec_t *end, timespec_t *wait)
+{
+ int ret = 0;
+ struct timeval cur;
+ timespec_t curtime;
+
+ (void) gettimeofday(&cur, NULL);
+ curtime.tv_sec = cur.tv_sec;
+ curtime.tv_nsec = cur.tv_usec * 1000; /* convert us to ns */
+
+ if (end->tv_sec >= curtime.tv_sec) {
+ wait->tv_sec = end->tv_sec - curtime.tv_sec;
+ if (end->tv_nsec >= curtime.tv_nsec) {
+ wait->tv_nsec = end->tv_nsec - curtime.tv_nsec;
+ if (wait->tv_sec == 0 && wait->tv_nsec == 0)
+ ret = -1; /* timer expired */
+ } else {
+ if (end->tv_sec > curtime.tv_sec) {
+ wait->tv_sec -= 1;
+ wait->tv_nsec = NANOSEC -
+ (curtime.tv_nsec - end->tv_nsec);
+ } else {
+ ret = -1; /* timer expired */
+ }
+ }
+ } else {
+ ret = -1;
+ }
+ return (ret);
+}
+
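+/*
+ * Worked example (hypothetical values): with *end == {10, 500000000}
+ * and a current time of {9, 800000000}, the borrow case applies:
+ *
+ *	wait->tv_sec  = (10 - 9) - 1 = 0
+ *	wait->tv_nsec = NANOSEC - (800000000 - 500000000) = 700000000
+ *
+ * so the caller retries with a 0.7 second wait.  Once the deadline
+ * has passed, the function returns -1 instead.
+ */
+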
+/*
+ * If closing by file descriptor: we simply cancel all the outstanding
+ * aio's and return. The aio's in question will have either noticed the
+ * cancellation before, during, or after initiating I/O.
+ */
+int
+aiocancel_all(int fd)
+{
+ aio_req_t *reqp;
+ aio_req_t **reqpp;
+ aio_worker_t *first;
+ aio_worker_t *next;
+ int canceled = 0;
+ int done = 0;
+ int cancelall = 0;
+
+ sig_mutex_lock(&__aio_mutex);
+
+ if (_aio_outstand_cnt == 0) {
+ sig_mutex_unlock(&__aio_mutex);
+ return (AIO_ALLDONE);
+ }
+
+ /*
+ * Cancel requests from the read/write workers' queues.
+ */
+ first = __nextworker_rw;
+ next = first;
+ do {
+ _aio_cancel_work(next, fd, &canceled, &done);
+ } while ((next = next->work_forw) != first);
+
+ /*
+ * finally, check if there are requests on the done queue that
+ * should be canceled.
+ */
+ if (fd < 0)
+ cancelall = 1;
+ reqpp = &_aio_done_tail;
+ while ((reqp = *reqpp) != NULL) {
+ if (cancelall || reqp->req_args.fd == fd) {
+ *reqpp = reqp->req_next;
+ _aio_donecnt--;
+ (void) _aio_hash_del(reqp->req_resultp);
+ _aio_req_free(reqp);
+ } else
+ reqpp = &reqp->req_next;
+ }
+ if (cancelall) {
+ ASSERT(_aio_donecnt == 0);
+ _aio_done_head = NULL;
+ }
+ sig_mutex_unlock(&__aio_mutex);
+
+ if (canceled && done == 0)
+ return (AIO_CANCELED);
+ else if (done && canceled == 0)
+ return (AIO_ALLDONE);
+ else if ((canceled + done == 0) && KAIO_SUPPORTED(fd))
+ return ((int)_kaio(AIOCANCEL, fd, NULL));
+ return (AIO_NOTCANCELED);
+}
+
+/*
+ * Cancel requests from a given work queue. If the file descriptor
+ * parameter, fd, is non-negative, then only cancel those requests
+ * in this queue that apply to this file descriptor. If the fd
+ * parameter is -1, then cancel all requests.
+ */
+static void
+_aio_cancel_work(aio_worker_t *aiowp, int fd, int *canceled, int *done)
+{
+ aio_req_t *reqp;
+
+ sig_mutex_lock(&aiowp->work_qlock1);
+ /*
+ * cancel queued requests first.
+ */
+ reqp = aiowp->work_tail1;
+ while (reqp != NULL) {
+ if (fd < 0 || reqp->req_args.fd == fd) {
+ if (_aio_cancel_req(aiowp, reqp, canceled, done)) {
+ /*
+				 * The caller's locks were dropped.
+ * reqp is invalid; start traversing
+ * the list from the beginning again.
+ */
+ reqp = aiowp->work_tail1;
+ continue;
+ }
+ }
+ reqp = reqp->req_next;
+ }
+ /*
+ * Since the queued requests have been canceled, there can
+	 * only be one in-progress request that should be canceled.
+ */
+ if ((reqp = aiowp->work_req) != NULL &&
+ (fd < 0 || reqp->req_args.fd == fd))
+ (void) _aio_cancel_req(aiowp, reqp, canceled, done);
+ sig_mutex_unlock(&aiowp->work_qlock1);
+}
+
+/*
+ * Cancel a request. Return 1 if the caller's locks were temporarily
+ * dropped, otherwise return 0.
+ */
+int
+_aio_cancel_req(aio_worker_t *aiowp, aio_req_t *reqp, int *canceled, int *done)
+{
+ int ostate = reqp->req_state;
+
+ ASSERT(MUTEX_HELD(&__aio_mutex));
+ ASSERT(MUTEX_HELD(&aiowp->work_qlock1));
+ if (ostate == AIO_REQ_CANCELED)
+ return (0);
+ if (ostate == AIO_REQ_DONE || ostate == AIO_REQ_DONEQ) {
+ (*done)++;
+ return (0);
+ }
+ if (reqp->req_op == AIOFSYNC && reqp != aiowp->work_req) {
+ ASSERT(POSIX_AIO(reqp));
+ /* Cancel the queued aio_fsync() request */
+ if (!reqp->req_head->lio_canned) {
+ reqp->req_head->lio_canned = 1;
+ _aio_outstand_cnt--;
+ (*canceled)++;
+ }
+ return (0);
+ }
+ reqp->req_state = AIO_REQ_CANCELED;
+ _aio_req_del(aiowp, reqp, ostate);
+ (void) _aio_hash_del(reqp->req_resultp);
+ (*canceled)++;
+ if (reqp == aiowp->work_req) {
+ ASSERT(ostate == AIO_REQ_INPROGRESS);
+ /*
+ * Set the result values now, before _aiodone() is called.
+ * We do this because the application can expect aio_return
+ * and aio_errno to be set to -1 and ECANCELED, respectively,
+ * immediately after a successful return from aiocancel()
+ * or aio_cancel().
+ */
+ _aio_set_result(reqp, -1, ECANCELED);
+ (void) thr_kill(aiowp->work_tid, SIGAIOCANCEL);
+ return (0);
+ }
+ if (!POSIX_AIO(reqp)) {
+ _aio_outstand_cnt--;
+ _aio_set_result(reqp, -1, ECANCELED);
+ return (0);
+ }
+ sig_mutex_unlock(&aiowp->work_qlock1);
+ sig_mutex_unlock(&__aio_mutex);
+ _aiodone(reqp, -1, ECANCELED);
+ sig_mutex_lock(&__aio_mutex);
+ sig_mutex_lock(&aiowp->work_qlock1);
+ return (1);
+}
+
+int
+_aio_create_worker(aio_req_t *reqp, int mode)
+{
+ aio_worker_t *aiowp, **workers, **nextworker;
+ int *aio_workerscnt;
+ void *(*func)(void *);
+ sigset_t oset;
+ int error;
+
+ /*
+ * Put the new worker thread in the right queue.
+ */
+ switch (mode) {
+ case AIOREAD:
+ case AIOWRITE:
+ case AIOAREAD:
+ case AIOAWRITE:
+#if !defined(_LP64)
+ case AIOAREAD64:
+ case AIOAWRITE64:
+#endif
+ workers = &__workers_rw;
+ nextworker = &__nextworker_rw;
+ aio_workerscnt = &__rw_workerscnt;
+ func = _aio_do_request;
+ break;
+ case AIONOTIFY:
+ workers = &__workers_no;
+ nextworker = &__nextworker_no;
+ func = _aio_do_notify;
+ aio_workerscnt = &__no_workerscnt;
+ break;
+ default:
+ aio_panic("_aio_create_worker: invalid mode");
+ break;
+ }
+
+ if ((aiowp = _aio_worker_alloc()) == NULL)
+ return (-1);
+
+ if (reqp) {
+ reqp->req_state = AIO_REQ_QUEUED;
+ reqp->req_worker = aiowp;
+ aiowp->work_head1 = reqp;
+ aiowp->work_tail1 = reqp;
+ aiowp->work_next1 = reqp;
+ aiowp->work_count1 = 1;
+ aiowp->work_minload1 = 1;
+ }
+
+ (void) pthread_sigmask(SIG_SETMASK, &maskset, &oset);
+ error = thr_create(NULL, AIOSTKSIZE, func, aiowp,
+ THR_DAEMON | THR_SUSPENDED, &aiowp->work_tid);
+ (void) pthread_sigmask(SIG_SETMASK, &oset, NULL);
+ if (error) {
+ if (reqp) {
+ reqp->req_state = 0;
+ reqp->req_worker = NULL;
+ }
+ _aio_worker_free(aiowp);
+ return (-1);
+ }
+
+ lmutex_lock(&__aio_mutex);
+ (*aio_workerscnt)++;
+ if (*workers == NULL) {
+ aiowp->work_forw = aiowp;
+ aiowp->work_backw = aiowp;
+ *nextworker = aiowp;
+ *workers = aiowp;
+ } else {
+ aiowp->work_backw = (*workers)->work_backw;
+ aiowp->work_forw = (*workers);
+ (*workers)->work_backw->work_forw = aiowp;
+ (*workers)->work_backw = aiowp;
+ }
+ _aio_worker_cnt++;
+ lmutex_unlock(&__aio_mutex);
+
+ (void) thr_continue(aiowp->work_tid);
+
+ return (0);
+}
+
+/*
+ * This is the worker's main routine.
+ * The task of this function is to execute all queued requests;
+ * once the last pending request is executed this function will block
+ * in _aio_idle(). A new incoming request must wake up this thread to
+ * restart the work.
+ * Every worker has its own work queue. The queue lock is required
+ * to synchronize the addition of new requests for this worker or
+ * cancellation of pending/running requests.
+ *
+ * Cancellation scenarios:
+ * The cancellation of a request is done asynchronously using
+ * _aio_cancel_req() from another thread context.
+ * A queued request can be cancelled in different ways:
+ * a) request is queued but not "in progress" or "done" (AIO_REQ_QUEUED):
+ * - lock the queue -> remove the request -> unlock the queue
+ * - this function/thread does not detect this cancellation process
+ * b) request is in progress (AIO_REQ_INPROGRESS):
+ *	- this function first allows the cancellation of the running
+ * request with the flag "work_cancel_flg=1"
+ * see _aio_req_get() -> _aio_cancel_on()
+ * During this phase, it is allowed to interrupt the worker
+ * thread running the request (this thread) using the SIGAIOCANCEL
+ * signal.
+ *	  Once this thread returns from the kernel (because the request
+ *	  has just completed), it must disable a possible cancellation
+ *	  and proceed to finish the request. To disable the cancellation
+ * this thread must use _aio_cancel_off() to set "work_cancel_flg=0".
+ * c) request is already done (AIO_REQ_DONE || AIO_REQ_DONEQ):
+ * same procedure as in a)
+ *
+ * Regarding case b):
+ * This thread uses sigsetjmp() to define the position in the code where
+ * it wishes to continue working in the case that a SIGAIOCANCEL signal
+ * is detected.
+ * Normally this thread should get the cancellation signal during the
+ * kernel phase (reading or writing). In that case the signal handler
+ * aiosigcancelhndlr() is activated using the worker thread context,
+ * which again will use the siglongjmp() function to break the standard
+ * code flow and jump to the "sigsetjmp" position, provided that
+ * "work_cancel_flg" is set to "1".
+ * Because the "work_cancel_flg" is only manipulated by this worker
+ * thread and it can only run on one CPU at a given time, it is not
+ * necessary to protect that flag with the queue lock.
+ * Returning from the kernel (read or write system call) we must
+ * first disable the use of the SIGAIOCANCEL signal and accordingly
+ * the use of the siglongjmp() function to prevent a possible deadlock:
+ * - It can happen that this worker thread returns from the kernel and
+ * blocks in "work_qlock1",
+ * - then a second thread cancels the apparently "in progress" request
+ * and sends the SIGAIOCANCEL signal to the worker thread,
+ * - the worker thread acquires the "work_qlock1" and returns
+ * from the kernel,
+ * - the kernel detects the pending signal and activates the signal
+ * handler instead,
+ * - if the "work_cancel_flg" is still set then the signal handler
+ * should use siglongjmp() to cancel the "in progress" request and
+ * it would try to acquire the same work_qlock1 in _aio_req_get()
+ * for a second time => deadlock.
+ * To avoid that situation we disable the cancellation of the request
+ * in progress BEFORE we try to acquire the work_qlock1.
+ * In that case the signal handler will not call siglongjmp() and the
+ * worker thread will continue running the standard code flow.
+ * Then this thread must check the AIO_REQ_CANCELED flag to emulate
+ * the siglongjmp() that would otherwise have been required, freeing
+ * the work_qlock1 and avoiding a deadlock.
+ */
+void *
+_aio_do_request(void *arglist)
+{
+ aio_worker_t *aiowp = (aio_worker_t *)arglist;
+ ulwp_t *self = curthread;
+ struct aio_args *arg;
+ aio_req_t *reqp; /* current AIO request */
+ ssize_t retval;
+ int error;
+
+ if (pthread_setspecific(_aio_key, aiowp) != 0)
+ aio_panic("_aio_do_request, pthread_setspecific()");
+ (void) pthread_sigmask(SIG_SETMASK, &_worker_set, NULL);
+ ASSERT(aiowp->work_req == NULL);
+
+ /*
+ * We resume here when an operation is cancelled.
+ * On first entry, aiowp->work_req == NULL, so all
+ * we do is block SIGAIOCANCEL.
+ */
+ (void) sigsetjmp(aiowp->work_jmp_buf, 0);
+ ASSERT(self->ul_sigdefer == 0);
+
+ sigoff(self); /* block SIGAIOCANCEL */
+ if (aiowp->work_req != NULL)
+ _aio_finish_request(aiowp, -1, ECANCELED);
+
+ for (;;) {
+ /*
+ * Put completed requests on aio_done_list. This has
+ * to be done as part of the main loop to ensure that
+ * we don't artificially starve any aiowait'ers.
+ */
+ if (aiowp->work_done1)
+ _aio_work_done(aiowp);
+
+top:
+ /* consume any deferred SIGAIOCANCEL signal here */
+ sigon(self);
+ sigoff(self);
+
+ while ((reqp = _aio_req_get(aiowp)) == NULL) {
+ if (_aio_idle(aiowp) != 0)
+ goto top;
+ }
+ arg = &reqp->req_args;
+ ASSERT(reqp->req_state == AIO_REQ_INPROGRESS ||
+ reqp->req_state == AIO_REQ_CANCELED);
+ error = 0;
+
+ switch (reqp->req_op) {
+ case AIOREAD:
+ case AIOAREAD:
+ sigon(self); /* unblock SIGAIOCANCEL */
+ retval = pread(arg->fd, arg->buf,
+ arg->bufsz, arg->offset);
+ if (retval == -1) {
+ if (errno == ESPIPE) {
+ retval = read(arg->fd,
+ arg->buf, arg->bufsz);
+ if (retval == -1)
+ error = errno;
+ } else {
+ error = errno;
+ }
+ }
+ sigoff(self); /* block SIGAIOCANCEL */
+ break;
+ case AIOWRITE:
+ case AIOAWRITE:
+ sigon(self); /* unblock SIGAIOCANCEL */
+ retval = pwrite(arg->fd, arg->buf,
+ arg->bufsz, arg->offset);
+ if (retval == -1) {
+ if (errno == ESPIPE) {
+ retval = write(arg->fd,
+ arg->buf, arg->bufsz);
+ if (retval == -1)
+ error = errno;
+ } else {
+ error = errno;
+ }
+ }
+ sigoff(self); /* block SIGAIOCANCEL */
+ break;
+#if !defined(_LP64)
+ case AIOAREAD64:
+ sigon(self); /* unblock SIGAIOCANCEL */
+ retval = pread64(arg->fd, arg->buf,
+ arg->bufsz, arg->offset);
+ if (retval == -1) {
+ if (errno == ESPIPE) {
+ retval = read(arg->fd,
+ arg->buf, arg->bufsz);
+ if (retval == -1)
+ error = errno;
+ } else {
+ error = errno;
+ }
+ }
+ sigoff(self); /* block SIGAIOCANCEL */
+ break;
+ case AIOAWRITE64:
+ sigon(self); /* unblock SIGAIOCANCEL */
+ retval = pwrite64(arg->fd, arg->buf,
+ arg->bufsz, arg->offset);
+ if (retval == -1) {
+ if (errno == ESPIPE) {
+ retval = write(arg->fd,
+ arg->buf, arg->bufsz);
+ if (retval == -1)
+ error = errno;
+ } else {
+ error = errno;
+ }
+ }
+ sigoff(self); /* block SIGAIOCANCEL */
+ break;
+#endif /* !defined(_LP64) */
+ case AIOFSYNC:
+ if (_aio_fsync_del(aiowp, reqp))
+ goto top;
+ ASSERT(reqp->req_head == NULL);
+ /*
+ * All writes for this fsync request are now
+ * acknowledged. Now make these writes visible
+ * and put the final request into the hash table.
+ */
+ if (reqp->req_state == AIO_REQ_CANCELED) {
+ /* EMPTY */;
+ } else if (arg->offset == O_SYNC) {
+ if ((retval = __fdsync(arg->fd, FSYNC)) == -1)
+ error = errno;
+ } else {
+ if ((retval = __fdsync(arg->fd, FDSYNC)) == -1)
+ error = errno;
+ }
+ if (_aio_hash_insert(reqp->req_resultp, reqp) != 0)
+ aio_panic("_aio_do_request(): AIOFSYNC: "
+ "request already in hash table");
+ break;
+ default:
+ aio_panic("_aio_do_request, bad op");
+ }
+
+ _aio_finish_request(aiowp, retval, error);
+ }
+ /* NOTREACHED */
+ return (NULL);
+}
+
+/*
+ * Perform the tail processing for _aio_do_request().
+ * The in-progress request may or may not have been cancelled.
+ */
+static void
+_aio_finish_request(aio_worker_t *aiowp, ssize_t retval, int error)
+{
+ aio_req_t *reqp;
+
+ sig_mutex_lock(&aiowp->work_qlock1);
+ if ((reqp = aiowp->work_req) == NULL)
+ sig_mutex_unlock(&aiowp->work_qlock1);
+ else {
+ aiowp->work_req = NULL;
+ if (reqp->req_state == AIO_REQ_CANCELED) {
+ retval = -1;
+ error = ECANCELED;
+ }
+ if (!POSIX_AIO(reqp)) {
+ sig_mutex_unlock(&aiowp->work_qlock1);
+ sig_mutex_lock(&__aio_mutex);
+ if (reqp->req_state == AIO_REQ_INPROGRESS)
+ reqp->req_state = AIO_REQ_DONE;
+ _aio_req_done_cnt++;
+ _aio_set_result(reqp, retval, error);
+ if (error == ECANCELED)
+ _aio_outstand_cnt--;
+ sig_mutex_unlock(&__aio_mutex);
+ } else {
+ if (reqp->req_state == AIO_REQ_INPROGRESS)
+ reqp->req_state = AIO_REQ_DONE;
+ sig_mutex_unlock(&aiowp->work_qlock1);
+ _aiodone(reqp, retval, error);
+ }
+ }
+}
+
+void
+_aio_req_mark_done(aio_req_t *reqp)
+{
+#if !defined(_LP64)
+ if (reqp->req_largefile)
+ ((aiocb64_t *)reqp->req_aiocbp)->aio_state = USERAIO_DONE;
+ else
+#endif
+ ((aiocb_t *)reqp->req_aiocbp)->aio_state = USERAIO_DONE;
+}
+
+/*
+ * Sleep for 'ticks' clock ticks to give somebody else a chance to run,
+ * hopefully to consume one of our queued signals.
+ */
+static void
+_aio_delay(int ticks)
+{
+ (void) usleep(ticks * (MICROSEC / hz));
+}
+
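+/*
+ * For example, assuming the traditional clock tick rate of hz == 100,
+ * _aio_delay(1) becomes usleep(10000), i.e. roughly 10 milliseconds.
+ */
+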
+/*
+ * Actually send the notifications.
+ * We could block indefinitely here if the application
+ * is not listening for the signal or port notifications.
+ */
+static void
+send_notification(notif_param_t *npp)
+{
+ extern int __sigqueue(pid_t pid, int signo,
+ /* const union sigval */ void *value, int si_code, int block);
+
+ if (npp->np_signo)
+ (void) __sigqueue(__pid, npp->np_signo, npp->np_user,
+ SI_ASYNCIO, 1);
+ else if (npp->np_port >= 0)
+ (void) _port_dispatch(npp->np_port, 0, PORT_SOURCE_AIO,
+ npp->np_event, npp->np_object, npp->np_user);
+
+ if (npp->np_lio_signo)
+ (void) __sigqueue(__pid, npp->np_lio_signo, npp->np_lio_user,
+ SI_ASYNCIO, 1);
+ else if (npp->np_lio_port >= 0)
+ (void) _port_dispatch(npp->np_lio_port, 0, PORT_SOURCE_AIO,
+ npp->np_lio_event, npp->np_lio_object, npp->np_lio_user);
+}
+
+/*
+ * Asynchronous notification worker.
+ */
+void *
+_aio_do_notify(void *arg)
+{
+ aio_worker_t *aiowp = (aio_worker_t *)arg;
+ aio_req_t *reqp;
+
+ /*
+ * This isn't really necessary. All signals are blocked.
+ */
+ if (pthread_setspecific(_aio_key, aiowp) != 0)
+ aio_panic("_aio_do_notify, pthread_setspecific()");
+
+ /*
+ * Notifications are never cancelled.
+ * All signals remain blocked, forever.
+ */
+ for (;;) {
+ while ((reqp = _aio_req_get(aiowp)) == NULL) {
+ if (_aio_idle(aiowp) != 0)
+ aio_panic("_aio_do_notify: _aio_idle() failed");
+ }
+ send_notification(&reqp->req_notify);
+ _aio_req_free(reqp);
+ }
+
+ /* NOTREACHED */
+ return (NULL);
+}
+
+/*
+ * Do the completion semantics for a request that was either canceled
+ * by _aio_cancel_req() or was completed by _aio_do_request().
+ */
+static void
+_aiodone(aio_req_t *reqp, ssize_t retval, int error)
+{
+ aio_result_t *resultp = reqp->req_resultp;
+ int notify = 0;
+ aio_lio_t *head;
+ int sigev_none;
+ int sigev_signal;
+ int sigev_thread;
+ int sigev_port;
+ notif_param_t np;
+
+ /*
+ * We call _aiodone() only for Posix I/O.
+ */
+ ASSERT(POSIX_AIO(reqp));
+
+ sigev_none = 0;
+ sigev_signal = 0;
+ sigev_thread = 0;
+ sigev_port = 0;
+ np.np_signo = 0;
+ np.np_port = -1;
+ np.np_lio_signo = 0;
+ np.np_lio_port = -1;
+
+ switch (reqp->req_sigevent.sigev_notify) {
+ case SIGEV_NONE:
+ sigev_none = 1;
+ break;
+ case SIGEV_SIGNAL:
+ sigev_signal = 1;
+ break;
+ case SIGEV_THREAD:
+ sigev_thread = 1;
+ break;
+ case SIGEV_PORT:
+ sigev_port = 1;
+ break;
+ default:
+ aio_panic("_aiodone: improper sigev_notify");
+ break;
+ }
+
+ /*
+ * Figure out the notification parameters while holding __aio_mutex.
+ * Actually perform the notifications after dropping __aio_mutex.
+ * This allows us to sleep for a long time (if the notifications
+ * incur delays) without impeding other async I/O operations.
+ */
+
+ sig_mutex_lock(&__aio_mutex);
+
+ if (sigev_signal) {
+ if ((np.np_signo = reqp->req_sigevent.sigev_signo) != 0)
+ notify = 1;
+ np.np_user = reqp->req_sigevent.sigev_value.sival_ptr;
+ } else if (sigev_thread | sigev_port) {
+ if ((np.np_port = reqp->req_sigevent.sigev_signo) >= 0)
+ notify = 1;
+ np.np_event = reqp->req_op;
+ if (np.np_event == AIOFSYNC && reqp->req_largefile)
+ np.np_event = AIOFSYNC64;
+ np.np_object = (uintptr_t)reqp->req_aiocbp;
+ np.np_user = reqp->req_sigevent.sigev_value.sival_ptr;
+ }
+
+ if (resultp->aio_errno == EINPROGRESS)
+ _aio_set_result(reqp, retval, error);
+
+ _aio_outstand_cnt--;
+
+ head = reqp->req_head;
+ reqp->req_head = NULL;
+
+ if (sigev_none) {
+ _aio_enq_doneq(reqp);
+ reqp = NULL;
+ } else {
+ (void) _aio_hash_del(resultp);
+ _aio_req_mark_done(reqp);
+ }
+
+ _aio_waitn_wakeup();
+
+ /*
+ * __aio_waitn() sets AIO_WAIT_INPROGRESS and
+ * __aio_suspend() increments "_aio_kernel_suspend"
+ * when they are waiting in the kernel for completed I/Os.
+ *
+	 * _kaio(AIONOTIFY) awakens the corresponding function
+	 * in the kernel; then the corresponding __aio_waitn() or
+	 * __aio_suspend() function can reap the recently
+ * completed I/Os (_aiodone()).
+ */
+ if ((_aio_flags & AIO_WAIT_INPROGRESS) || _aio_kernel_suspend > 0)
+ (void) _kaio(AIONOTIFY);
+
+ sig_mutex_unlock(&__aio_mutex);
+
+ if (head != NULL) {
+ /*
+ * If all the lio requests have completed,
+ * prepare to notify the waiting thread.
+ */
+ sig_mutex_lock(&head->lio_mutex);
+ ASSERT(head->lio_refcnt == head->lio_nent);
+ if (head->lio_refcnt == 1) {
+ int waiting = 0;
+ if (head->lio_mode == LIO_WAIT) {
+ if ((waiting = head->lio_waiting) != 0)
+ (void) cond_signal(&head->lio_cond_cv);
+ } else if (head->lio_port < 0) { /* none or signal */
+ if ((np.np_lio_signo = head->lio_signo) != 0)
+ notify = 1;
+ np.np_lio_user = head->lio_sigval.sival_ptr;
+ } else { /* thread or port */
+ notify = 1;
+ np.np_lio_port = head->lio_port;
+ np.np_lio_event = head->lio_event;
+ np.np_lio_object =
+ (uintptr_t)head->lio_sigevent;
+ np.np_lio_user = head->lio_sigval.sival_ptr;
+ }
+ head->lio_nent = head->lio_refcnt = 0;
+ sig_mutex_unlock(&head->lio_mutex);
+ if (waiting == 0)
+ _aio_lio_free(head);
+ } else {
+ head->lio_nent--;
+ head->lio_refcnt--;
+ sig_mutex_unlock(&head->lio_mutex);
+ }
+ }
+
+ /*
+ * The request is completed; now perform the notifications.
+ */
+ if (notify) {
+ if (reqp != NULL) {
+ /*
+ * We usually put the request on the notification
+ * queue because we don't want to block and delay
+ * other operations behind us in the work queue.
+ * Also we must never block on a cancel notification
+ * because we are being called from an application
+ * thread in this case and that could lead to deadlock
+			 * if no other thread is receiving notifications.
+ */
+ reqp->req_notify = np;
+ reqp->req_op = AIONOTIFY;
+ _aio_req_add(reqp, &__workers_no, AIONOTIFY);
+ reqp = NULL;
+ } else {
+ /*
+ * We already put the request on the done queue,
+ * so we can't queue it to the notification queue.
+ * Just do the notification directly.
+ */
+ send_notification(&np);
+ }
+ }
+
+ if (reqp != NULL)
+ _aio_req_free(reqp);
+}
+
+/*
+ * Delete fsync requests from list head until there is
+ * only one left. Return 0 when there is only one,
+ * otherwise return a non-zero value.
+ */
+static int
+_aio_fsync_del(aio_worker_t *aiowp, aio_req_t *reqp)
+{
+ aio_lio_t *head = reqp->req_head;
+ int rval = 0;
+
+ ASSERT(reqp == aiowp->work_req);
+ sig_mutex_lock(&aiowp->work_qlock1);
+ sig_mutex_lock(&head->lio_mutex);
+ if (head->lio_refcnt > 1) {
+ head->lio_refcnt--;
+ head->lio_nent--;
+ aiowp->work_req = NULL;
+ sig_mutex_unlock(&head->lio_mutex);
+ sig_mutex_unlock(&aiowp->work_qlock1);
+ sig_mutex_lock(&__aio_mutex);
+ _aio_outstand_cnt--;
+ _aio_waitn_wakeup();
+ sig_mutex_unlock(&__aio_mutex);
+ _aio_req_free(reqp);
+ return (1);
+ }
+ ASSERT(head->lio_nent == 1 && head->lio_refcnt == 1);
+ reqp->req_head = NULL;
+ if (head->lio_canned)
+ reqp->req_state = AIO_REQ_CANCELED;
+ if (head->lio_mode == LIO_DESTROY) {
+ aiowp->work_req = NULL;
+ rval = 1;
+ }
+ sig_mutex_unlock(&head->lio_mutex);
+ sig_mutex_unlock(&aiowp->work_qlock1);
+ head->lio_refcnt--;
+ head->lio_nent--;
+ _aio_lio_free(head);
+ if (rval != 0)
+ _aio_req_free(reqp);
+ return (rval);
+}
+
+/*
+ * A worker is set idle when its work queue is empty.
+ * The worker checks again that it has no more work
+ * and then goes to sleep waiting for more work.
+ */
+int
+_aio_idle(aio_worker_t *aiowp)
+{
+ int error = 0;
+
+ sig_mutex_lock(&aiowp->work_qlock1);
+ if (aiowp->work_count1 == 0) {
+ ASSERT(aiowp->work_minload1 == 0);
+ aiowp->work_idleflg = 1;
+ /*
+ * A cancellation handler is not needed here.
+ * aio worker threads are never cancelled via pthread_cancel().
+ */
+ error = sig_cond_wait(&aiowp->work_idle_cv,
+ &aiowp->work_qlock1);
+ /*
+		 * The idle flag is normally cleared before the worker is
+		 * awakened by _aio_req_add(). On error (EINTR), we clear
+		 * it ourselves.
+ */
+ if (error)
+ aiowp->work_idleflg = 0;
+ }
+ sig_mutex_unlock(&aiowp->work_qlock1);
+ return (error);
+}
+
+/*
+ * A worker's completed AIO requests are placed onto a global
+ * done queue. The application is only sent a SIGIO signal if
+ * the process has a handler enabled and it is not waiting via
+ * aiowait().
+ */
+static void
+_aio_work_done(aio_worker_t *aiowp)
+{
+ aio_req_t *reqp;
+
+ sig_mutex_lock(&aiowp->work_qlock1);
+ reqp = aiowp->work_prev1;
+ reqp->req_next = NULL;
+ aiowp->work_done1 = 0;
+ aiowp->work_tail1 = aiowp->work_next1;
+ if (aiowp->work_tail1 == NULL)
+ aiowp->work_head1 = NULL;
+ aiowp->work_prev1 = NULL;
+ sig_mutex_unlock(&aiowp->work_qlock1);
+ sig_mutex_lock(&__aio_mutex);
+ _aio_donecnt++;
+ _aio_outstand_cnt--;
+ _aio_req_done_cnt--;
+ ASSERT(_aio_donecnt > 0 &&
+ _aio_outstand_cnt >= 0 &&
+ _aio_req_done_cnt >= 0);
+ ASSERT(reqp != NULL);
+
+ if (_aio_done_tail == NULL) {
+ _aio_done_head = _aio_done_tail = reqp;
+ } else {
+ _aio_done_head->req_next = reqp;
+ _aio_done_head = reqp;
+ }
+
+ if (_aiowait_flag) {
+ sig_mutex_unlock(&__aio_mutex);
+ (void) _kaio(AIONOTIFY);
+ } else {
+ sig_mutex_unlock(&__aio_mutex);
+ if (_sigio_enabled)
+ (void) kill(__pid, SIGIO);
+ }
+}
+
+/*
+ * The done queue consists of AIO requests that are in either the
+ * AIO_REQ_DONE or AIO_REQ_CANCELED state. Requests that were cancelled
+ * are discarded. If the done queue is empty then NULL is returned.
+ * Otherwise the address of a done aio_result_t is returned.
+ */
+aio_result_t *
+_aio_req_done(void)
+{
+ aio_req_t *reqp;
+ aio_result_t *resultp;
+
+ ASSERT(MUTEX_HELD(&__aio_mutex));
+
+ if ((reqp = _aio_done_tail) != NULL) {
+ if ((_aio_done_tail = reqp->req_next) == NULL)
+ _aio_done_head = NULL;
+ ASSERT(_aio_donecnt > 0);
+ _aio_donecnt--;
+ (void) _aio_hash_del(reqp->req_resultp);
+ resultp = reqp->req_resultp;
+ ASSERT(reqp->req_state == AIO_REQ_DONE);
+ _aio_req_free(reqp);
+ return (resultp);
+ }
+ /* is queue empty? */
+ if (reqp == NULL && _aio_outstand_cnt == 0) {
+ return ((aio_result_t *)-1);
+ }
+ return (NULL);
+}
+
+/*
+ * Set the return and errno values for the application's use.
+ *
+ * For the Posix interfaces, we must set the return value first followed
+ * by the errno value because the Posix interfaces allow for a change
+ * in the errno value from EINPROGRESS to something else to signal
+ * the completion of the asynchronous request.
+ *
+ * The opposite is true for the Solaris interfaces. These allow for
+ * a change in the return value from AIO_INPROGRESS to something else
+ * to signal the completion of the asynchronous request.
+ */
+void
+_aio_set_result(aio_req_t *reqp, ssize_t retval, int error)
+{
+ aio_result_t *resultp = reqp->req_resultp;
+
+ if (POSIX_AIO(reqp)) {
+ resultp->aio_return = retval;
+ membar_producer();
+ resultp->aio_errno = error;
+ } else {
+ resultp->aio_errno = error;
+ membar_producer();
+ resultp->aio_return = retval;
+ }
+}
+
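+/*
+ * A sketch of why the store order matters for the Posix case: a caller
+ * typically polls aio_error() and fetches the return value only after
+ * the error code leaves EINPROGRESS:
+ *
+ *	while (aio_error(aiocbp) == EINPROGRESS)
+ *		;
+ *	retval = aio_return(aiocbp);
+ *
+ * Storing aio_return before aio_errno, separated by membar_producer(),
+ * guarantees that the return value is valid by the time the final
+ * errno value becomes visible to the caller.
+ */
+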
+/*
+ * Add an AIO request onto the next work queue.
+ * A circular list of workers is used to choose the next worker.
+ */
+void
+_aio_req_add(aio_req_t *reqp, aio_worker_t **nextworker, int mode)
+{
+ ulwp_t *self = curthread;
+ aio_worker_t *aiowp;
+ aio_worker_t *first;
+ int load_bal_flg = 1;
+ int found;
+
+ ASSERT(reqp->req_state != AIO_REQ_DONEQ);
+ reqp->req_next = NULL;
+ /*
+ * Try to acquire the next worker's work queue. If it is locked,
+ * then search the list of workers until a queue is found unlocked,
+	 * or until the list is completely traversed, at which point another
+ * worker will be created.
+ */
+ sigoff(self); /* defer SIGIO */
+ sig_mutex_lock(&__aio_mutex);
+ first = aiowp = *nextworker;
+ if (mode != AIONOTIFY)
+ _aio_outstand_cnt++;
+ sig_mutex_unlock(&__aio_mutex);
+
+ switch (mode) {
+ case AIOREAD:
+ case AIOWRITE:
+ case AIOAREAD:
+ case AIOAWRITE:
+#if !defined(_LP64)
+ case AIOAREAD64:
+ case AIOAWRITE64:
+#endif
+ /* try to find an idle worker */
+ found = 0;
+ do {
+ if (sig_mutex_trylock(&aiowp->work_qlock1) == 0) {
+ if (aiowp->work_idleflg) {
+ found = 1;
+ break;
+ }
+ sig_mutex_unlock(&aiowp->work_qlock1);
+ }
+ } while ((aiowp = aiowp->work_forw) != first);
+
+ if (found) {
+ aiowp->work_minload1++;
+ break;
+ }
+
+ /* try to acquire some worker's queue lock */
+ do {
+ if (sig_mutex_trylock(&aiowp->work_qlock1) == 0) {
+ found = 1;
+ break;
+ }
+ } while ((aiowp = aiowp->work_forw) != first);
+
+ /*
+ * Create more workers when the workers appear overloaded.
+ * Either all the workers are busy draining their queues
+ * or no worker's queue lock could be acquired.
+ */
+ if (!found) {
+ if (_aio_worker_cnt < _max_workers) {
+ if (_aio_create_worker(reqp, mode))
+ aio_panic("_aio_req_add: add worker");
+ sigon(self); /* reenable SIGIO */
+ return;
+ }
+
+ /*
+			 * No worker is available and we have already created
+			 * _max_workers, so keep going through the
+			 * list slowly until we get a lock.
+ */
+ while (sig_mutex_trylock(&aiowp->work_qlock1) != 0) {
+ /*
+ * give someone else a chance
+ */
+ _aio_delay(1);
+ aiowp = aiowp->work_forw;
+ }
+ }
+
+ ASSERT(MUTEX_HELD(&aiowp->work_qlock1));
+ if (_aio_worker_cnt < _max_workers &&
+ aiowp->work_minload1 >= _minworkload) {
+ sig_mutex_unlock(&aiowp->work_qlock1);
+ sig_mutex_lock(&__aio_mutex);
+ *nextworker = aiowp->work_forw;
+ sig_mutex_unlock(&__aio_mutex);
+ if (_aio_create_worker(reqp, mode))
+ aio_panic("aio_req_add: add worker");
+ sigon(self); /* reenable SIGIO */
+ return;
+ }
+ aiowp->work_minload1++;
+ break;
+ case AIOFSYNC:
+ case AIONOTIFY:
+ load_bal_flg = 0;
+ sig_mutex_lock(&aiowp->work_qlock1);
+ break;
+ default:
+ aio_panic("_aio_req_add: invalid mode");
+ break;
+ }
+ /*
+ * Put request onto worker's work queue.
+ */
+ if (aiowp->work_tail1 == NULL) {
+ ASSERT(aiowp->work_count1 == 0);
+ aiowp->work_tail1 = reqp;
+ aiowp->work_next1 = reqp;
+ } else {
+ aiowp->work_head1->req_next = reqp;
+ if (aiowp->work_next1 == NULL)
+ aiowp->work_next1 = reqp;
+ }
+ reqp->req_state = AIO_REQ_QUEUED;
+ reqp->req_worker = aiowp;
+ aiowp->work_head1 = reqp;
+ /*
+ * Awaken worker if it is not currently active.
+ */
+ if (aiowp->work_count1++ == 0 && aiowp->work_idleflg) {
+ aiowp->work_idleflg = 0;
+ (void) cond_signal(&aiowp->work_idle_cv);
+ }
+ sig_mutex_unlock(&aiowp->work_qlock1);
+
+ if (load_bal_flg) {
+ sig_mutex_lock(&__aio_mutex);
+ *nextworker = aiowp->work_forw;
+ sig_mutex_unlock(&__aio_mutex);
+ }
+ sigon(self); /* reenable SIGIO */
+}
+
+/*
+ * Get an AIO request for a specified worker.
+ * If the work queue is empty, return NULL.
+ */
+aio_req_t *
+_aio_req_get(aio_worker_t *aiowp)
+{
+ aio_req_t *reqp;
+
+ sig_mutex_lock(&aiowp->work_qlock1);
+ if ((reqp = aiowp->work_next1) != NULL) {
+ /*
+ * Remove a POSIX request from the queue; the
+		 * request queue is a singly linked list
+		 * with a previous pointer. The request is
+		 * removed by updating the previous pointer.
+		 *
+		 * Non-Posix requests are left on the queue
+ * to eventually be placed on the done queue.
+ */
+
+ if (POSIX_AIO(reqp)) {
+ if (aiowp->work_prev1 == NULL) {
+ aiowp->work_tail1 = reqp->req_next;
+ if (aiowp->work_tail1 == NULL)
+ aiowp->work_head1 = NULL;
+ } else {
+ aiowp->work_prev1->req_next = reqp->req_next;
+ if (aiowp->work_head1 == reqp)
+ aiowp->work_head1 = reqp->req_next;
+ }
+
+ } else {
+ aiowp->work_prev1 = reqp;
+ ASSERT(aiowp->work_done1 >= 0);
+ aiowp->work_done1++;
+ }
+ ASSERT(reqp != reqp->req_next);
+ aiowp->work_next1 = reqp->req_next;
+ ASSERT(aiowp->work_count1 >= 1);
+ aiowp->work_count1--;
+ switch (reqp->req_op) {
+ case AIOREAD:
+ case AIOWRITE:
+ case AIOAREAD:
+ case AIOAWRITE:
+#if !defined(_LP64)
+ case AIOAREAD64:
+ case AIOAWRITE64:
+#endif
+ ASSERT(aiowp->work_minload1 > 0);
+ aiowp->work_minload1--;
+ break;
+ }
+ reqp->req_state = AIO_REQ_INPROGRESS;
+ }
+ aiowp->work_req = reqp;
+ ASSERT(reqp != NULL || aiowp->work_count1 == 0);
+ sig_mutex_unlock(&aiowp->work_qlock1);
+ return (reqp);
+}
+
+static void
+_aio_req_del(aio_worker_t *aiowp, aio_req_t *reqp, int ostate)
+{
+ aio_req_t **last;
+ aio_req_t *lastrp;
+ aio_req_t *next;
+
+ ASSERT(aiowp != NULL);
+ ASSERT(MUTEX_HELD(&aiowp->work_qlock1));
+ if (POSIX_AIO(reqp)) {
+ if (ostate != AIO_REQ_QUEUED)
+ return;
+ }
+ last = &aiowp->work_tail1;
+ lastrp = aiowp->work_tail1;
+ ASSERT(ostate == AIO_REQ_QUEUED || ostate == AIO_REQ_INPROGRESS);
+ while ((next = *last) != NULL) {
+ if (next == reqp) {
+ *last = next->req_next;
+ if (aiowp->work_next1 == next)
+ aiowp->work_next1 = next->req_next;
+
+ if ((next->req_next != NULL) ||
+ (aiowp->work_done1 == 0)) {
+ if (aiowp->work_head1 == next)
+ aiowp->work_head1 = next->req_next;
+ if (aiowp->work_prev1 == next)
+ aiowp->work_prev1 = next->req_next;
+ } else {
+ if (aiowp->work_head1 == next)
+ aiowp->work_head1 = lastrp;
+ if (aiowp->work_prev1 == next)
+ aiowp->work_prev1 = lastrp;
+ }
+
+ if (ostate == AIO_REQ_QUEUED) {
+ ASSERT(aiowp->work_count1 >= 1);
+ aiowp->work_count1--;
+ ASSERT(aiowp->work_minload1 >= 1);
+ aiowp->work_minload1--;
+ } else {
+ ASSERT(ostate == AIO_REQ_INPROGRESS &&
+ !POSIX_AIO(reqp));
+ aiowp->work_done1--;
+ }
+ return;
+ }
+ last = &next->req_next;
+ lastrp = next;
+ }
+ /* NOTREACHED */
+}
+
+static void
+_aio_enq_doneq(aio_req_t *reqp)
+{
+ if (_aio_doneq == NULL) {
+ _aio_doneq = reqp;
+ reqp->req_next = reqp->req_prev = reqp;
+ } else {
+ reqp->req_next = _aio_doneq;
+ reqp->req_prev = _aio_doneq->req_prev;
+ _aio_doneq->req_prev->req_next = reqp;
+ _aio_doneq->req_prev = reqp;
+ }
+ reqp->req_state = AIO_REQ_DONEQ;
+ _aio_doneq_cnt++;
+}
+
+/*
+ * caller owns the _aio_mutex
+ */
+aio_req_t *
+_aio_req_remove(aio_req_t *reqp)
+{
+ if (reqp && reqp->req_state != AIO_REQ_DONEQ)
+ return (NULL);
+
+ if (reqp) {
+ /* request in done queue */
+ if (_aio_doneq == reqp)
+ _aio_doneq = reqp->req_next;
+ if (_aio_doneq == reqp) {
+ /* only one request on queue */
+ _aio_doneq = NULL;
+ } else {
+ aio_req_t *tmp = reqp->req_next;
+ reqp->req_prev->req_next = tmp;
+ tmp->req_prev = reqp->req_prev;
+ }
+ } else if ((reqp = _aio_doneq) != NULL) {
+ if (reqp == reqp->req_next) {
+ /* only one request on queue */
+ _aio_doneq = NULL;
+ } else {
+ reqp->req_prev->req_next = _aio_doneq = reqp->req_next;
+ _aio_doneq->req_prev = reqp->req_prev;
+ }
+ }
+ if (reqp) {
+ _aio_doneq_cnt--;
+ reqp->req_next = reqp->req_prev = reqp;
+ reqp->req_state = AIO_REQ_DONE;
+ }
+ return (reqp);
+}
+
+/*
+ * An AIO request is identified by an aio_result_t pointer. The library
+ * maps this aio_result_t pointer to its internal representation using a
+ * hash table. This function adds an aio_result_t pointer to the hash table.
+ */
+static int
+_aio_hash_insert(aio_result_t *resultp, aio_req_t *reqp)
+{
+ aio_hash_t *hashp;
+ aio_req_t **prev;
+ aio_req_t *next;
+
+ hashp = _aio_hash + AIOHASH(resultp);
+ lmutex_lock(&hashp->hash_lock);
+ prev = &hashp->hash_ptr;
+ while ((next = *prev) != NULL) {
+ if (resultp == next->req_resultp) {
+ lmutex_unlock(&hashp->hash_lock);
+ return (-1);
+ }
+ prev = &next->req_link;
+ }
+ *prev = reqp;
+ ASSERT(reqp->req_link == NULL);
+ lmutex_unlock(&hashp->hash_lock);
+ return (0);
+}
+
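+/*
+ * For example, aio_read() (in posix_aio.c) rejects a duplicate request
+ * (EBUSY) by checking that _aio_hash_find(&aiocbp->aio_resultp)
+ * returns NULL, while _aio_rw() registers new requests via
+ * _aio_hash_insert(&aiocbp->aio_resultp, reqp).
+ */
+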
+/*
+ * Remove an entry from the hash table.
+ */
+aio_req_t *
+_aio_hash_del(aio_result_t *resultp)
+{
+ aio_hash_t *hashp;
+ aio_req_t **prev;
+ aio_req_t *next = NULL;
+
+ if (_aio_hash != NULL) {
+ hashp = _aio_hash + AIOHASH(resultp);
+ lmutex_lock(&hashp->hash_lock);
+ prev = &hashp->hash_ptr;
+ while ((next = *prev) != NULL) {
+ if (resultp == next->req_resultp) {
+ *prev = next->req_link;
+ next->req_link = NULL;
+ break;
+ }
+ prev = &next->req_link;
+ }
+ lmutex_unlock(&hashp->hash_lock);
+ }
+ return (next);
+}
+
+/*
+ * find an entry in the hash table
+ */
+aio_req_t *
+_aio_hash_find(aio_result_t *resultp)
+{
+ aio_hash_t *hashp;
+ aio_req_t **prev;
+ aio_req_t *next = NULL;
+
+ if (_aio_hash != NULL) {
+ hashp = _aio_hash + AIOHASH(resultp);
+ lmutex_lock(&hashp->hash_lock);
+ prev = &hashp->hash_ptr;
+ while ((next = *prev) != NULL) {
+ if (resultp == next->req_resultp)
+ break;
+ prev = &next->req_link;
+ }
+ lmutex_unlock(&hashp->hash_lock);
+ }
+ return (next);
+}
+
+/*
+ * AIO interface for POSIX
+ */
+int
+_aio_rw(aiocb_t *aiocbp, aio_lio_t *lio_head, aio_worker_t **nextworker,
+ int mode, int flg)
+{
+ aio_req_t *reqp;
+ aio_args_t *ap;
+ int kerr;
+
+ if (aiocbp == NULL) {
+ errno = EINVAL;
+ return (-1);
+ }
+
+ /* initialize kaio */
+ if (!_kaio_ok)
+ _kaio_init();
+
+ aiocbp->aio_state = NOCHECK;
+
+ /*
+ * If we have been called because a list I/O
+	 * kaio() failed, we don't want to repeat the
+	 * system call.
+ */
+
+ if (flg & AIO_KAIO) {
+ /*
+ * Try kernel aio first.
+ * If errno is ENOTSUP/EBADFD,
+ * fall back to the thread implementation.
+ */
+ if (_kaio_ok > 0 && KAIO_SUPPORTED(aiocbp->aio_fildes)) {
+ aiocbp->aio_resultp.aio_errno = EINPROGRESS;
+ aiocbp->aio_state = CHECK;
+ kerr = (int)_kaio(mode, aiocbp);
+ if (kerr == 0)
+ return (0);
+ if (errno != ENOTSUP && errno != EBADFD) {
+ aiocbp->aio_resultp.aio_errno = errno;
+ aiocbp->aio_resultp.aio_return = -1;
+ aiocbp->aio_state = NOCHECK;
+ return (-1);
+ }
+ if (errno == EBADFD)
+ SET_KAIO_NOT_SUPPORTED(aiocbp->aio_fildes);
+ }
+ }
+
+ aiocbp->aio_resultp.aio_errno = EINPROGRESS;
+ aiocbp->aio_state = USERAIO;
+
+ if (!__uaio_ok && __uaio_init() == -1)
+ return (-1);
+
+ if ((reqp = _aio_req_alloc()) == NULL) {
+ errno = EAGAIN;
+ return (-1);
+ }
+
+ /*
+ * If an LIO request, add the list head to the aio request
+ */
+ reqp->req_head = lio_head;
+ reqp->req_type = AIO_POSIX_REQ;
+ reqp->req_op = mode;
+ reqp->req_largefile = 0;
+
+ if (aiocbp->aio_sigevent.sigev_notify == SIGEV_NONE) {
+ reqp->req_sigevent.sigev_notify = SIGEV_NONE;
+ } else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
+ reqp->req_sigevent.sigev_notify = SIGEV_SIGNAL;
+ reqp->req_sigevent.sigev_signo =
+ aiocbp->aio_sigevent.sigev_signo;
+ reqp->req_sigevent.sigev_value.sival_ptr =
+ aiocbp->aio_sigevent.sigev_value.sival_ptr;
+ } else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_PORT) {
+ port_notify_t *pn = aiocbp->aio_sigevent.sigev_value.sival_ptr;
+ reqp->req_sigevent.sigev_notify = SIGEV_PORT;
+ /*
+ * Reuse the sigevent structure to contain the port number
+ * and the user value. Same for SIGEV_THREAD, below.
+ */
+ reqp->req_sigevent.sigev_signo =
+ pn->portnfy_port;
+ reqp->req_sigevent.sigev_value.sival_ptr =
+ pn->portnfy_user;
+ } else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_THREAD) {
+ reqp->req_sigevent.sigev_notify = SIGEV_THREAD;
+ /*
+ * The sigevent structure contains the port number
+ * and the user value. Same for SIGEV_PORT, above.
+ */
+ reqp->req_sigevent.sigev_signo =
+ aiocbp->aio_sigevent.sigev_signo;
+ reqp->req_sigevent.sigev_value.sival_ptr =
+ aiocbp->aio_sigevent.sigev_value.sival_ptr;
+ }
+
+ reqp->req_resultp = &aiocbp->aio_resultp;
+ reqp->req_aiocbp = aiocbp;
+ ap = &reqp->req_args;
+ ap->fd = aiocbp->aio_fildes;
+ ap->buf = (caddr_t)aiocbp->aio_buf;
+ ap->bufsz = aiocbp->aio_nbytes;
+ ap->offset = aiocbp->aio_offset;
+
+ if ((flg & AIO_NO_DUPS) &&
+ _aio_hash_insert(&aiocbp->aio_resultp, reqp) != 0) {
+ aio_panic("_aio_rw(): request already in hash table");
+ _aio_req_free(reqp);
+ errno = EINVAL;
+ return (-1);
+ }
+ _aio_req_add(reqp, nextworker, mode);
+ return (0);
+}
+
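+/*
+ * Typical call (a hedged sketch; the exact flag combinations live in
+ * the POSIX entry points): aio_read() presumably funnels through here
+ * as something like
+ *
+ *	_aio_rw(aiocbp, NULL, &__nextworker_rw, AIOAREAD,
+ *	    (AIO_KAIO | AIO_NO_DUPS));
+ */
+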
+#if !defined(_LP64)
+/*
+ * 64-bit AIO interface for POSIX
+ */
+int
+_aio_rw64(aiocb64_t *aiocbp, aio_lio_t *lio_head, aio_worker_t **nextworker,
+ int mode, int flg)
+{
+ aio_req_t *reqp;
+ aio_args_t *ap;
+ int kerr;
+
+ if (aiocbp == NULL) {
+ errno = EINVAL;
+ return (-1);
+ }
+
+ /* initialize kaio */
+ if (!_kaio_ok)
+ _kaio_init();
+
+ aiocbp->aio_state = NOCHECK;
+
+ /*
+ * If we have been called because a list I/O
+	 * kaio() failed, we don't want to repeat the
+	 * system call.
+ */
+
+ if (flg & AIO_KAIO) {
+ /*
+ * Try kernel aio first.
+ * If errno is ENOTSUP/EBADFD,
+ * fall back to the thread implementation.
+ */
+ if (_kaio_ok > 0 && KAIO_SUPPORTED(aiocbp->aio_fildes)) {
+ aiocbp->aio_resultp.aio_errno = EINPROGRESS;
+ aiocbp->aio_state = CHECK;
+ kerr = (int)_kaio(mode, aiocbp);
+ if (kerr == 0)
+ return (0);
+ if (errno != ENOTSUP && errno != EBADFD) {
+ aiocbp->aio_resultp.aio_errno = errno;
+ aiocbp->aio_resultp.aio_return = -1;
+ aiocbp->aio_state = NOCHECK;
+ return (-1);
+ }
+ if (errno == EBADFD)
+ SET_KAIO_NOT_SUPPORTED(aiocbp->aio_fildes);
+ }
+ }
+
+ aiocbp->aio_resultp.aio_errno = EINPROGRESS;
+ aiocbp->aio_state = USERAIO;
+
+ if (!__uaio_ok && __uaio_init() == -1)
+ return (-1);
+
+ if ((reqp = _aio_req_alloc()) == NULL) {
+ errno = EAGAIN;
+ return (-1);
+ }
+
+ /*
+ * If an LIO request, add the list head to the aio request
+ */
+ reqp->req_head = lio_head;
+ reqp->req_type = AIO_POSIX_REQ;
+ reqp->req_op = mode;
+ reqp->req_largefile = 1;
+
+ if (aiocbp->aio_sigevent.sigev_notify == SIGEV_NONE) {
+ reqp->req_sigevent.sigev_notify = SIGEV_NONE;
+ } else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
+ reqp->req_sigevent.sigev_notify = SIGEV_SIGNAL;
+ reqp->req_sigevent.sigev_signo =
+ aiocbp->aio_sigevent.sigev_signo;
+ reqp->req_sigevent.sigev_value.sival_ptr =
+ aiocbp->aio_sigevent.sigev_value.sival_ptr;
+ } else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_PORT) {
+ port_notify_t *pn = aiocbp->aio_sigevent.sigev_value.sival_ptr;
+ reqp->req_sigevent.sigev_notify = SIGEV_PORT;
+ reqp->req_sigevent.sigev_signo =
+ pn->portnfy_port;
+ reqp->req_sigevent.sigev_value.sival_ptr =
+ pn->portnfy_user;
+ } else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_THREAD) {
+ reqp->req_sigevent.sigev_notify = SIGEV_THREAD;
+ reqp->req_sigevent.sigev_signo =
+ aiocbp->aio_sigevent.sigev_signo;
+ reqp->req_sigevent.sigev_value.sival_ptr =
+ aiocbp->aio_sigevent.sigev_value.sival_ptr;
+ }
+
+ reqp->req_resultp = &aiocbp->aio_resultp;
+ reqp->req_aiocbp = aiocbp;
+ ap = &reqp->req_args;
+ ap->fd = aiocbp->aio_fildes;
+ ap->buf = (caddr_t)aiocbp->aio_buf;
+ ap->bufsz = aiocbp->aio_nbytes;
+ ap->offset = aiocbp->aio_offset;
+
+ if ((flg & AIO_NO_DUPS) &&
+ _aio_hash_insert(&aiocbp->aio_resultp, reqp) != 0) {
+ aio_panic("_aio_rw64(): request already in hash table");
+ _aio_req_free(reqp);
+ errno = EINVAL;
+ return (-1);
+ }
+ _aio_req_add(reqp, nextworker, mode);
+ return (0);
+}
+#endif /* !defined(_LP64) */
diff --git a/usr/src/lib/libc/port/aio/aio_alloc.c b/usr/src/lib/libc/port/aio/aio_alloc.c
new file mode 100644
index 0000000000..db919872e4
--- /dev/null
+++ b/usr/src/lib/libc/port/aio/aio_alloc.c
@@ -0,0 +1,435 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include "synonyms.h"
+#include "thr_uberdata.h"
+#include "asyncio.h"
+
+/*
+ * The aio subsystem memory allocation strategy:
+ *
+ * For each of the structure types we wish to allocate/free
+ * (aio_worker_t, aio_req_t, aio_lio_t), we use mmap() to allocate
+ * chunks of memory which are then subdivided into individual
+ * elements which are put into a free list from which allocations
+ * are made and to which frees are returned.
+ *
+ * Chunks start small (8 Kbytes) and get larger (size doubling)
+ * as more chunks are needed. This keeps memory usage small for
+ * light use and fragmentation small for heavy use.
+ *
+ * Chunks are never unmapped except as an aftermath of fork()
+ * in the child process, when they are all unmapped (because
+ * all of the worker threads disappear in the child).
+ */
+
+#define INITIAL_CHUNKSIZE (8 * 1024)
+
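+/*
+ * With the size-doubling strategy described above, successive chunks
+ * for a given structure type are 8K, 16K, 32K, ... bytes, and each
+ * chunk yields (chunksize - sizeof (chunk_t)) / sizeof (element)
+ * free-list entries.
+ */
+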
+/*
+ * The header structure for each chunk.
+ * A pointer and a size_t ensures proper alignment for whatever follows.
+ */
+typedef struct chunk {
+ struct chunk *chunk_next; /* linked list */
+ size_t chunk_size; /* size of this chunk */
+} chunk_t;
+
+chunk_t *chunk_list = NULL; /* list of all chunks */
+mutex_t chunk_lock = DEFAULTMUTEX;
+
+chunk_t *
+chunk_alloc(size_t size)
+{
+ chunk_t *chp = NULL;
+ void *ptr;
+
+ ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANON, -1, (off_t)0);
+ if (ptr != MAP_FAILED) {
+ lmutex_lock(&chunk_lock);
+ chp = ptr;
+ chp->chunk_next = chunk_list;
+ chunk_list = chp;
+ chp->chunk_size = size;
+ lmutex_unlock(&chunk_lock);
+ }
+
+ return (chp);
+}
+
+aio_worker_t *worker_freelist = NULL; /* free list of worker structures */
+aio_worker_t *worker_freelast = NULL;
+size_t worker_chunksize = 0;
+mutex_t worker_lock = DEFAULTMUTEX;
+
+/*
+ * Allocate a worker control block.
+ */
+aio_worker_t *
+_aio_worker_alloc(void)
+{
+ aio_worker_t *aiowp;
+ chunk_t *chp;
+ size_t chunksize;
+ int nelem;
+ int i;
+
+ lmutex_lock(&worker_lock);
+ if ((aiowp = worker_freelist) == NULL) {
+ if ((chunksize = 2 * worker_chunksize) == 0)
+ chunksize = INITIAL_CHUNKSIZE;
+ if ((chp = chunk_alloc(chunksize)) == NULL) {
+ lmutex_unlock(&worker_lock);
+ return (NULL);
+ }
+ worker_chunksize = chunksize;
+ worker_freelist = (aio_worker_t *)(uintptr_t)(chp + 1);
+ nelem = (chunksize - sizeof (chunk_t)) / sizeof (aio_worker_t);
+ for (i = 0, aiowp = worker_freelist; i < nelem; i++, aiowp++)
+ aiowp->work_forw = aiowp + 1;
+ worker_freelast = aiowp - 1;
+ worker_freelast->work_forw = NULL;
+ aiowp = worker_freelist;
+ }
+ if ((worker_freelist = aiowp->work_forw) == NULL)
+ worker_freelast = NULL;
+ lmutex_unlock(&worker_lock);
+
+ aiowp->work_forw = NULL;
+ (void) mutex_init(&aiowp->work_qlock1, USYNC_THREAD, NULL);
+ (void) cond_init(&aiowp->work_idle_cv, USYNC_THREAD, NULL);
+
+ return (aiowp);
+}
+
+/*
+ * Free a worker control block.
+ * Declared with void *arg so it can be a pthread_key_create() destructor.
+ */
+void
+_aio_worker_free(void *arg)
+{
+ aio_worker_t *aiowp = arg;
+
+ (void) mutex_destroy(&aiowp->work_qlock1);
+ (void) cond_destroy(&aiowp->work_idle_cv);
+ (void) memset(aiowp, 0, sizeof (*aiowp));
+
+ lmutex_lock(&worker_lock);
+ if (worker_freelast == NULL) {
+ worker_freelist = worker_freelast = aiowp;
+ } else {
+ worker_freelast->work_forw = aiowp;
+ worker_freelast = aiowp;
+ }
+ lmutex_unlock(&worker_lock);
+}
+
+aio_req_t *_aio_freelist = NULL; /* free list of request structures */
+aio_req_t *_aio_freelast = NULL;
+size_t request_chunksize = 0;
+int _aio_freelist_cnt = 0;
+int _aio_allocated_cnt = 0;
+mutex_t __aio_cache_lock = DEFAULTMUTEX;
+
+/*
+ * Allocate an aio request structure.
+ */
+aio_req_t *
+_aio_req_alloc(void)
+{
+ aio_req_t *reqp;
+ chunk_t *chp;
+ size_t chunksize;
+ int nelem;
+ int i;
+
+ lmutex_lock(&__aio_cache_lock);
+ if ((reqp = _aio_freelist) == NULL) {
+ if ((chunksize = 2 * request_chunksize) == 0)
+ chunksize = INITIAL_CHUNKSIZE;
+ if ((chp = chunk_alloc(chunksize)) == NULL) {
+ lmutex_unlock(&__aio_cache_lock);
+ return (NULL);
+ }
+ request_chunksize = chunksize;
+ _aio_freelist = (aio_req_t *)(uintptr_t)(chp + 1);
+ nelem = (chunksize - sizeof (chunk_t)) / sizeof (aio_req_t);
+ for (i = 0, reqp = _aio_freelist; i < nelem; i++, reqp++) {
+ reqp->req_state = AIO_REQ_FREE;
+ reqp->req_link = reqp + 1;
+ }
+ _aio_freelast = reqp - 1;
+ _aio_freelast->req_link = NULL;
+ _aio_freelist_cnt = nelem;
+ reqp = _aio_freelist;
+ }
+ if ((_aio_freelist = reqp->req_link) == NULL)
+ _aio_freelast = NULL;
+ _aio_freelist_cnt--;
+ _aio_allocated_cnt++;
+ lmutex_unlock(&__aio_cache_lock);
+
+ ASSERT(reqp->req_state == AIO_REQ_FREE);
+ reqp->req_state = 0;
+ reqp->req_link = NULL;
+ reqp->req_sigevent.sigev_notify = SIGEV_NONE;
+
+ return (reqp);
+}
+
+/*
+ * Free an aio request structure.
+ */
+void
+_aio_req_free(aio_req_t *reqp)
+{
+ ASSERT(reqp->req_state != AIO_REQ_FREE &&
+ reqp->req_state != AIO_REQ_DONEQ);
+ (void) memset(reqp, 0, sizeof (*reqp));
+ reqp->req_state = AIO_REQ_FREE;
+
+ lmutex_lock(&__aio_cache_lock);
+ if (_aio_freelast == NULL) {
+ _aio_freelist = _aio_freelast = reqp;
+ } else {
+ _aio_freelast->req_link = reqp;
+ _aio_freelast = reqp;
+ }
+ _aio_freelist_cnt++;
+ _aio_allocated_cnt--;
+ lmutex_unlock(&__aio_cache_lock);
+}
+
+aio_lio_t *_lio_head_freelist = NULL; /* free list of lio head structures */
+aio_lio_t *_lio_head_freelast = NULL;
+size_t lio_head_chunksize = 0;
+int _lio_alloc = 0;
+int _lio_free = 0;
+mutex_t __lio_mutex = DEFAULTMUTEX;
+
+/*
+ * Allocate a listio head structure.
+ */
+aio_lio_t *
+_aio_lio_alloc(void)
+{
+ aio_lio_t *head;
+ chunk_t *chp;
+ size_t chunksize;
+ int nelem;
+ int i;
+
+ lmutex_lock(&__lio_mutex);
+ if ((head = _lio_head_freelist) == NULL) {
+ if ((chunksize = 2 * lio_head_chunksize) == 0)
+ chunksize = INITIAL_CHUNKSIZE;
+ if ((chp = chunk_alloc(chunksize)) == NULL) {
+ lmutex_unlock(&__lio_mutex);
+ return (NULL);
+ }
+ lio_head_chunksize = chunksize;
+ _lio_head_freelist = (aio_lio_t *)(uintptr_t)(chp + 1);
+ nelem = (chunksize - sizeof (chunk_t)) / sizeof (aio_lio_t);
+ for (i = 0, head = _lio_head_freelist; i < nelem; i++, head++)
+ head->lio_next = head + 1;
+ _lio_head_freelast = head - 1;
+ _lio_head_freelast->lio_next = NULL;
+ _lio_alloc += nelem;
+ _lio_free = nelem;
+ head = _lio_head_freelist;
+ }
+ if ((_lio_head_freelist = head->lio_next) == NULL)
+ _lio_head_freelast = NULL;
+ _lio_free--;
+ lmutex_unlock(&__lio_mutex);
+
+ ASSERT(head->lio_nent == 0 && head->lio_refcnt == 0);
+ head->lio_next = NULL;
+ head->lio_port = -1;
+ (void) mutex_init(&head->lio_mutex, USYNC_THREAD, NULL);
+ (void) cond_init(&head->lio_cond_cv, USYNC_THREAD, NULL);
+
+ return (head);
+}
+
+/*
+ * Free a listio head structure.
+ */
+void
+_aio_lio_free(aio_lio_t *head)
+{
+ ASSERT(head->lio_nent == 0 && head->lio_refcnt == 0);
+ (void) mutex_destroy(&head->lio_mutex);
+ (void) cond_destroy(&head->lio_cond_cv);
+ (void) memset(head, 0, sizeof (*head));
+
+ lmutex_lock(&__lio_mutex);
+ if (_lio_head_freelast == NULL) {
+ _lio_head_freelist = _lio_head_freelast = head;
+ } else {
+ _lio_head_freelast->lio_next = head;
+ _lio_head_freelast = head;
+ }
+ _lio_free++;
+ lmutex_unlock(&__lio_mutex);
+}
+
+void
+postfork1_child_aio(void)
+{
+ chunk_t *chp;
+
+ /*
+ * All of the workers are gone; free their structures.
+ */
+ if (_kaio_supported != NULL) {
+ (void) munmap((void *)_kaio_supported,
+ MAX_KAIO_FDARRAY_SIZE * sizeof (uint32_t));
+ _kaio_supported = NULL;
+ }
+ if (_aio_hash != NULL) {
+ (void) munmap((void *)_aio_hash, HASHSZ * sizeof (aio_hash_t));
+ _aio_hash = NULL;
+ }
+ for (chp = chunk_list; chp != NULL; chp = chunk_list) {
+ chunk_list = chp->chunk_next;
+ (void) munmap((void *)chp, chp->chunk_size);
+ }
+
+ /*
+ * Reinitialize global variables
+ */
+
+ worker_freelist = NULL;
+ worker_freelast = NULL;
+ worker_chunksize = 0;
+ (void) mutex_init(&worker_lock, USYNC_THREAD, NULL);
+
+ _aio_freelist = NULL;
+ _aio_freelast = NULL;
+ request_chunksize = 0;
+ _aio_freelist_cnt = 0;
+ _aio_allocated_cnt = 0;
+ (void) mutex_init(&__aio_cache_lock, USYNC_THREAD, NULL);
+
+ _lio_head_freelist = NULL;
+ _lio_head_freelast = NULL;
+ lio_head_chunksize = 0;
+ _lio_alloc = 0;
+ _lio_free = 0;
+ (void) mutex_init(&__lio_mutex, USYNC_THREAD, NULL);
+
+ (void) mutex_init(&__aio_initlock, USYNC_THREAD, NULL);
+ (void) cond_init(&__aio_initcv, USYNC_THREAD, NULL);
+ __aio_initbusy = 0;
+
+ (void) mutex_init(&__aio_mutex, USYNC_THREAD, NULL);
+ (void) cond_init(&_aio_iowait_cv, USYNC_THREAD, NULL);
+ (void) cond_init(&_aio_waitn_cv, USYNC_THREAD, NULL);
+
+ _kaio_ok = 0;
+ __uaio_ok = 0;
+
+ _kaiowp = NULL;
+
+ __workers_rw = NULL;
+ __nextworker_rw = NULL;
+ __rw_workerscnt = 0;
+
+ __workers_no = NULL;
+ __nextworker_no = NULL;
+ __no_workerscnt = 0;
+
+ _aio_worker_cnt = 0;
+
+ _aio_done_head = NULL;
+ _aio_done_tail = NULL;
+ _aio_donecnt = 0;
+
+ _aio_doneq = NULL;
+ _aio_doneq_cnt = 0;
+
+ _aio_waitncnt = 0;
+ _aio_outstand_cnt = 0;
+ _kaio_outstand_cnt = 0;
+ _aio_req_done_cnt = 0;
+ _aio_kernel_suspend = 0;
+ _aio_suscv_cnt = 0;
+
+ _aiowait_flag = 0;
+ _aio_flags = 0;
+}
+
+#define DISPLAY(var) \
+ (void) fprintf(stderr, #var "\t= %d\n", var)
+
+static void
+_aio_exit_info(void)
+{
+ if ((_kaio_ok | __uaio_ok) == 0)
+ return;
+ (void) fprintf(stderr, "\n");
+ DISPLAY(_aio_freelist_cnt);
+ DISPLAY(_aio_allocated_cnt);
+ DISPLAY(_lio_alloc);
+ DISPLAY(_lio_free);
+ DISPLAY(__rw_workerscnt);
+ DISPLAY(__no_workerscnt);
+ DISPLAY(_aio_worker_cnt);
+ DISPLAY(_aio_donecnt);
+ DISPLAY(_aio_doneq_cnt);
+ DISPLAY(_aio_waitncnt);
+ DISPLAY(_aio_outstand_cnt);
+ DISPLAY(_kaio_outstand_cnt);
+ DISPLAY(_aio_req_done_cnt);
+ DISPLAY(_aio_kernel_suspend);
+ DISPLAY(_aio_suscv_cnt);
+ DISPLAY(_aiowait_flag);
+ DISPLAY(_aio_flags);
+}
+
+void
+init_aio(void)
+{
+ char *str;
+
+ (void) pthread_key_create(&_aio_key, _aio_worker_free);
+ if ((str = getenv("_AIO_MIN_WORKERS")) != NULL) {
+ if ((_min_workers = atoi(str)) <= 0)
+ _min_workers = 4;
+ }
+ if ((str = getenv("_AIO_MAX_WORKERS")) != NULL) {
+ if ((_max_workers = atoi(str)) <= 0)
+ _max_workers = 256;
+ if (_max_workers < _min_workers + 1)
+ _max_workers = _min_workers + 1;
+ }
+ if ((str = getenv("_AIO_EXIT_INFO")) != NULL && atoi(str) != 0)
+ (void) atexit(_aio_exit_info);
+}
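
As a usage sketch (the values are invented for illustration), the worker pool can be tuned from the environment before the process starts:

	_AIO_MIN_WORKERS=8 _AIO_MAX_WORKERS=64 _AIO_EXIT_INFO=1 ./myapp

This raises the worker floor and ceiling and, at exit, prints the counters listed in _aio_exit_info() above.
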
diff --git a/usr/src/lib/libc/port/aio/posix_aio.c b/usr/src/lib/libc/port/aio/posix_aio.c
new file mode 100644
index 0000000000..5e3c3ac41d
--- /dev/null
+++ b/usr/src/lib/libc/port/aio/posix_aio.c
@@ -0,0 +1,1758 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * posix_aio.c implements the POSIX async. I/O functions.
+ *
+ * aio_read
+ * aio_write
+ * aio_error
+ * aio_return
+ * aio_suspend
+ * lio_listio
+ * aio_fsync
+ * aio_cancel
+ */
+
+#include "synonyms.h"
+#include "thr_uberdata.h"
+#include "asyncio.h"
+#include <atomic.h>
+#include <sys/file.h>
+#include <sys/port.h>
+
+extern int __fdsync(int, int);
+
+cond_t _aio_waitn_cv = DEFAULTCV; /* wait for end of aio_waitn */
+
+static int _aio_check_timeout(const timespec_t *, timespec_t *, int *);
+
+/* defines for timedwait in __aio_waitn() and __aio_suspend() */
+#define AIO_TIMEOUT_INDEF -1
+#define AIO_TIMEOUT_POLL 0
+#define AIO_TIMEOUT_WAIT 1
+#define AIO_TIMEOUT_UNDEF 2
+
+/*
+ * List I/O stuff
+ */
+static void _lio_list_decr(aio_lio_t *);
+static long aio_list_max = 0;
+
+int
+aio_read(aiocb_t *aiocbp)
+{
+ if (aiocbp == NULL || aiocbp->aio_reqprio < 0) {
+ errno = EINVAL;
+ return (-1);
+ }
+ if (_aio_hash_find(&aiocbp->aio_resultp) != NULL) {
+ errno = EBUSY;
+ return (-1);
+ }
+ if (_aio_sigev_thread(aiocbp) != 0)
+ return (-1);
+ aiocbp->aio_lio_opcode = LIO_READ;
+ return (_aio_rw(aiocbp, NULL, &__nextworker_rw, AIOAREAD,
+ (AIO_KAIO | AIO_NO_DUPS)));
+}
+
+int
+aio_write(aiocb_t *aiocbp)
+{
+ if (aiocbp == NULL || aiocbp->aio_reqprio < 0) {
+ errno = EINVAL;
+ return (-1);
+ }
+ if (_aio_hash_find(&aiocbp->aio_resultp) != NULL) {
+ errno = EBUSY;
+ return (-1);
+ }
+ if (_aio_sigev_thread(aiocbp) != 0)
+ return (-1);
+ aiocbp->aio_lio_opcode = LIO_WRITE;
+ return (_aio_rw(aiocbp, NULL, &__nextworker_rw, AIOAWRITE,
+ (AIO_KAIO | AIO_NO_DUPS)));
+}
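
A minimal caller-side sketch of the submission path above, with an invented file name and buffer size; production code would block in aio_suspend() rather than spin on aio_error():

	#include <aio.h>
	#include <fcntl.h>
	#include <errno.h>
	#include <string.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		static char buf[512];
		struct aiocb cb;
		int fd = open("/tmp/data", O_RDONLY);	/* hypothetical path */

		if (fd < 0)
			return (1);
		(void) memset(&cb, 0, sizeof (cb));
		cb.aio_fildes = fd;
		cb.aio_buf = buf;
		cb.aio_nbytes = sizeof (buf);
		cb.aio_offset = 0;

		if (aio_read(&cb) != 0) {
			perror("aio_read");
			return (1);
		}
		while (aio_error(&cb) == EINPROGRESS)
			;	/* real code would aio_suspend() instead */
		(void) printf("read %ld bytes\n", (long)aio_return(&cb));
		(void) close(fd);
		return (0);
	}
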
+
+/*
+ * __lio_listio() cancellation handler.
+ */
+/* ARGSUSED */
+static void
+_lio_listio_cleanup(aio_lio_t *head)
+{
+ int freeit = 0;
+
+ ASSERT(MUTEX_HELD(&head->lio_mutex));
+ if (head->lio_refcnt == 0) {
+ ASSERT(head->lio_nent == 0);
+ freeit = 1;
+ }
+ head->lio_waiting = 0;
+ sig_mutex_unlock(&head->lio_mutex);
+ if (freeit)
+ _aio_lio_free(head);
+}
+
+int
+lio_listio(int mode, aiocb_t *_RESTRICT_KYWD const *_RESTRICT_KYWD list,
+ int nent, struct sigevent *_RESTRICT_KYWD sigevp)
+{
+ int aio_ufs = 0;
+ int oerrno = 0;
+ aio_lio_t *head = NULL;
+ aiocb_t *aiocbp;
+ int state = 0;
+ int EIOflg = 0;
+ int rw;
+ int do_kaio = 0;
+ int error;
+ int i;
+
+ if (!_kaio_ok)
+ _kaio_init();
+
+ if (aio_list_max == 0)
+ aio_list_max = sysconf(_SC_AIO_LISTIO_MAX);
+
+ if (nent <= 0 || nent > aio_list_max) {
+ errno = EINVAL;
+ return (-1);
+ }
+
+ switch (mode) {
+ case LIO_WAIT:
+ state = NOCHECK;
+ break;
+ case LIO_NOWAIT:
+ state = CHECK;
+ break;
+ default:
+ errno = EINVAL;
+ return (-1);
+ }
+
+ for (i = 0; i < nent; i++) {
+ if ((aiocbp = list[i]) == NULL)
+ continue;
+ if (_aio_hash_find(&aiocbp->aio_resultp) != NULL) {
+ errno = EBUSY;
+ return (-1);
+ }
+ if (_aio_sigev_thread(aiocbp) != 0)
+ return (-1);
+ if (aiocbp->aio_lio_opcode == LIO_NOP)
+ aiocbp->aio_state = NOCHECK;
+ else {
+ aiocbp->aio_state = state;
+ if (KAIO_SUPPORTED(aiocbp->aio_fildes))
+ do_kaio++;
+ else
+ aiocbp->aio_resultp.aio_errno = ENOTSUP;
+ }
+ }
+ if (_aio_sigev_thread_init(sigevp) != 0)
+ return (-1);
+
+ if (do_kaio) {
+ error = (int)_kaio(AIOLIO, mode, list, nent, sigevp);
+ if (error == 0)
+ return (0);
+ oerrno = errno;
+ } else {
+ oerrno = errno = ENOTSUP;
+ error = -1;
+ }
+
+ if (error == -1 && errno == ENOTSUP) {
+ error = errno = 0;
+ /*
+ * If LIO_WAIT, or notification required, allocate a list head.
+ */
+ if (mode == LIO_WAIT ||
+ (sigevp != NULL &&
+ (sigevp->sigev_notify == SIGEV_SIGNAL ||
+ sigevp->sigev_notify == SIGEV_THREAD ||
+ sigevp->sigev_notify == SIGEV_PORT)))
+ head = _aio_lio_alloc();
+ if (head) {
+ sig_mutex_lock(&head->lio_mutex);
+ head->lio_mode = mode;
+ head->lio_largefile = 0;
+ if (mode == LIO_NOWAIT && sigevp != NULL) {
+ if (sigevp->sigev_notify == SIGEV_THREAD) {
+ head->lio_port = sigevp->sigev_signo;
+ head->lio_event = AIOLIO;
+ head->lio_sigevent = sigevp;
+ head->lio_sigval.sival_ptr =
+ sigevp->sigev_value.sival_ptr;
+ } else if (sigevp->sigev_notify == SIGEV_PORT) {
+ port_notify_t *pn =
+ sigevp->sigev_value.sival_ptr;
+ head->lio_port = pn->portnfy_port;
+ head->lio_event = AIOLIO;
+ head->lio_sigevent = sigevp;
+ head->lio_sigval.sival_ptr =
+ pn->portnfy_user;
+ } else { /* SIGEV_SIGNAL */
+ head->lio_signo = sigevp->sigev_signo;
+ head->lio_sigval.sival_ptr =
+ sigevp->sigev_value.sival_ptr;
+ }
+ }
+ head->lio_nent = head->lio_refcnt = nent;
+ sig_mutex_unlock(&head->lio_mutex);
+ }
+ /*
+		 * Find the UFS requests (errno == ENOTSUP/EBADFD).
+ */
+ for (i = 0; i < nent; i++) {
+ if ((aiocbp = list[i]) == NULL ||
+ aiocbp->aio_lio_opcode == LIO_NOP ||
+ (aiocbp->aio_resultp.aio_errno != ENOTSUP &&
+ aiocbp->aio_resultp.aio_errno != EBADFD)) {
+ if (head)
+ _lio_list_decr(head);
+ continue;
+ }
+ if (aiocbp->aio_resultp.aio_errno == EBADFD)
+ SET_KAIO_NOT_SUPPORTED(aiocbp->aio_fildes);
+ if (aiocbp->aio_reqprio < 0) {
+ aiocbp->aio_resultp.aio_errno = EINVAL;
+ aiocbp->aio_resultp.aio_return = -1;
+ EIOflg = 1;
+ if (head)
+ _lio_list_decr(head);
+ continue;
+ }
+ /*
+ * submit an AIO request with flags AIO_NO_KAIO
+ * to avoid the kaio() syscall in _aio_rw()
+ */
+ switch (aiocbp->aio_lio_opcode) {
+ case LIO_READ:
+ rw = AIOAREAD;
+ break;
+ case LIO_WRITE:
+ rw = AIOAWRITE;
+ break;
+ }
+ error = _aio_rw(aiocbp, head, &__nextworker_rw, rw,
+ (AIO_NO_KAIO | AIO_NO_DUPS));
+ if (error == 0)
+ aio_ufs++;
+ else {
+ if (head)
+ _lio_list_decr(head);
+ aiocbp->aio_resultp.aio_errno = error;
+ EIOflg = 1;
+ }
+ }
+ }
+ if (EIOflg) {
+ errno = EIO;
+ return (-1);
+ }
+ if (mode == LIO_WAIT && oerrno == ENOTSUP) {
+ /*
+ * call kaio(AIOLIOWAIT) to get all outstanding
+ * kernel AIO requests
+ */
+ if ((nent - aio_ufs) > 0)
+ (void) _kaio(AIOLIOWAIT, mode, list, nent, sigevp);
+ if (head != NULL && head->lio_nent > 0) {
+ sig_mutex_lock(&head->lio_mutex);
+ while (head->lio_refcnt > 0) {
+ int err;
+ head->lio_waiting = 1;
+ pthread_cleanup_push(_lio_listio_cleanup, head);
+ err = sig_cond_wait(&head->lio_cond_cv,
+ &head->lio_mutex);
+ pthread_cleanup_pop(0);
+ head->lio_waiting = 0;
+ if (err && head->lio_nent > 0) {
+ sig_mutex_unlock(&head->lio_mutex);
+ errno = err;
+ return (-1);
+ }
+ }
+ sig_mutex_unlock(&head->lio_mutex);
+ ASSERT(head->lio_nent == 0 && head->lio_refcnt == 0);
+ _aio_lio_free(head);
+ for (i = 0; i < nent; i++) {
+ if ((aiocbp = list[i]) != NULL &&
+ aiocbp->aio_resultp.aio_errno) {
+ errno = EIO;
+ return (-1);
+ }
+ }
+ }
+ return (0);
+ }
+ return (error);
+}
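
To make the flow above concrete, a hedged caller sketch: two reads batched through lio_listio() in LIO_WAIT mode, so the call returns only after both requests complete (offsets and sizes invented):

	#include <aio.h>
	#include <string.h>

	int
	submit_pair(int fd)
	{
		static char b0[256], b1[256];
		struct aiocb cb0, cb1;
		struct aiocb *list[2] = { &cb0, &cb1 };

		(void) memset(&cb0, 0, sizeof (cb0));
		cb0.aio_fildes = fd;
		cb0.aio_buf = b0;
		cb0.aio_nbytes = sizeof (b0);
		cb0.aio_offset = 0;
		cb0.aio_lio_opcode = LIO_READ;

		cb1 = cb0;
		cb1.aio_buf = b1;
		cb1.aio_offset = sizeof (b0);
		cb1.aio_lio_opcode = LIO_READ;

		/* LIO_WAIT: returns only when both requests are done */
		return (lio_listio(LIO_WAIT, list, 2, NULL));
	}
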
+
+static void
+_lio_list_decr(aio_lio_t *head)
+{
+ sig_mutex_lock(&head->lio_mutex);
+ head->lio_nent--;
+ head->lio_refcnt--;
+ sig_mutex_unlock(&head->lio_mutex);
+}
+
+/*
+ * __aio_suspend() cancellation handler.
+ */
+/* ARGSUSED */
+static void
+_aio_suspend_cleanup(int *counter)
+{
+ ASSERT(MUTEX_HELD(&__aio_mutex));
+ (*counter)--; /* _aio_kernel_suspend or _aio_suscv_cnt */
+ sig_mutex_unlock(&__aio_mutex);
+}
+
+static int
+__aio_suspend(void **list, int nent, const timespec_t *timo, int largefile)
+{
+ int cv_err; /* error code from cond_xxx() */
+ int kerr; /* error code from _kaio(AIOSUSPEND) */
+ int i;
+ timespec_t twait; /* copy of timo for internal calculations */
+ timespec_t *wait = NULL;
+ int timedwait;
+ int req_outstanding;
+ aiocb_t **listp;
+ aiocb_t *aiocbp;
+#if !defined(_LP64)
+ aiocb64_t **listp64;
+ aiocb64_t *aiocbp64;
+#endif
+ hrtime_t hrtstart;
+ hrtime_t hrtend;
+ hrtime_t hrtres;
+
+#if defined(_LP64)
+ if (largefile)
+ aio_panic("__aio_suspend: largefile set when _LP64 defined");
+#endif
+
+ if (nent <= 0) {
+ errno = EINVAL;
+ return (-1);
+ }
+
+ if (timo) {
+ if (timo->tv_sec < 0 || timo->tv_nsec < 0 ||
+ timo->tv_nsec >= NANOSEC) {
+ errno = EINVAL;
+ return (-1);
+ }
+ /* Initialize start time if time monitoring desired */
+ if (timo->tv_sec > 0 || timo->tv_nsec > 0) {
+ timedwait = AIO_TIMEOUT_WAIT;
+ hrtstart = gethrtime();
+ } else {
+ /* content of timeout = 0 : polling */
+ timedwait = AIO_TIMEOUT_POLL;
+ }
+ } else {
+ /* timeout pointer = NULL : wait indefinitely */
+ timedwait = AIO_TIMEOUT_INDEF;
+ }
+
+#if !defined(_LP64)
+ if (largefile) {
+ listp64 = (aiocb64_t **)list;
+ for (i = 0; i < nent; i++) {
+ if ((aiocbp64 = listp64[i]) != NULL &&
+ aiocbp64->aio_state == CHECK)
+ aiocbp64->aio_state = CHECKED;
+ }
+ } else
+#endif /* !_LP64 */
+ {
+ listp = (aiocb_t **)list;
+ for (i = 0; i < nent; i++) {
+ if ((aiocbp = listp[i]) != NULL &&
+ aiocbp->aio_state == CHECK)
+ aiocbp->aio_state = CHECKED;
+ }
+ }
+
+ sig_mutex_lock(&__aio_mutex);
+
+ /*
+	 * The next if-case is required to accelerate access
+	 * to completed RAW-IO requests.
+ */
+ if ((_aio_doneq_cnt + _aio_outstand_cnt) == 0) {
+ /* Only kernel requests pending */
+
+ /*
+ * _aio_kernel_suspend is used to detect completed non RAW-IO
+ * requests.
+ * As long as this thread resides in the kernel (_kaio) further
+ * asynchronous non RAW-IO requests could be submitted.
+ */
+ _aio_kernel_suspend++;
+
+ /*
+ * Always do the kaio() call without using the KAIO_SUPPORTED()
+ * checks because it is not mandatory to have a valid fd
+ * set in the list entries, only the resultp must be set.
+ *
+		 * _kaio(AIOSUSPEND ...) return values:
+		 * 0: everything OK, a completed request was found
+		 * -1: error
+		 * 1: no error: _aiodone awakened the _kaio(AIOSUSPEND,,)
+		 *    system call using _kaio(AIONOTIFY). It means that some
+		 *    non RAW-IOs completed in between.
+ */
+
+ pthread_cleanup_push(_aio_suspend_cleanup,
+ &_aio_kernel_suspend);
+ pthread_cleanup_push(sig_mutex_lock, &__aio_mutex);
+ sig_mutex_unlock(&__aio_mutex);
+ _cancel_prologue();
+ kerr = (int)_kaio(largefile? AIOSUSPEND64 : AIOSUSPEND,
+ list, nent, timo, -1);
+ _cancel_epilogue();
+ pthread_cleanup_pop(1); /* sig_mutex_lock(&__aio_mutex) */
+ pthread_cleanup_pop(0);
+
+ _aio_kernel_suspend--;
+
+ if (!kerr) {
+ sig_mutex_unlock(&__aio_mutex);
+ return (0);
+ }
+ } else {
+ kerr = 1; /* simulation: _kaio detected AIONOTIFY */
+ }
+
+ /*
+ * Return kernel error code if no other IOs are outstanding.
+ */
+ req_outstanding = _aio_doneq_cnt + _aio_outstand_cnt;
+
+ sig_mutex_unlock(&__aio_mutex);
+
+ if (req_outstanding == 0) {
+ /* no IOs outstanding in the thread pool */
+ if (kerr == 1)
+ /* return "no IOs completed" */
+ errno = EAGAIN;
+ return (-1);
+ }
+
+ /*
+ * IOs using the thread pool are outstanding.
+ */
+ if (timedwait == AIO_TIMEOUT_WAIT) {
+ /* time monitoring */
+ hrtend = hrtstart + (hrtime_t)timo->tv_sec * (hrtime_t)NANOSEC +
+ (hrtime_t)timo->tv_nsec;
+ hrtres = hrtend - gethrtime();
+ if (hrtres <= 0)
+ hrtres = 1;
+ twait.tv_sec = hrtres / (hrtime_t)NANOSEC;
+ twait.tv_nsec = hrtres % (hrtime_t)NANOSEC;
+ wait = &twait;
+ } else if (timedwait == AIO_TIMEOUT_POLL) {
+ twait = *timo; /* content of timo = 0 : polling */
+ wait = &twait;
+ }
+
+ for (;;) {
+ int error;
+ int inprogress;
+
+ /* first scan file system requests */
+ inprogress = 0;
+ for (i = 0; i < nent; i++) {
+#if !defined(_LP64)
+ if (largefile) {
+ if ((aiocbp64 = listp64[i]) == NULL)
+ continue;
+ error = aiocbp64->aio_resultp.aio_errno;
+ } else
+#endif
+ {
+ if ((aiocbp = listp[i]) == NULL)
+ continue;
+ error = aiocbp->aio_resultp.aio_errno;
+ }
+ if (error == EINPROGRESS)
+ inprogress = 1;
+ else if (error != ECANCELED) {
+ errno = 0;
+ return (0);
+ }
+ }
+
+ sig_mutex_lock(&__aio_mutex);
+
+ /*
+		 * If there are no outstanding I/Os in the thread pool then
+		 * we have to return here, provided that all kernel RAW-IOs
+		 * also completed.
+		 * If the kernel was notified to return, then we have to
+		 * check for possible pending RAW-IOs.
+ */
+ if (_aio_outstand_cnt == 0 && inprogress == 0 && kerr != 1) {
+ sig_mutex_unlock(&__aio_mutex);
+ errno = EAGAIN;
+ break;
+ }
+
+ /*
+ * There are outstanding IOs in the thread pool or the kernel
+ * was notified to return.
+ * Check pending RAW-IOs first.
+ */
+ if (kerr == 1) {
+ /*
+ * _aiodone just notified the kernel about
+ * completed non RAW-IOs (AIONOTIFY was detected).
+ */
+ if (timedwait == AIO_TIMEOUT_WAIT) {
+ /* Update remaining timeout for the kernel */
+ hrtres = hrtend - gethrtime();
+ if (hrtres <= 0) {
+ /* timer expired */
+ sig_mutex_unlock(&__aio_mutex);
+ errno = EAGAIN;
+ break;
+ }
+ wait->tv_sec = hrtres / (hrtime_t)NANOSEC;
+ wait->tv_nsec = hrtres % (hrtime_t)NANOSEC;
+ }
+ _aio_kernel_suspend++;
+
+ pthread_cleanup_push(_aio_suspend_cleanup,
+ &_aio_kernel_suspend);
+ pthread_cleanup_push(sig_mutex_lock, &__aio_mutex);
+ sig_mutex_unlock(&__aio_mutex);
+ _cancel_prologue();
+ kerr = (int)_kaio(largefile? AIOSUSPEND64 : AIOSUSPEND,
+ list, nent, wait, -1);
+ _cancel_epilogue();
+ pthread_cleanup_pop(1);
+ pthread_cleanup_pop(0);
+
+ _aio_kernel_suspend--;
+
+ if (!kerr) {
+ sig_mutex_unlock(&__aio_mutex);
+ return (0);
+ }
+ }
+
+ if (timedwait == AIO_TIMEOUT_POLL) {
+ sig_mutex_unlock(&__aio_mutex);
+ errno = EAGAIN;
+ break;
+ }
+
+ if (timedwait == AIO_TIMEOUT_WAIT) {
+ /* Update remaining timeout */
+ hrtres = hrtend - gethrtime();
+ if (hrtres <= 0) {
+ /* timer expired */
+ sig_mutex_unlock(&__aio_mutex);
+ errno = EAGAIN;
+ break;
+ }
+ wait->tv_sec = hrtres / (hrtime_t)NANOSEC;
+ wait->tv_nsec = hrtres % (hrtime_t)NANOSEC;
+ }
+
+ if (_aio_outstand_cnt == 0) {
+ sig_mutex_unlock(&__aio_mutex);
+ continue;
+ }
+
+ _aio_suscv_cnt++; /* ID for _aiodone (wake up) */
+
+ pthread_cleanup_push(_aio_suspend_cleanup, &_aio_suscv_cnt);
+ if (timedwait == AIO_TIMEOUT_WAIT) {
+ cv_err = sig_cond_reltimedwait(&_aio_iowait_cv,
+ &__aio_mutex, wait);
+ if (cv_err == ETIME)
+ cv_err = EAGAIN;
+ } else {
+ /* wait indefinitely */
+ cv_err = sig_cond_wait(&_aio_iowait_cv, &__aio_mutex);
+ }
+ /* this decrements _aio_suscv_cnt and drops __aio_mutex */
+ pthread_cleanup_pop(1);
+
+ if (cv_err) {
+ errno = cv_err;
+ break;
+ }
+ }
+ return (-1);
+}
+
+int
+aio_suspend(const aiocb_t * const list[], int nent,
+ const timespec_t *timeout)
+{
+ return (__aio_suspend((void **)list, nent, timeout, 0));
+}
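
A small usage sketch for the wrapper above: wait at most two seconds for one outstanding request, distinguishing the EAGAIN timeout case (the timeout value is invented):

	#include <aio.h>
	#include <errno.h>
	#include <time.h>

	/* Returns 1 if cbp completed, 0 on timeout, -1 on error. */
	int
	wait_for(const struct aiocb *cbp)
	{
		const struct aiocb *list[1] = { cbp };
		struct timespec ts = { 2, 0 };	/* two-second timeout */

		if (aio_suspend(list, 1, &ts) == 0)
			return (1);
		return (errno == EAGAIN ? 0 : -1);
	}
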
+
+int
+aio_error(const aiocb_t *aiocbp)
+{
+ const aio_result_t *resultp = &aiocbp->aio_resultp;
+ int error;
+
+ if ((error = resultp->aio_errno) == EINPROGRESS) {
+ if (aiocbp->aio_state == CHECK) {
+ /*
+ * Always do the kaio() call without using the
+ * KAIO_SUPPORTED() checks because it is not
+ * mandatory to have a valid fd set in the
+ * aiocb, only the resultp must be set.
+ */
+ if ((int)_kaio(AIOERROR, aiocbp) == EINVAL) {
+ errno = EINVAL;
+ return (-1);
+ }
+ error = resultp->aio_errno;
+ } else if (aiocbp->aio_state == CHECKED) {
+ ((aiocb_t *)aiocbp)->aio_state = CHECK;
+ }
+ }
+ return (error);
+}
+
+ssize_t
+aio_return(aiocb_t *aiocbp)
+{
+ aio_result_t *resultp = &aiocbp->aio_resultp;
+ aio_req_t *reqp;
+ int error;
+ ssize_t retval;
+
+ /*
+ * The _aiodone() function stores resultp->aio_return before
+	 * storing resultp->aio_errno (with a membar_producer() in
+	 * between). We use membar_consumer() below to ensure proper
+	 * memory ordering between _aiodone() and ourselves.
+ */
+ error = resultp->aio_errno;
+ membar_consumer();
+ retval = resultp->aio_return;
+
+ /*
+	 * We use this condition to indicate that either aio_return()
+	 * has already been called or that it should not have been
+	 * called yet.
+ */
+ if ((retval == -1 && error == EINVAL) || error == EINPROGRESS) {
+ errno = error;
+ return (-1);
+ }
+
+ /*
+ * Before we return, mark the result as being returned so that later
+ * calls to aio_return() will return the fact that the result has
+ * already been returned.
+ */
+ sig_mutex_lock(&__aio_mutex);
+ /* retest, in case more than one thread actually got in here */
+ if (resultp->aio_return == -1 && resultp->aio_errno == EINVAL) {
+ sig_mutex_unlock(&__aio_mutex);
+ errno = EINVAL;
+ return (-1);
+ }
+ resultp->aio_return = -1;
+ resultp->aio_errno = EINVAL;
+ if ((reqp = _aio_hash_del(resultp)) == NULL)
+ sig_mutex_unlock(&__aio_mutex);
+ else {
+ aiocbp->aio_state = NOCHECK;
+ ASSERT(reqp->req_head == NULL);
+ (void) _aio_req_remove(reqp);
+ sig_mutex_unlock(&__aio_mutex);
+ _aio_req_free(reqp);
+ }
+
+ if (retval == -1)
+ errno = error;
+ return (retval);
+}
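
The load ordering above (aio_errno first, membar_consumer(), then aio_return) backs the canonical reap pattern on the caller side, sketched here with abbreviated error handling:

	#include <aio.h>
	#include <errno.h>

	ssize_t
	reap(struct aiocb *cbp)
	{
		int err;

		while ((err = aio_error(cbp)) == EINPROGRESS)
			;	/* or block in aio_suspend() */
		if (err != 0) {
			errno = err;
			(void) aio_return(cbp);	/* still frees the request */
			return (-1);
		}
		return (aio_return(cbp));	/* valid exactly once */
	}
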
+
+void
+_lio_remove(aio_req_t *reqp)
+{
+ aio_lio_t *head;
+ int refcnt;
+
+ if ((head = reqp->req_head) != NULL) {
+ sig_mutex_lock(&head->lio_mutex);
+ ASSERT(head->lio_refcnt == head->lio_nent);
+ refcnt = --head->lio_nent;
+ head->lio_refcnt--;
+ sig_mutex_unlock(&head->lio_mutex);
+ if (refcnt == 0)
+ _aio_lio_free(head);
+ reqp->req_head = NULL;
+ }
+}
+
+/*
+ * This function returns the number of asynchronous I/O requests submitted.
+ */
+static int
+__aio_fsync_bar(aiocb_t *aiocbp, aio_lio_t *head, aio_worker_t *aiowp,
+ int workerscnt)
+{
+ int i;
+ int error;
+ aio_worker_t *next = aiowp;
+
+ for (i = 0; i < workerscnt; i++) {
+ error = _aio_rw(aiocbp, head, &next, AIOFSYNC, AIO_NO_KAIO);
+ if (error != 0) {
+ sig_mutex_lock(&head->lio_mutex);
+ head->lio_mode = LIO_DESTROY; /* ignore fsync */
+ head->lio_nent -= workerscnt - i;
+ head->lio_refcnt -= workerscnt - i;
+ sig_mutex_unlock(&head->lio_mutex);
+ errno = EAGAIN;
+ return (i);
+ }
+ next = next->work_forw;
+ }
+ return (i);
+}
+
+int
+aio_fsync(int op, aiocb_t *aiocbp)
+{
+ aio_lio_t *head;
+ struct stat statb;
+ int fret;
+
+ if (aiocbp == NULL)
+ return (0);
+ if (aiocbp->aio_reqprio < 0 || (op != O_DSYNC && op != O_SYNC)) {
+ errno = EINVAL;
+ return (-1);
+ }
+ if (_aio_hash_find(&aiocbp->aio_resultp) != NULL) {
+ errno = EBUSY;
+ return (-1);
+ }
+ if (fstat(aiocbp->aio_fildes, &statb) < 0)
+ return (-1);
+ if (_aio_sigev_thread(aiocbp) != 0)
+ return (-1);
+
+ /*
+ * Kernel aio_fsync() is not supported.
+ * We force user-level aio_fsync() just
+ * for the notification side-effect.
+ */
+ if (!__uaio_ok && __uaio_init() == -1)
+ return (-1);
+
+ /*
+ * The first asynchronous I/O request in the current process will
+ * create a bunch of workers (via __uaio_init()). If the number
+ * of workers is zero then the number of pending asynchronous I/O
+ * requests is zero. In such a case only execute the standard
+ * fsync(3C) or fdatasync(3RT) as appropriate.
+ */
+ if (__rw_workerscnt == 0) {
+ if (op == O_DSYNC)
+ return (__fdsync(aiocbp->aio_fildes, FDSYNC));
+ else
+ return (__fdsync(aiocbp->aio_fildes, FSYNC));
+ }
+
+ /*
+ * re-use aio_offset as the op field.
+ * O_DSYNC - fdatasync()
+ * O_SYNC - fsync()
+ */
+ aiocbp->aio_offset = op;
+ aiocbp->aio_lio_opcode = AIOFSYNC;
+
+ /*
+ * Create a list of fsync requests. The worker that
+ * gets the last request will do the fsync request.
+ */
+ head = _aio_lio_alloc();
+ if (head == NULL) {
+ errno = EAGAIN;
+ return (-1);
+ }
+ head->lio_mode = LIO_FSYNC;
+ head->lio_nent = head->lio_refcnt = __rw_workerscnt;
+ head->lio_largefile = 0;
+
+ /*
+ * Insert an fsync request on every worker's queue.
+ */
+ fret = __aio_fsync_bar(aiocbp, head, __workers_rw, __rw_workerscnt);
+ if (fret != __rw_workerscnt) {
+ /*
+ * Fewer fsync requests than workers means that it was
+ * not possible to submit fsync requests to all workers.
+ * Actions:
+ * a) number of fsync requests submitted is 0:
+ * => free allocated memory (aio_lio_t).
+ * b) number of fsync requests submitted is > 0:
+ * => the last worker executing the fsync request
+ * will free the aio_lio_t struct.
+ */
+ if (fret == 0)
+ _aio_lio_free(head);
+ return (-1);
+ }
+ return (0);
+}
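
A hedged usage sketch of the fsync barrier above: queue an asynchronous O_DSYNC flush and poll it to completion (real code would use aio_suspend() or a notification rather than spin):

	#include <aio.h>
	#include <fcntl.h>
	#include <errno.h>
	#include <string.h>

	int
	flush_async(int fd)
	{
		struct aiocb cb;
		int err;

		(void) memset(&cb, 0, sizeof (cb));
		cb.aio_fildes = fd;
		if (aio_fsync(O_DSYNC, &cb) != 0)
			return (-1);
		/* completes once all I/O queued on fd before the call is flushed */
		while ((err = aio_error(&cb)) == EINPROGRESS)
			;
		return (err == 0 ? 0 : -1);
	}
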
+
+int
+aio_cancel(int fd, aiocb_t *aiocbp)
+{
+ aio_req_t *reqp;
+ aio_worker_t *aiowp;
+ int done = 0;
+ int canceled = 0;
+ struct stat buf;
+
+ if (fstat(fd, &buf) < 0)
+ return (-1);
+
+ if (aiocbp != NULL) {
+ if (fd != aiocbp->aio_fildes) {
+ errno = EINVAL;
+ return (-1);
+ }
+ if (aiocbp->aio_state == USERAIO) {
+ sig_mutex_lock(&__aio_mutex);
+ reqp = _aio_hash_find(&aiocbp->aio_resultp);
+ if (reqp == NULL) {
+ sig_mutex_unlock(&__aio_mutex);
+ return (AIO_ALLDONE);
+ }
+ aiowp = reqp->req_worker;
+ sig_mutex_lock(&aiowp->work_qlock1);
+ (void) _aio_cancel_req(aiowp, reqp, &canceled, &done);
+ sig_mutex_unlock(&aiowp->work_qlock1);
+ sig_mutex_unlock(&__aio_mutex);
+ if (done)
+ return (AIO_ALLDONE);
+ if (canceled)
+ return (AIO_CANCELED);
+ return (AIO_NOTCANCELED);
+ }
+ if (aiocbp->aio_state == USERAIO_DONE)
+ return (AIO_ALLDONE);
+ return ((int)_kaio(AIOCANCEL, fd, aiocbp));
+ }
+
+ return (aiocancel_all(fd));
+}
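
The three return values map onto caller actions roughly as in this sketch:

	#include <aio.h>
	#include <stdio.h>

	void
	try_cancel(int fd, struct aiocb *cbp)
	{
		switch (aio_cancel(fd, cbp)) {
		case AIO_CANCELED:
			/* canceled; aio_return() will report ECANCELED */
			break;
		case AIO_NOTCANCELED:
			/* already in progress; must still be reaped */
			break;
		case AIO_ALLDONE:
			/* completed before the cancel attempt */
			break;
		default:
			perror("aio_cancel");
		}
	}
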
+
+/*
+ * __aio_waitn() cancellation handler.
+ */
+/* ARGSUSED */
+static void
+_aio_waitn_cleanup(void *arg)
+{
+ ASSERT(MUTEX_HELD(&__aio_mutex));
+
+ /* check for pending aio_waitn() calls */
+ _aio_flags &= ~(AIO_LIB_WAITN | AIO_WAIT_INPROGRESS | AIO_IO_WAITING);
+ if (_aio_flags & AIO_LIB_WAITN_PENDING) {
+ _aio_flags &= ~AIO_LIB_WAITN_PENDING;
+ (void) cond_signal(&_aio_waitn_cv);
+ }
+
+ sig_mutex_unlock(&__aio_mutex);
+}
+
+/*
+ * aio_waitn can be used to reap the results of several I/O operations that
+ * were submitted asynchronously. The submission of I/Os can be done using
+ * existing POSIX interfaces: lio_listio, aio_write or aio_read.
+ * aio_waitn waits until "nwait" I/Os (supplied as a parameter) have
+ * completed and it returns the descriptors for these I/Os in "list". The
+ * maximum size of this list is given by "nent" and the actual number of I/Os
+ * completed is returned in "nwait". aio_waitn may also return early
+ * if the timeout expires. It returns 0 if successful or -1 if an
+ * error occurred.
+ */
+static int
+__aio_waitn(void **list, uint_t nent, uint_t *nwait, const timespec_t *utimo)
+{
+ int error = 0;
+	uint_t dnwait = 0;	/* number of requests in the waitn-done list */
+ uint_t kwaitcnt; /* expected "done" requests from kernel */
+ uint_t knentcnt; /* max. expected "done" requests from kernel */
+ int uerrno = 0;
+ int kerrno = 0; /* save errno from _kaio() call */
+ int timedwait = AIO_TIMEOUT_UNDEF;
+ aio_req_t *reqp;
+ timespec_t end;
+ timespec_t twait; /* copy of utimo for internal calculations */
+ timespec_t *wait = NULL;
+
+ if (nent == 0 || *nwait == 0 || *nwait > nent) {
+ errno = EINVAL;
+ return (-1);
+ }
+
+ /*
+	 * Only one running aio_waitn() call per process is allowed.
+	 * Further calls are blocked here until the running call
+	 * finishes.
+ */
+
+ sig_mutex_lock(&__aio_mutex);
+
+ while (_aio_flags & AIO_LIB_WAITN) {
+ if (utimo && utimo->tv_sec == 0 && utimo->tv_nsec == 0) {
+ sig_mutex_unlock(&__aio_mutex);
+ *nwait = 0;
+ return (0);
+ }
+ _aio_flags |= AIO_LIB_WAITN_PENDING;
+ pthread_cleanup_push(sig_mutex_unlock, &__aio_mutex);
+ error = sig_cond_wait(&_aio_waitn_cv, &__aio_mutex);
+ pthread_cleanup_pop(0);
+ if (error != 0) {
+ sig_mutex_unlock(&__aio_mutex);
+ *nwait = 0;
+ errno = error;
+ return (-1);
+ }
+ }
+
+ pthread_cleanup_push(_aio_waitn_cleanup, NULL);
+
+ _aio_flags |= AIO_LIB_WAITN;
+
+ if (*nwait >= AIO_WAITN_MAXIOCBS) {
+ if (_aio_check_timeout(utimo, &end, &timedwait) != 0) {
+ error = -1;
+ dnwait = 0;
+ goto out;
+ }
+ if (timedwait != AIO_TIMEOUT_INDEF) {
+ twait = *utimo;
+ wait = &twait;
+ }
+ }
+
+ /*
+ * If both counters are still set to zero, then only
+ * kernel requests are currently outstanding (raw-I/Os).
+ */
+ if ((_aio_doneq_cnt + _aio_outstand_cnt) == 0) {
+ for (;;) {
+ kwaitcnt = *nwait - dnwait;
+ knentcnt = nent - dnwait;
+ if (knentcnt > AIO_WAITN_MAXIOCBS)
+ knentcnt = AIO_WAITN_MAXIOCBS;
+ kwaitcnt = (kwaitcnt > knentcnt) ? knentcnt : kwaitcnt;
+
+ pthread_cleanup_push(sig_mutex_lock, &__aio_mutex);
+ sig_mutex_unlock(&__aio_mutex);
+ _cancel_prologue();
+ error = (int)_kaio(AIOWAITN, &list[dnwait], knentcnt,
+ &kwaitcnt, wait);
+ _cancel_epilogue();
+ pthread_cleanup_pop(1);
+
+ if (error == 0) {
+ dnwait += kwaitcnt;
+ if (dnwait >= *nwait ||
+ *nwait < AIO_WAITN_MAXIOCBS)
+ break;
+ if (timedwait == AIO_TIMEOUT_WAIT) {
+ error = _aio_get_timedelta(&end, wait);
+ if (error == -1) {
+ /* timer expired */
+ errno = ETIME;
+ break;
+ }
+ }
+ continue;
+ }
+ if (errno == EAGAIN) {
+ if (dnwait > 0)
+ error = 0;
+ break;
+ }
+ if (errno == ETIME || errno == EINTR) {
+ dnwait += kwaitcnt;
+ break;
+ }
+ /* fatal error */
+ break;
+ }
+
+ goto out;
+ }
+
+ /* File system I/Os outstanding ... */
+
+ if (timedwait == AIO_TIMEOUT_UNDEF) {
+ if (_aio_check_timeout(utimo, &end, &timedwait) != 0) {
+ error = -1;
+ dnwait = 0;
+ goto out;
+ }
+ if (timedwait != AIO_TIMEOUT_INDEF) {
+ twait = *utimo;
+ wait = &twait;
+ }
+ }
+
+ for (;;) {
+ uint_t sum_reqs;
+
+ /*
+ * Calculate sum of active non RAW-IO requests (sum_reqs).
+		 * If the expected number of completed requests (*nwait) is
+ * greater than the calculated sum (sum_reqs) then
+ * use _kaio to check pending RAW-IO requests.
+ */
+ sum_reqs = _aio_doneq_cnt + dnwait + _aio_outstand_cnt;
+ kwaitcnt = (*nwait > sum_reqs) ? *nwait - sum_reqs : 0;
+
+ if (kwaitcnt != 0) {
+ /* possibly some kernel I/Os outstanding */
+ knentcnt = nent - dnwait;
+ if (knentcnt > AIO_WAITN_MAXIOCBS)
+ knentcnt = AIO_WAITN_MAXIOCBS;
+ kwaitcnt = (kwaitcnt > knentcnt) ? knentcnt : kwaitcnt;
+
+ _aio_flags |= AIO_WAIT_INPROGRESS;
+
+ pthread_cleanup_push(sig_mutex_lock, &__aio_mutex);
+ sig_mutex_unlock(&__aio_mutex);
+ _cancel_prologue();
+ error = (int)_kaio(AIOWAITN, &list[dnwait], knentcnt,
+ &kwaitcnt, wait);
+ _cancel_epilogue();
+ pthread_cleanup_pop(1);
+
+ _aio_flags &= ~AIO_WAIT_INPROGRESS;
+
+ if (error == 0) {
+ dnwait += kwaitcnt;
+ } else {
+ switch (errno) {
+ case EINVAL:
+ case EAGAIN:
+ /* don't wait for kernel I/Os */
+ kerrno = 0; /* ignore _kaio() errno */
+ *nwait = _aio_doneq_cnt +
+ _aio_outstand_cnt + dnwait;
+ error = 0;
+ break;
+ case EINTR:
+ case ETIME:
+ /* just scan for completed LIB I/Os */
+ dnwait += kwaitcnt;
+ timedwait = AIO_TIMEOUT_POLL;
+ kerrno = errno; /* save _kaio() errno */
+ error = 0;
+ break;
+ default:
+ kerrno = errno; /* save _kaio() errno */
+ break;
+ }
+ }
+ if (error)
+ break; /* fatal kernel error */
+ }
+
+ /* check completed FS requests in the "done" queue */
+
+ while (_aio_doneq_cnt && dnwait < nent) {
+ /* get done requests */
+ if ((reqp = _aio_req_remove(NULL)) != NULL) {
+ (void) _aio_hash_del(reqp->req_resultp);
+ list[dnwait++] = reqp->req_aiocbp;
+ _aio_req_mark_done(reqp);
+ _lio_remove(reqp);
+ _aio_req_free(reqp);
+ }
+ }
+
+ if (dnwait >= *nwait) {
+			/* minimum requested number of completed I/Os satisfied */
+ break;
+ }
+ if (timedwait == AIO_TIMEOUT_WAIT &&
+ (error = _aio_get_timedelta(&end, wait)) == -1) {
+ /* timer expired */
+ uerrno = ETIME;
+ break;
+ }
+
+ /*
+ * If some I/Os are outstanding and we have to wait for them,
+		 * then sleep here. _aiodone() will call _aio_waitn_wakeup()
+		 * to wake up this thread as soon as the required number of
+		 * I/Os has completed.
+ */
+ if (_aio_outstand_cnt > 0 && timedwait != AIO_TIMEOUT_POLL) {
+ /*
+ * _aio_waitn_wakeup() will wake up this thread when:
+ * - _aio_waitncnt requests are completed or
+ * - _aio_outstand_cnt becomes zero.
+ * sig_cond_reltimedwait() could also return with
+ * a timeout error (ETIME).
+ */
+ if (*nwait < _aio_outstand_cnt)
+ _aio_waitncnt = *nwait;
+ else
+ _aio_waitncnt = _aio_outstand_cnt;
+
+ _aio_flags |= AIO_IO_WAITING;
+
+ if (wait)
+ uerrno = sig_cond_reltimedwait(&_aio_iowait_cv,
+ &__aio_mutex, wait);
+ else
+ uerrno = sig_cond_wait(&_aio_iowait_cv,
+ &__aio_mutex);
+
+ _aio_flags &= ~AIO_IO_WAITING;
+
+ if (uerrno == ETIME) {
+ timedwait = AIO_TIMEOUT_POLL;
+ continue;
+ }
+ if (uerrno != 0)
+ timedwait = AIO_TIMEOUT_POLL;
+ }
+
+ if (timedwait == AIO_TIMEOUT_POLL) {
+ /* polling or timer expired */
+ break;
+ }
+ }
+
+ errno = uerrno == 0 ? kerrno : uerrno;
+ if (errno)
+ error = -1;
+ else
+ error = 0;
+
+out:
+ *nwait = dnwait;
+
+ pthread_cleanup_pop(1); /* drops __aio_mutex */
+
+ return (error);
+}
+
+int
+aio_waitn(aiocb_t *list[], uint_t nent, uint_t *nwait,
+ const timespec_t *timeout)
+{
+ return (__aio_waitn((void **)list, nent, nwait, timeout));
+}
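
A caller sketch for the Solaris-specific aio_waitn() wrapper above; the batch sizes are invented for illustration:

	#include <sys/types.h>
	#include <aio.h>

	int
	reap_batch(void)
	{
		aiocb_t *done[16];
		uint_t nwait = 4; /* in: minimum wanted; out: number returned */

		if (aio_waitn(done, 16, &nwait, NULL) != 0)
			return (-1);
		/* done[0..nwait-1] now hold completed control blocks */
		return ((int)nwait);
	}
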
+
+void
+_aio_waitn_wakeup(void)
+{
+ /*
+ * __aio_waitn() sets AIO_IO_WAITING to notify _aiodone() that
+ * it is waiting for completed I/Os. The number of required
+ * completed I/Os is stored into "_aio_waitncnt".
+ * aio_waitn() is woken up when
+ * - there are no further outstanding I/Os
+ * (_aio_outstand_cnt == 0) or
+ * - the expected number of I/Os has completed.
+ * Only one __aio_waitn() function waits for completed I/Os at
+ * a time.
+ *
+ * __aio_suspend() increments "_aio_suscv_cnt" to notify
+ * _aiodone() that at least one __aio_suspend() call is
+ * waiting for completed I/Os.
+ * There could be more than one __aio_suspend() function
+ * waiting for completed I/Os. Because every function should
+ * be waiting for different I/Os, _aiodone() has to wake up all
+ * __aio_suspend() functions each time.
+ * Every __aio_suspend() function will compare the recently
+ * completed I/O with its own list.
+ */
+ ASSERT(MUTEX_HELD(&__aio_mutex));
+ if (_aio_flags & AIO_IO_WAITING) {
+ if (_aio_waitncnt > 0)
+ _aio_waitncnt--;
+ if (_aio_outstand_cnt == 0 || _aio_waitncnt == 0 ||
+ _aio_suscv_cnt > 0)
+ (void) cond_broadcast(&_aio_iowait_cv);
+ } else {
+ /* Wake up waiting aio_suspend calls */
+ if (_aio_suscv_cnt > 0)
+ (void) cond_broadcast(&_aio_iowait_cv);
+ }
+}
+
+/*
+ * timedwait values:
+ * AIO_TIMEOUT_POLL: polling
+ * AIO_TIMEOUT_WAIT: timeout
+ * AIO_TIMEOUT_INDEF: wait indefinitely
+ */
+static int
+_aio_check_timeout(const timespec_t *utimo, timespec_t *end, int *timedwait)
+{
+ struct timeval curtime;
+
+ if (utimo) {
+ if (utimo->tv_sec < 0 || utimo->tv_nsec < 0 ||
+ utimo->tv_nsec >= NANOSEC) {
+ errno = EINVAL;
+ return (-1);
+ }
+ if (utimo->tv_sec > 0 || utimo->tv_nsec > 0) {
+ (void) gettimeofday(&curtime, NULL);
+ end->tv_sec = utimo->tv_sec + curtime.tv_sec;
+ end->tv_nsec = utimo->tv_nsec + 1000 * curtime.tv_usec;
+ if (end->tv_nsec >= NANOSEC) {
+ end->tv_nsec -= NANOSEC;
+ end->tv_sec += 1;
+ }
+ *timedwait = AIO_TIMEOUT_WAIT;
+ } else {
+ /* polling */
+ *timedwait = AIO_TIMEOUT_POLL;
+ }
+ } else {
+ *timedwait = AIO_TIMEOUT_INDEF; /* wait indefinitely */
+ }
+ return (0);
+}
+
+#if !defined(_LP64)
+
+int
+aio_read64(aiocb64_t *aiocbp)
+{
+ if (aiocbp == NULL || aiocbp->aio_reqprio < 0) {
+ errno = EINVAL;
+ return (-1);
+ }
+ if (_aio_hash_find(&aiocbp->aio_resultp) != NULL) {
+ errno = EBUSY;
+ return (-1);
+ }
+ if (_aio_sigev_thread64(aiocbp) != 0)
+ return (-1);
+ aiocbp->aio_lio_opcode = LIO_READ;
+ return (_aio_rw64(aiocbp, NULL, &__nextworker_rw, AIOAREAD64,
+ (AIO_KAIO | AIO_NO_DUPS)));
+}
+
+int
+aio_write64(aiocb64_t *aiocbp)
+{
+ if (aiocbp == NULL || aiocbp->aio_reqprio < 0) {
+ errno = EINVAL;
+ return (-1);
+ }
+ if (_aio_hash_find(&aiocbp->aio_resultp) != NULL) {
+ errno = EBUSY;
+ return (-1);
+ }
+ if (_aio_sigev_thread64(aiocbp) != 0)
+ return (-1);
+ aiocbp->aio_lio_opcode = LIO_WRITE;
+ return (_aio_rw64(aiocbp, NULL, &__nextworker_rw, AIOAWRITE64,
+ (AIO_KAIO | AIO_NO_DUPS)));
+}
+
+int
+lio_listio64(int mode, aiocb64_t *_RESTRICT_KYWD const *_RESTRICT_KYWD list,
+ int nent, struct sigevent *_RESTRICT_KYWD sigevp)
+{
+ int aio_ufs = 0;
+ int oerrno = 0;
+ aio_lio_t *head = NULL;
+ aiocb64_t *aiocbp;
+ int state = 0;
+ int EIOflg = 0;
+ int rw;
+ int do_kaio = 0;
+ int error;
+ int i;
+
+ if (!_kaio_ok)
+ _kaio_init();
+
+ if (aio_list_max == 0)
+ aio_list_max = sysconf(_SC_AIO_LISTIO_MAX);
+
+ if (nent <= 0 || nent > aio_list_max) {
+ errno = EINVAL;
+ return (-1);
+ }
+
+ switch (mode) {
+ case LIO_WAIT:
+ state = NOCHECK;
+ break;
+ case LIO_NOWAIT:
+ state = CHECK;
+ break;
+ default:
+ errno = EINVAL;
+ return (-1);
+ }
+
+ for (i = 0; i < nent; i++) {
+ if ((aiocbp = list[i]) == NULL)
+ continue;
+ if (_aio_hash_find(&aiocbp->aio_resultp) != NULL) {
+ errno = EBUSY;
+ return (-1);
+ }
+ if (_aio_sigev_thread64(aiocbp) != 0)
+ return (-1);
+ if (aiocbp->aio_lio_opcode == LIO_NOP)
+ aiocbp->aio_state = NOCHECK;
+ else {
+ aiocbp->aio_state = state;
+ if (KAIO_SUPPORTED(aiocbp->aio_fildes))
+ do_kaio++;
+ else
+ aiocbp->aio_resultp.aio_errno = ENOTSUP;
+ }
+ }
+ if (_aio_sigev_thread_init(sigevp) != 0)
+ return (-1);
+
+ if (do_kaio) {
+ error = (int)_kaio(AIOLIO64, mode, list, nent, sigevp);
+ if (error == 0)
+ return (0);
+ oerrno = errno;
+ } else {
+ oerrno = errno = ENOTSUP;
+ error = -1;
+ }
+
+ if (error == -1 && errno == ENOTSUP) {
+ error = errno = 0;
+ /*
+ * If LIO_WAIT, or notification required, allocate a list head.
+ */
+ if (mode == LIO_WAIT ||
+ (sigevp != NULL &&
+ (sigevp->sigev_notify == SIGEV_SIGNAL ||
+ sigevp->sigev_notify == SIGEV_THREAD ||
+ sigevp->sigev_notify == SIGEV_PORT)))
+ head = _aio_lio_alloc();
+ if (head) {
+ sig_mutex_lock(&head->lio_mutex);
+ head->lio_mode = mode;
+ head->lio_largefile = 1;
+ if (mode == LIO_NOWAIT && sigevp != NULL) {
+ if (sigevp->sigev_notify == SIGEV_THREAD) {
+ head->lio_port = sigevp->sigev_signo;
+ head->lio_event = AIOLIO64;
+ head->lio_sigevent = sigevp;
+ head->lio_sigval.sival_ptr =
+ sigevp->sigev_value.sival_ptr;
+ } else if (sigevp->sigev_notify == SIGEV_PORT) {
+ port_notify_t *pn =
+ sigevp->sigev_value.sival_ptr;
+ head->lio_port = pn->portnfy_port;
+ head->lio_event = AIOLIO64;
+ head->lio_sigevent = sigevp;
+ head->lio_sigval.sival_ptr =
+ pn->portnfy_user;
+ } else { /* SIGEV_SIGNAL */
+ head->lio_signo = sigevp->sigev_signo;
+ head->lio_sigval.sival_ptr =
+ sigevp->sigev_value.sival_ptr;
+ }
+ }
+ head->lio_nent = head->lio_refcnt = nent;
+ sig_mutex_unlock(&head->lio_mutex);
+ }
+ /*
+		 * Find the UFS requests (errno == ENOTSUP/EBADFD).
+ */
+ for (i = 0; i < nent; i++) {
+ if ((aiocbp = list[i]) == NULL ||
+ aiocbp->aio_lio_opcode == LIO_NOP ||
+ (aiocbp->aio_resultp.aio_errno != ENOTSUP &&
+ aiocbp->aio_resultp.aio_errno != EBADFD)) {
+ if (head)
+ _lio_list_decr(head);
+ continue;
+ }
+ if (aiocbp->aio_resultp.aio_errno == EBADFD)
+ SET_KAIO_NOT_SUPPORTED(aiocbp->aio_fildes);
+ if (aiocbp->aio_reqprio < 0) {
+ aiocbp->aio_resultp.aio_errno = EINVAL;
+ aiocbp->aio_resultp.aio_return = -1;
+ EIOflg = 1;
+ if (head)
+ _lio_list_decr(head);
+ continue;
+ }
+ /*
+ * submit an AIO request with flags AIO_NO_KAIO
+ * to avoid the kaio() syscall in _aio_rw()
+ */
+ switch (aiocbp->aio_lio_opcode) {
+ case LIO_READ:
+ rw = AIOAREAD64;
+ break;
+ case LIO_WRITE:
+ rw = AIOAWRITE64;
+ break;
+ }
+ error = _aio_rw64(aiocbp, head, &__nextworker_rw, rw,
+ (AIO_NO_KAIO | AIO_NO_DUPS));
+ if (error == 0)
+ aio_ufs++;
+ else {
+ if (head)
+ _lio_list_decr(head);
+ aiocbp->aio_resultp.aio_errno = error;
+ EIOflg = 1;
+ }
+ }
+ }
+ if (EIOflg) {
+ errno = EIO;
+ return (-1);
+ }
+ if (mode == LIO_WAIT && oerrno == ENOTSUP) {
+ /*
+ * call kaio(AIOLIOWAIT) to get all outstanding
+ * kernel AIO requests
+ */
+ if ((nent - aio_ufs) > 0)
+ (void) _kaio(AIOLIOWAIT, mode, list, nent, sigevp);
+ if (head != NULL && head->lio_nent > 0) {
+ sig_mutex_lock(&head->lio_mutex);
+ while (head->lio_refcnt > 0) {
+ int err;
+ head->lio_waiting = 1;
+ pthread_cleanup_push(_lio_listio_cleanup, head);
+ err = sig_cond_wait(&head->lio_cond_cv,
+ &head->lio_mutex);
+ pthread_cleanup_pop(0);
+ head->lio_waiting = 0;
+ if (err && head->lio_nent > 0) {
+ sig_mutex_unlock(&head->lio_mutex);
+ errno = err;
+ return (-1);
+ }
+ }
+ sig_mutex_unlock(&head->lio_mutex);
+ ASSERT(head->lio_nent == 0 && head->lio_refcnt == 0);
+ _aio_lio_free(head);
+ for (i = 0; i < nent; i++) {
+ if ((aiocbp = list[i]) != NULL &&
+ aiocbp->aio_resultp.aio_errno) {
+ errno = EIO;
+ return (-1);
+ }
+ }
+ }
+ return (0);
+ }
+ return (error);
+}
+
+int
+aio_suspend64(const aiocb64_t * const list[], int nent,
+ const timespec_t *timeout)
+{
+ return (__aio_suspend((void **)list, nent, timeout, 1));
+}
+
+int
+aio_error64(const aiocb64_t *aiocbp)
+{
+ const aio_result_t *resultp = &aiocbp->aio_resultp;
+ int error;
+
+ if ((error = resultp->aio_errno) == EINPROGRESS) {
+ if (aiocbp->aio_state == CHECK) {
+ /*
+ * Always do the kaio() call without using the
+ * KAIO_SUPPORTED() checks because it is not
+ * mandatory to have a valid fd set in the
+ * aiocb, only the resultp must be set.
+ */
+ if ((int)_kaio(AIOERROR64, aiocbp) == EINVAL) {
+ errno = EINVAL;
+ return (-1);
+ }
+ error = resultp->aio_errno;
+ } else if (aiocbp->aio_state == CHECKED) {
+ ((aiocb64_t *)aiocbp)->aio_state = CHECK;
+ }
+ }
+ return (error);
+}
+
+ssize_t
+aio_return64(aiocb64_t *aiocbp)
+{
+ aio_result_t *resultp = &aiocbp->aio_resultp;
+ aio_req_t *reqp;
+ int error;
+ ssize_t retval;
+
+ /*
+ * The _aiodone() function stores resultp->aio_return before
+	 * storing resultp->aio_errno (with a membar_producer() in
+	 * between). We use membar_consumer() below to ensure proper
+	 * memory ordering between _aiodone() and ourselves.
+ */
+ error = resultp->aio_errno;
+ membar_consumer();
+ retval = resultp->aio_return;
+
+ /*
+	 * We use this condition to indicate that either aio_return()
+	 * has already been called or that it should not have been
+	 * called yet.
+ */
+ if ((retval == -1 && error == EINVAL) || error == EINPROGRESS) {
+ errno = error;
+ return (-1);
+ }
+
+ /*
+ * Before we return, mark the result as being returned so that later
+ * calls to aio_return() will return the fact that the result has
+ * already been returned.
+ */
+ sig_mutex_lock(&__aio_mutex);
+ /* retest, in case more than one thread actually got in here */
+ if (resultp->aio_return == -1 && resultp->aio_errno == EINVAL) {
+ sig_mutex_unlock(&__aio_mutex);
+ errno = EINVAL;
+ return (-1);
+ }
+ resultp->aio_return = -1;
+ resultp->aio_errno = EINVAL;
+ if ((reqp = _aio_hash_del(resultp)) == NULL)
+ sig_mutex_unlock(&__aio_mutex);
+ else {
+ aiocbp->aio_state = NOCHECK;
+ ASSERT(reqp->req_head == NULL);
+ (void) _aio_req_remove(reqp);
+ sig_mutex_unlock(&__aio_mutex);
+ _aio_req_free(reqp);
+ }
+
+ if (retval == -1)
+ errno = error;
+ return (retval);
+}
+
+static int
+__aio_fsync_bar64(aiocb64_t *aiocbp, aio_lio_t *head, aio_worker_t *aiowp,
+ int workerscnt)
+{
+ int i;
+ int error;
+ aio_worker_t *next = aiowp;
+
+ for (i = 0; i < workerscnt; i++) {
+ error = _aio_rw64(aiocbp, head, &next, AIOFSYNC, AIO_NO_KAIO);
+ if (error != 0) {
+ sig_mutex_lock(&head->lio_mutex);
+ head->lio_mode = LIO_DESTROY; /* ignore fsync */
+ head->lio_nent -= workerscnt - i;
+ head->lio_refcnt -= workerscnt - i;
+ sig_mutex_unlock(&head->lio_mutex);
+ errno = EAGAIN;
+ return (i);
+ }
+ next = next->work_forw;
+ }
+ return (i);
+}
+
+int
+aio_fsync64(int op, aiocb64_t *aiocbp)
+{
+ aio_lio_t *head;
+ struct stat statb;
+ int fret;
+
+ if (aiocbp == NULL)
+ return (0);
+ if (aiocbp->aio_reqprio < 0 || (op != O_DSYNC && op != O_SYNC)) {
+ errno = EINVAL;
+ return (-1);
+ }
+ if (_aio_hash_find(&aiocbp->aio_resultp) != NULL) {
+ errno = EBUSY;
+ return (-1);
+ }
+ if (fstat(aiocbp->aio_fildes, &statb) < 0)
+ return (-1);
+ if (_aio_sigev_thread64(aiocbp) != 0)
+ return (-1);
+
+ /*
+ * Kernel aio_fsync() is not supported.
+ * We force user-level aio_fsync() just
+ * for the notification side-effect.
+ */
+ if (!__uaio_ok && __uaio_init() == -1)
+ return (-1);
+
+ /*
+ * The first asynchronous I/O request in the current process will
+ * create a bunch of workers (via __uaio_init()). If the number
+ * of workers is zero then the number of pending asynchronous I/O
+ * requests is zero. In such a case only execute the standard
+ * fsync(3C) or fdatasync(3RT) as appropriate.
+ */
+ if (__rw_workerscnt == 0) {
+ if (op == O_DSYNC)
+ return (__fdsync(aiocbp->aio_fildes, FDSYNC));
+ else
+ return (__fdsync(aiocbp->aio_fildes, FSYNC));
+ }
+
+ /*
+ * re-use aio_offset as the op field.
+ * O_DSYNC - fdatasync()
+ * O_SYNC - fsync()
+ */
+ aiocbp->aio_offset = op;
+ aiocbp->aio_lio_opcode = AIOFSYNC;
+
+ /*
+ * Create a list of fsync requests. The worker that
+ * gets the last request will do the fsync request.
+ */
+ head = _aio_lio_alloc();
+ if (head == NULL) {
+ errno = EAGAIN;
+ return (-1);
+ }
+ head->lio_mode = LIO_FSYNC;
+ head->lio_nent = head->lio_refcnt = __rw_workerscnt;
+ head->lio_largefile = 1;
+
+ /*
+ * Insert an fsync request on every worker's queue.
+ */
+ fret = __aio_fsync_bar64(aiocbp, head, __workers_rw, __rw_workerscnt);
+ if (fret != __rw_workerscnt) {
+ /*
+ * Fewer fsync requests than workers means that it was
+ * not possible to submit fsync requests to all workers.
+ * Actions:
+ * a) number of fsync requests submitted is 0:
+ * => free allocated memory (aio_lio_t).
+ * b) number of fsync requests submitted is > 0:
+ * => the last worker executing the fsync request
+ * will free the aio_lio_t struct.
+ */
+ if (fret == 0)
+ _aio_lio_free(head);
+ return (-1);
+ }
+ return (0);
+}
+
+int
+aio_cancel64(int fd, aiocb64_t *aiocbp)
+{
+ aio_req_t *reqp;
+ aio_worker_t *aiowp;
+ int done = 0;
+ int canceled = 0;
+ struct stat buf;
+
+ if (fstat(fd, &buf) < 0)
+ return (-1);
+
+ if (aiocbp != NULL) {
+ if (fd != aiocbp->aio_fildes) {
+ errno = EINVAL;
+ return (-1);
+ }
+ if (aiocbp->aio_state == USERAIO) {
+ sig_mutex_lock(&__aio_mutex);
+ reqp = _aio_hash_find(&aiocbp->aio_resultp);
+ if (reqp == NULL) {
+ sig_mutex_unlock(&__aio_mutex);
+ return (AIO_ALLDONE);
+ }
+ aiowp = reqp->req_worker;
+ sig_mutex_lock(&aiowp->work_qlock1);
+ (void) _aio_cancel_req(aiowp, reqp, &canceled, &done);
+ sig_mutex_unlock(&aiowp->work_qlock1);
+ sig_mutex_unlock(&__aio_mutex);
+ if (done)
+ return (AIO_ALLDONE);
+ if (canceled)
+ return (AIO_CANCELED);
+ return (AIO_NOTCANCELED);
+ }
+ if (aiocbp->aio_state == USERAIO_DONE)
+ return (AIO_ALLDONE);
+ return ((int)_kaio(AIOCANCEL, fd, aiocbp));
+ }
+
+ return (aiocancel_all(fd));
+}
+
+int
+aio_waitn64(aiocb64_t *list[], uint_t nent, uint_t *nwait,
+ const timespec_t *timeout)
+{
+ return (__aio_waitn((void **)list, nent, nwait, timeout));
+}
+
+#endif /* !defined(_LP64) */
diff --git a/usr/src/lib/libc/port/gen/event_port.c b/usr/src/lib/libc/port/gen/event_port.c
index 84ade99164..f4eb057dec 100644
--- a/usr/src/lib/libc/port/gen/event_port.c
+++ b/usr/src/lib/libc/port/gen/event_port.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -19,8 +18,9 @@
*
* CDDL HEADER END
*/
+
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -34,7 +34,6 @@
#pragma weak port_get = _port_get
#pragma weak port_getn = _port_getn
#pragma weak port_alert = _port_alert
-#pragma weak port_dispatch = _port_dispatch
#include "lint.h"
#include <sys/types.h>
@@ -128,7 +127,7 @@ _port_send(int port, int events, void *user)
}
/*
- * port_dispatch() will block if there are not resources available to
+ * _port_dispatch() will block if there are no resources available to
* satisfy the request.
*/
diff --git a/usr/src/lib/libc/port/llib-lc b/usr/src/lib/libc/port/llib-lc
index 0c213a116c..502aa4cb33 100644
--- a/usr/src/lib/libc/port/llib-lc
+++ b/usr/src/lib/libc/port/llib-lc
@@ -29,6 +29,7 @@
/* LINTLIBRARY */
/* PROTOLIB1 */
+#include <aio.h>
#include <alloca.h>
#include <atomic.h>
#include <ctype.h>
@@ -51,6 +52,7 @@
#include <locale.h>
#include <memory.h>
#include <mon.h>
+#include <mqueue.h>
#include <nan.h>
#include <ndbm.h>
#include <limits.h>
@@ -61,7 +63,9 @@
#include <rctl.h>
#include <regex.h>
#include <rpcsvc/ypclnt.h>
+#include <sched.h>
#include <search.h>
+#include <semaphore.h>
#include <setjmp.h>
#include <shadow.h>
#include <siginfo.h>
@@ -80,6 +84,7 @@
#include <synch.h>
#include <sys/acctctl.h>
#include <sys/acl.h>
+#include <sys/asynch.h>
#include <sys/byteorder.h>
#include <sys/cladm.h>
#include <sys/corectl.h>
diff --git a/usr/src/lib/libc/port/rt/clock_timer.c b/usr/src/lib/libc/port/rt/clock_timer.c
new file mode 100644
index 0000000000..8dfb35be91
--- /dev/null
+++ b/usr/src/lib/libc/port/rt/clock_timer.c
@@ -0,0 +1,179 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#pragma weak clock_getres = _clock_getres
+#pragma weak clock_gettime = _clock_gettime
+#pragma weak clock_settime = _clock_settime
+#pragma weak timer_create = _timer_create
+#pragma weak timer_delete = _timer_delete
+#pragma weak timer_getoverrun = _timer_getoverrun
+#pragma weak timer_gettime = _timer_gettime
+#pragma weak timer_settime = _timer_settime
+
+#include "synonyms.h"
+#include <time.h>
+#include <sys/types.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include "sigev_thread.h"
+
+/*
+ * System call wrappers found elsewhere in libc (common/sys/__clock_timer.s).
+ */
+extern int __clock_getres(clockid_t, timespec_t *);
+extern int __clock_gettime(clockid_t, timespec_t *);
+extern int __clock_settime(clockid_t, const timespec_t *);
+extern int __timer_create(clockid_t, struct sigevent *, timer_t *);
+extern int __timer_delete(timer_t);
+extern int __timer_getoverrun(timer_t);
+extern int __timer_gettime(timer_t, itimerspec_t *);
+extern int __timer_settime(timer_t, int, const itimerspec_t *, itimerspec_t *);
+
+/*
+ * Array of pointers to tcd's, indexed by timer id.
+ * No more than 'timer_max' timers can be created by any process.
+ */
+int timer_max = 0;
+thread_communication_data_t **timer_tcd;
+static pthread_once_t timer_once = PTHREAD_ONCE_INIT;
+
+static void
+timer_init(void)
+{
+ timer_max = (int)_sysconf(_SC_TIMER_MAX);
+ timer_tcd = malloc(timer_max * sizeof (*timer_tcd));
+ (void) memset(timer_tcd, 0, timer_max * sizeof (*timer_tcd));
+}
+
+int
+_clock_getres(clockid_t clock_id, timespec_t *res)
+{
+ return (__clock_getres(clock_id, res));
+}
+
+int
+_clock_gettime(clockid_t clock_id, timespec_t *tp)
+{
+ return (__clock_gettime(clock_id, tp));
+}
+
+int
+_clock_settime(clockid_t clock_id, const timespec_t *tp)
+{
+ return (__clock_settime(clock_id, tp));
+}
+
+int
+_timer_create(clockid_t clock_id, struct sigevent *sigevp, timer_t *timerid)
+{
+ struct sigevent sigevent;
+ port_notify_t port_notify;
+ thread_communication_data_t *tcdp;
+ int sigev_thread = 0;
+ int rc;
+
+ (void) pthread_once(&timer_once, timer_init);
+
+ if (sigevp != NULL &&
+ sigevp->sigev_notify == SIGEV_THREAD &&
+ sigevp->sigev_notify_function != NULL) {
+ sigev_thread = 1;
+ tcdp = setup_sigev_handler(sigevp, TIMER);
+ if (tcdp == NULL)
+ return (-1);
+ /* copy the sigevent structure so we can modify it */
+ sigevent = *sigevp;
+ sigevp = &sigevent;
+ port_notify.portnfy_port = tcdp->tcd_port;
+ port_notify.portnfy_user = NULL;
+ sigevp->sigev_value.sival_ptr = &port_notify;
+ }
+
+ rc = __timer_create(clock_id, sigevp, timerid);
+
+ if (sigev_thread) {
+ if (rc == 0) {
+ if ((rc = launch_spawner(tcdp)) != 0)
+ __timer_delete(*timerid);
+ else
+ timer_tcd[*timerid] = tcdp;
+ }
+ if (rc != 0)
+ free_sigev_handler(tcdp);
+ }
+
+ return (rc);
+}
+
+int
+_timer_delete(timer_t timerid)
+{
+ int rc;
+
+ if ((rc = del_sigev_timer(timerid)) == 0)
+ return (__timer_delete(timerid));
+ else
+ return (rc);
+}
+
+int
+_timer_getoverrun(timer_t timerid)
+{
+ return (__timer_getoverrun(timerid) + sigev_timer_getoverrun(timerid));
+}
+
+int
+_timer_gettime(timer_t timerid, itimerspec_t *value)
+{
+ return (__timer_gettime(timerid, value));
+}
+
+int
+_timer_settime(timer_t timerid, int flags, const itimerspec_t *value,
+ itimerspec_t *ovalue)
+{
+ return (__timer_settime(timerid, flags, value, ovalue));
+}
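
A caller-side sketch of the SIGEV_THREAD path that _timer_create() handles above; behind the scenes the library converts the callback into an event-port notification serviced by a spawner thread (the interval and payload values are invented):

	#include <time.h>
	#include <signal.h>
	#include <string.h>
	#include <stdio.h>

	static void
	tick(union sigval sv)
	{
		(void) printf("tick %d\n", sv.sival_int);
	}

	int
	start_periodic(timer_t *tid)
	{
		struct sigevent se;
		struct itimerspec its;

		(void) memset(&se, 0, sizeof (se));
		se.sigev_notify = SIGEV_THREAD;
		se.sigev_notify_function = tick;
		se.sigev_notify_attributes = NULL;
		se.sigev_value.sival_int = 7;

		if (timer_create(CLOCK_REALTIME, &se, tid) != 0)
			return (-1);
		its.it_value.tv_sec = 1;	/* first expiry after 1s */
		its.it_value.tv_nsec = 0;
		its.it_interval = its.it_value;	/* then once per second */
		return (timer_settime(*tid, 0, &its, NULL));
	}
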
+
+/*
+ * Cleanup after fork1() in the child process.
+ */
+void
+postfork1_child_sigev_timer(void)
+{
+ thread_communication_data_t *tcdp;
+ int timer;
+
+ for (timer = 0; timer < timer_max; timer++) {
+ if ((tcdp = timer_tcd[timer]) != NULL) {
+ timer_tcd[timer] = NULL;
+ tcd_teardown(tcdp);
+ }
+ }
+}
diff --git a/usr/src/lib/libc/port/rt/fallocate.c b/usr/src/lib/libc/port/rt/fallocate.c
new file mode 100644
index 0000000000..17b9088052
--- /dev/null
+++ b/usr/src/lib/libc/port/rt/fallocate.c
@@ -0,0 +1,72 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include "synonyms.h"
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/types.h>
+
+#include <stdio.h>
+
+int
+posix_fallocate(int fd, off_t offset, off_t len)
+{
+ struct flock lck;
+
+ lck.l_whence = 0;
+ lck.l_start = offset;
+ lck.l_len = len;
+ lck.l_type = F_WRLCK;
+
+ if (fcntl(fd, F_ALLOCSP, &lck) == -1) {
+ return (-1);
+ }
+
+ return (0);
+}
+
+#if !defined(_LP64)
+
+int
+posix_fallocate64(int fd, off64_t offset, off64_t len)
+{
+ struct flock64 lck;
+
+ lck.l_whence = 0;
+ lck.l_start = offset;
+ lck.l_len = len;
+ lck.l_type = F_WRLCK;
+
+ if (fcntl(fd, F_ALLOCSP64, &lck) == -1) {
+ return (-1);
+ }
+
+ return (0);
+}
+
+#endif
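
Usage sketch: since both variants above reserve space via fcntl(F_ALLOCSP), a caller can preallocate backing store before writing. The path and size are invented, and note that this implementation reports failure as -1 with errno set:

	#include <fcntl.h>
	#include <unistd.h>

	int
	make_file(const char *path)	/* path is hypothetical */
	{
		int fd = open(path, O_CREAT | O_RDWR, 0644);

		if (fd < 0)
			return (-1);
		if (posix_fallocate(fd, 0, 1024 * 1024) != 0) {
			(void) close(fd);
			return (-1);
		}
		return (fd);	/* 1 MiB of backing store is reserved */
	}
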
diff --git a/usr/src/lib/libc/port/rt/mqueue.c b/usr/src/lib/libc/port/rt/mqueue.c
new file mode 100644
index 0000000000..ebab58a259
--- /dev/null
+++ b/usr/src/lib/libc/port/rt/mqueue.c
@@ -0,0 +1,1101 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#pragma weak mq_open = _mq_open
+#pragma weak mq_close = _mq_close
+#pragma weak mq_unlink = _mq_unlink
+#pragma weak mq_send = _mq_send
+#pragma weak mq_timedsend = _mq_timedsend
+#pragma weak mq_reltimedsend_np = _mq_reltimedsend_np
+#pragma weak mq_receive = _mq_receive
+#pragma weak mq_timedreceive = _mq_timedreceive
+#pragma weak mq_reltimedreceive_np = _mq_reltimedreceive_np
+#pragma weak mq_notify = _mq_notify
+#pragma weak mq_setattr = _mq_setattr
+#pragma weak mq_getattr = _mq_getattr
+
+#include "synonyms.h"
+#include "mtlib.h"
+#define _KMEMUSER
+#include <sys/param.h> /* _MQ_OPEN_MAX, _MQ_PRIO_MAX, _SEM_VALUE_MAX */
+#undef _KMEMUSER
+#include <mqueue.h>
+#include <sys/types.h>
+#include <sys/file.h>
+#include <sys/mman.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <limits.h>
+#include <pthread.h>
+#include <assert.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <inttypes.h>
+#include "sigev_thread.h"
+#include "pos4obj.h"
+
+/*
+ * Default values per message queue
+ */
+#define MQ_MAXMSG 128
+#define MQ_MAXSIZE 1024
+
+#define MQ_MAGIC 0x4d534751 /* "MSGQ" */
+
+/*
+ * Message header which is part of messages in link list
+ */
+typedef struct {
+ uint64_t msg_next; /* offset of next message in the link */
+ uint64_t msg_len; /* length of the message */
+} msghdr_t;
+
+/*
+ * message queue description
+ */
+struct mq_dn {
+ size_t mqdn_flags; /* open description flags */
+};
+
+/*
+ * message queue descriptor structure
+ */
+typedef struct mq_des {
+ struct mq_des *mqd_next; /* list of all open mq descriptors, */
+ struct mq_des *mqd_prev; /* needed for fork-safety */
+ int mqd_magic; /* magic # to identify mq_des */
+ int mqd_flags; /* operation flag per open */
+ struct mq_header *mqd_mq; /* address pointer of message Q */
+ struct mq_dn *mqd_mqdn; /* open description */
+ thread_communication_data_t *mqd_tcd; /* SIGEV_THREAD notification */
+} mqdes_t;
+
+/*
+ * message queue common header, part of the mmap()ed file.
+ * Since message queues may be shared between 32- and 64-bit processes,
+ * care must be taken to make sure that the elements of this structure
+ * are identical for both _LP64 and _ILP32 cases.
+ */
+typedef struct mq_header {
+ /* first field must be mq_totsize, DO NOT insert before this */
+ int64_t mq_totsize; /* total size of the Queue */
+ int64_t mq_maxsz; /* max size of each message */
+ uint32_t mq_maxmsg; /* max messages in the queue */
+ uint32_t mq_maxprio; /* maximum mqueue priority */
+ uint32_t mq_curmaxprio; /* current maximum MQ priority */
+ uint32_t mq_mask; /* priority bitmask */
+ uint64_t mq_freep; /* free message's head pointer */
+ uint64_t mq_headpp; /* pointer to head pointers */
+ uint64_t mq_tailpp; /* pointer to tail pointers */
+ signotify_id_t mq_sigid; /* notification id (3 int's) */
+ uint32_t mq_ntype; /* notification type (SIGEV_*) */
+ uint64_t mq_des; /* pointer to msg Q descriptor */
+ mutex_t mq_exclusive; /* acquire for exclusive access */
+ sem_t mq_rblocked; /* number of processes rblocked */
+ sem_t mq_notfull; /* mq_send()'s block on this */
+ sem_t mq_notempty; /* mq_receive()'s block on this */
+ sem_t mq_spawner; /* spawner thread blocks on this */
+} mqhdr_t;
+
+/*
+ * The code assumes that _MQ_OPEN_MAX == -1, that is, "no fixed implementation limit".
+ * If this assumption is somehow invalidated, mq_open() needs to be changed
+ * back to the old version which kept a count and enforced a limit.
+ * We make sure that this is pointed out to those changing <sys/param.h>
+ * by checking _MQ_OPEN_MAX at compile time.
+ */
+#if _MQ_OPEN_MAX != -1
+#error "mq_open() no longer enforces _MQ_OPEN_MAX and needs fixing."
+#endif
+
+#define MQ_ALIGNSIZE 8 /* 64-bit alignment */
+
+#ifdef DEBUG
+#define MQ_ASSERT(x) assert(x);
+
+#define MQ_ASSERT_PTR(_m, _p) \
+ assert((_p) != NULL && !((uintptr_t)(_p) & (MQ_ALIGNSIZE -1)) && \
+ !((uintptr_t)_m + (uintptr_t)(_p) >= (uintptr_t)_m + \
+ _m->mq_totsize));
+
+#define MQ_ASSERT_SEMVAL_LEQ(sem, val) { \
+ int _val; \
+ (void) sem_getvalue((sem), &_val); \
+ assert((_val) <= val); }
+#else
+#define MQ_ASSERT(x)
+#define MQ_ASSERT_PTR(_m, _p)
+#define MQ_ASSERT_SEMVAL_LEQ(sem, val)
+#endif
+
+#define MQ_PTR(m, n) ((msghdr_t *)((uintptr_t)m + (uintptr_t)n))
+#define HEAD_PTR(m, n) ((uint64_t *)((uintptr_t)m + \
+ (uintptr_t)m->mq_headpp + n * sizeof (uint64_t)))
+#define TAIL_PTR(m, n) ((uint64_t *)((uintptr_t)m + \
+ (uintptr_t)m->mq_tailpp + n * sizeof (uint64_t)))
+
+#define MQ_RESERVED ((mqdes_t *)-1)
+
+#define ABS_TIME 0
+#define REL_TIME 1
+
+static mutex_t mq_list_lock = DEFAULTMUTEX;
+static mqdes_t *mq_list = NULL;
+
+extern int __signotify(int cmd, siginfo_t *siginfo, signotify_id_t *sn_id);
+
+static int
+mq_is_valid(mqdes_t *mqdp)
+{
+ /*
+	 * Any use of a message queue after it has been closed is
+	 * undefined, but the standard strongly favours an EBADF
+	 * return. Before we dereference the descriptor, which could
+	 * be fatal, we first do some pointer sanity checks.
+ */
+ if (mqdp != NULL && mqdp != MQ_RESERVED &&
+ ((uintptr_t)mqdp & 0x7) == 0) {
+ return (mqdp->mqd_magic == MQ_MAGIC);
+ }
+
+ return (0);
+}
+
+static void
+mq_init(mqhdr_t *mqhp, size_t msgsize, ssize_t maxmsg)
+{
+ int i;
+ uint64_t temp;
+ uint64_t currentp;
+ uint64_t nextp;
+
+ /*
+ * We only need to initialize the non-zero fields. The use of
+ * ftruncate() on the message queue file assures that the
+ * pages will be zfod.
+ */
+ (void) mutex_init(&mqhp->mq_exclusive, USYNC_PROCESS, NULL);
+ (void) sem_init(&mqhp->mq_rblocked, 1, 0);
+ (void) sem_init(&mqhp->mq_notempty, 1, 0);
+ (void) sem_init(&mqhp->mq_spawner, 1, 0);
+ (void) sem_init(&mqhp->mq_notfull, 1, (uint_t)maxmsg);
+
+ mqhp->mq_maxsz = msgsize;
+ mqhp->mq_maxmsg = maxmsg;
+
+ /*
+ * As of this writing (1997), there are 32 message queue priorities.
+ * If this is to change, then the size of the mq_mask will
+ * also have to change. If DEBUG is defined, assert that
+ * _MQ_PRIO_MAX hasn't changed.
+ */
+ mqhp->mq_maxprio = _MQ_PRIO_MAX;
+#if defined(DEBUG)
+ /* LINTED always true */
+ MQ_ASSERT(sizeof (mqhp->mq_mask) * 8 >= _MQ_PRIO_MAX);
+#endif
+
+ /*
+ * Since the message queue can be mapped into different
+ * virtual address ranges by different processes, we don't
+ * keep track of pointers, only offsets into the shared region.
+ */
+ mqhp->mq_headpp = sizeof (mqhdr_t);
+ mqhp->mq_tailpp = mqhp->mq_headpp +
+ mqhp->mq_maxprio * sizeof (uint64_t);
+ mqhp->mq_freep = mqhp->mq_tailpp +
+ mqhp->mq_maxprio * sizeof (uint64_t);
+
+ currentp = mqhp->mq_freep;
+ MQ_PTR(mqhp, currentp)->msg_next = 0;
+
+ temp = (mqhp->mq_maxsz + MQ_ALIGNSIZE - 1) & ~(MQ_ALIGNSIZE - 1);
+ for (i = 1; i < mqhp->mq_maxmsg; i++) {
+ nextp = currentp + sizeof (msghdr_t) + temp;
+ MQ_PTR(mqhp, currentp)->msg_next = nextp;
+ MQ_PTR(mqhp, nextp)->msg_next = 0;
+ currentp = nextp;
+ }
+}
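+
+/*
+ * For illustration, the shared region laid out by mq_init() above
+ * (a sketch, not normative; all values are offsets from the start
+ * of the mapping, never pointers):
+ *
+ *	offset 0	mqhdr_t (this header)
+ *	mq_headpp	mq_maxprio head offsets, one uint64_t each
+ *	mq_tailpp	mq_maxprio tail offsets, one uint64_t each
+ *	mq_freep	mq_maxmsg message slots, each a msghdr_t
+ *			followed by mq_maxsz bytes of payload,
+ *			rounded up to MQ_ALIGNSIZE
+ */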
+
+static size_t
+mq_getmsg(mqhdr_t *mqhp, char *msgp, uint_t *msg_prio)
+{
+ uint64_t currentp;
+ msghdr_t *curbuf;
+ uint64_t *headpp;
+ uint64_t *tailpp;
+
+ MQ_ASSERT(MUTEX_HELD(&mqhp->mq_exclusive));
+
+ /*
+ * Get the head and tail pointers for the queue of maximum
+ * priority. We shouldn't be here unless there is a message for
+ * us, so it's fair to assert that both the head and tail
+ * pointers are non-NULL.
+ */
+ headpp = HEAD_PTR(mqhp, mqhp->mq_curmaxprio);
+ tailpp = TAIL_PTR(mqhp, mqhp->mq_curmaxprio);
+
+ if (msg_prio != NULL)
+ *msg_prio = mqhp->mq_curmaxprio;
+
+ currentp = *headpp;
+ MQ_ASSERT_PTR(mqhp, currentp);
+ curbuf = MQ_PTR(mqhp, currentp);
+
+ if ((*headpp = curbuf->msg_next) == NULL) {
+ /*
+ * We just nuked the last message in this priority's queue.
+ * Twiddle this priority's bit, and then find the next bit
+ * tipped.
+ */
+ uint_t prio = mqhp->mq_curmaxprio;
+
+ mqhp->mq_mask &= ~(1u << prio);
+
+ for (; prio != 0; prio--)
+ if (mqhp->mq_mask & (1u << prio))
+ break;
+ mqhp->mq_curmaxprio = prio;
+
+ *tailpp = NULL;
+ }
+
+ /*
+ * Copy the message, and put the buffer back on the free list.
+ */
+ (void) memcpy(msgp, (char *)&curbuf[1], curbuf->msg_len);
+ curbuf->msg_next = mqhp->mq_freep;
+ mqhp->mq_freep = currentp;
+
+ return (curbuf->msg_len);
+}
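+
+/*
+ * To illustrate the mask bookkeeping above, a standalone sketch (not
+ * part of libc; the priority values are invented for the example):
+ *
+ *	unsigned int mask = (1u << 9) | (1u << 4);	messages at 9, 4
+ *	unsigned int prio = 9;				queue 9 just emptied
+ *
+ *	mask &= ~(1u << prio);
+ *	for (; prio != 0; prio--)
+ *		if (mask & (1u << prio))
+ *			break;
+ *
+ * The scan leaves prio == 4, the new mq_curmaxprio; had the mask been
+ * empty, the loop would have fallen through to priority 0.
+ */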
+
+
+static void
+mq_putmsg(mqhdr_t *mqhp, const char *msgp, ssize_t len, uint_t prio)
+{
+ uint64_t currentp;
+ msghdr_t *curbuf;
+ uint64_t *headpp;
+ uint64_t *tailpp;
+
+ MQ_ASSERT(MUTEX_HELD(&mqhp->mq_exclusive));
+
+ /*
+ * Grab a free message block, and link it in. We shouldn't
+ * be here unless there is room in the queue for us; it's
+ * fair to assert that the free pointer is non-NULL.
+ */
+ currentp = mqhp->mq_freep;
+ MQ_ASSERT_PTR(mqhp, currentp);
+ curbuf = MQ_PTR(mqhp, currentp);
+
+ /*
+ * Remove a message from the free list, and copy in the new contents.
+ */
+ mqhp->mq_freep = curbuf->msg_next;
+ curbuf->msg_next = NULL;
+ (void) memcpy((char *)&curbuf[1], msgp, len);
+ curbuf->msg_len = len;
+
+ headpp = HEAD_PTR(mqhp, prio);
+ tailpp = TAIL_PTR(mqhp, prio);
+
+ if (*tailpp == 0) {
+ /*
+ * This is the first message on this queue. Set the
+ * head and tail pointers, and tip the appropriate bit
+ * in the priority mask.
+ */
+ *headpp = currentp;
+ *tailpp = currentp;
+ mqhp->mq_mask |= (1u << prio);
+ if (prio > mqhp->mq_curmaxprio)
+ mqhp->mq_curmaxprio = prio;
+ } else {
+ MQ_ASSERT_PTR(mqhp, *tailpp);
+ MQ_PTR(mqhp, *tailpp)->msg_next = currentp;
+ *tailpp = currentp;
+ }
+}
+
+mqd_t
+_mq_open(const char *path, int oflag, /* mode_t mode, mq_attr *attr */ ...)
+{
+ va_list ap;
+ mode_t mode;
+ struct mq_attr *attr;
+ int fd;
+ int err;
+ int cr_flag = 0;
+ int locked = 0;
+ uint64_t total_size;
+ size_t msgsize;
+ ssize_t maxmsg;
+ uint64_t temp;
+ void *ptr;
+ mqdes_t *mqdp;
+ mqhdr_t *mqhp;
+ struct mq_dn *mqdnp;
+
+ if (__pos4obj_check(path) == -1)
+ return ((mqd_t)-1);
+
+ /* acquire MSGQ lock to have atomic operation */
+ if (__pos4obj_lock(path, MQ_LOCK_TYPE) < 0)
+ goto out;
+ locked = 1;
+
+ va_start(ap, oflag);
+ /* filter oflag to have READ/WRITE/CREATE modes only */
+ oflag = oflag & (O_RDONLY|O_WRONLY|O_RDWR|O_CREAT|O_EXCL|O_NONBLOCK);
+ if ((oflag & O_CREAT) != 0) {
+ mode = va_arg(ap, mode_t);
+ attr = va_arg(ap, struct mq_attr *);
+ }
+ va_end(ap);
+
+ if ((fd = __pos4obj_open(path, MQ_PERM_TYPE, oflag,
+ mode, &cr_flag)) < 0)
+ goto out;
+
+ /* closing permission file */
+ (void) __close_nc(fd);
+
+ /* Try to open/create data file */
+ if (cr_flag) {
+ cr_flag = PFILE_CREATE;
+ if (attr == NULL) {
+ maxmsg = MQ_MAXMSG;
+ msgsize = MQ_MAXSIZE;
+ } else if (attr->mq_maxmsg <= 0 || attr->mq_msgsize <= 0) {
+ errno = EINVAL;
+ goto out;
+ } else if (attr->mq_maxmsg > _SEM_VALUE_MAX) {
+ errno = ENOSPC;
+ goto out;
+ } else {
+ maxmsg = attr->mq_maxmsg;
+ msgsize = attr->mq_msgsize;
+ }
+
+ /* adjust for message size at word boundary */
+ temp = (msgsize + MQ_ALIGNSIZE - 1) & ~(MQ_ALIGNSIZE - 1);
+
+ total_size = sizeof (mqhdr_t) +
+ maxmsg * (temp + sizeof (msghdr_t)) +
+ 2 * _MQ_PRIO_MAX * sizeof (uint64_t);
+
+ if (total_size > SSIZE_MAX) {
+ errno = ENOSPC;
+ goto out;
+ }
+
+ /*
+		 * the data file is opened read/write for anyone who
+		 * has either read or write permission
+ */
+ mode = mode | (mode & 0444) >> 1 | (mode & 0222) << 1;
+ if ((fd = __pos4obj_open(path, MQ_DATA_TYPE,
+ (O_RDWR|O_CREAT|O_EXCL), mode, &err)) < 0)
+ goto out;
+
+ cr_flag |= DFILE_CREATE | DFILE_OPEN;
+
+ /* force permissions to avoid umask effect */
+ if (fchmod(fd, mode) < 0)
+ goto out;
+
+ if (ftruncate64(fd, (off64_t)total_size) < 0)
+ goto out;
+ } else {
+ if ((fd = __pos4obj_open(path, MQ_DATA_TYPE,
+ O_RDWR, 0666, &err)) < 0)
+ goto out;
+ cr_flag = DFILE_OPEN;
+
+ /* Message queue has not been initialized yet */
+ if (read(fd, &total_size, sizeof (total_size)) !=
+ sizeof (total_size) || total_size == 0) {
+ errno = ENOENT;
+ goto out;
+ }
+
+ /* Message queue too big for this process to handle */
+ if (total_size > SSIZE_MAX) {
+ errno = EFBIG;
+ goto out;
+ }
+ }
+
+ if ((mqdp = (mqdes_t *)malloc(sizeof (mqdes_t))) == NULL) {
+ errno = ENOMEM;
+ goto out;
+ }
+ cr_flag |= ALLOC_MEM;
+
+ if ((ptr = mmap64(NULL, total_size, PROT_READ|PROT_WRITE,
+ MAP_SHARED, fd, (off64_t)0)) == MAP_FAILED)
+ goto out;
+ mqhp = ptr;
+ cr_flag |= DFILE_MMAP;
+
+ /* closing data file */
+ (void) __close_nc(fd);
+ cr_flag &= ~DFILE_OPEN;
+
+ /*
+ * create, unlink, size, mmap, and close description file
+ * all for a flag word in anonymous shared memory
+ */
+ if ((fd = __pos4obj_open(path, MQ_DSCN_TYPE, O_RDWR | O_CREAT,
+ 0666, &err)) < 0)
+ goto out;
+ cr_flag |= DFILE_OPEN;
+ (void) __pos4obj_unlink(path, MQ_DSCN_TYPE);
+ if (ftruncate64(fd, (off64_t)sizeof (struct mq_dn)) < 0)
+ goto out;
+
+ if ((ptr = mmap64(NULL, sizeof (struct mq_dn),
+ PROT_READ | PROT_WRITE, MAP_SHARED, fd, (off64_t)0)) == MAP_FAILED)
+ goto out;
+ mqdnp = ptr;
+ cr_flag |= MQDNP_MMAP;
+
+ (void) __close_nc(fd);
+ cr_flag &= ~DFILE_OPEN;
+
+ /*
+	 * we follow the same strategy as the filesystem open() routine,
+	 * where fcntl.h flags are converted to the flags defined in file.h.
+ */
+ mqdp->mqd_flags = (oflag - FOPEN) & (FREAD|FWRITE);
+ mqdnp->mqdn_flags = (oflag - FOPEN) & (FNONBLOCK);
+
+ /* new message queue requires initialization */
+ if ((cr_flag & DFILE_CREATE) != 0) {
+ /* message queue header has to be initialized */
+ mq_init(mqhp, msgsize, maxmsg);
+ mqhp->mq_totsize = total_size;
+ }
+ mqdp->mqd_mq = mqhp;
+ mqdp->mqd_mqdn = mqdnp;
+ mqdp->mqd_magic = MQ_MAGIC;
+ mqdp->mqd_tcd = NULL;
+ if (__pos4obj_unlock(path, MQ_LOCK_TYPE) == 0) {
+ lmutex_lock(&mq_list_lock);
+ mqdp->mqd_next = mq_list;
+ mqdp->mqd_prev = NULL;
+ if (mq_list)
+ mq_list->mqd_prev = mqdp;
+ mq_list = mqdp;
+ lmutex_unlock(&mq_list_lock);
+ return ((mqd_t)mqdp);
+ }
+
+ locked = 0; /* fall into the error case */
+out:
+ err = errno;
+ if ((cr_flag & DFILE_OPEN) != 0)
+ (void) __close_nc(fd);
+ if ((cr_flag & DFILE_CREATE) != 0)
+ (void) __pos4obj_unlink(path, MQ_DATA_TYPE);
+ if ((cr_flag & PFILE_CREATE) != 0)
+ (void) __pos4obj_unlink(path, MQ_PERM_TYPE);
+ if ((cr_flag & ALLOC_MEM) != 0)
+ free((void *)mqdp);
+ if ((cr_flag & DFILE_MMAP) != 0)
+ (void) munmap((caddr_t)mqhp, (size_t)total_size);
+ if ((cr_flag & MQDNP_MMAP) != 0)
+ (void) munmap((caddr_t)mqdnp, sizeof (struct mq_dn));
+ if (locked)
+ (void) __pos4obj_unlock(path, MQ_LOCK_TYPE);
+ errno = err;
+ return ((mqd_t)-1);
+}
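+
+/*
+ * A minimal usage sketch of the interface above (illustrative only,
+ * not part of this file; the queue name "/myq" is invented):
+ *
+ *	#include <mqueue.h>
+ *	#include <fcntl.h>
+ *	#include <stdio.h>
+ *
+ *	mqd_t mqd = mq_open("/myq", O_CREAT | O_EXCL | O_RDWR, 0644, NULL);
+ *	if (mqd == (mqd_t)-1)
+ *		perror("mq_open");
+ *
+ * A NULL attribute pointer with O_CREAT selects the defaults above:
+ * MQ_MAXMSG (128) messages of at most MQ_MAXSIZE (1024) bytes.
+ */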
+
+static void
+mq_close_cleanup(mqdes_t *mqdp)
+{
+ mqhdr_t *mqhp = mqdp->mqd_mq;
+ struct mq_dn *mqdnp = mqdp->mqd_mqdn;
+
+ /* invalidate the descriptor before freeing it */
+ mqdp->mqd_magic = 0;
+ (void) mutex_unlock(&mqhp->mq_exclusive);
+
+ lmutex_lock(&mq_list_lock);
+ if (mqdp->mqd_next)
+ mqdp->mqd_next->mqd_prev = mqdp->mqd_prev;
+ if (mqdp->mqd_prev)
+ mqdp->mqd_prev->mqd_next = mqdp->mqd_next;
+ if (mq_list == mqdp)
+ mq_list = mqdp->mqd_next;
+ lmutex_unlock(&mq_list_lock);
+
+ free(mqdp);
+ (void) munmap((caddr_t)mqdnp, sizeof (struct mq_dn));
+ (void) munmap((caddr_t)mqhp, (size_t)mqhp->mq_totsize);
+}
+
+int
+_mq_close(mqd_t mqdes)
+{
+ mqdes_t *mqdp = (mqdes_t *)mqdes;
+ mqhdr_t *mqhp;
+ thread_communication_data_t *tcdp;
+
+ if (!mq_is_valid(mqdp)) {
+ errno = EBADF;
+ return (-1);
+ }
+
+ mqhp = mqdp->mqd_mq;
+ (void) mutex_lock(&mqhp->mq_exclusive);
+
+ if (mqhp->mq_des == (uintptr_t)mqdp &&
+ mqhp->mq_sigid.sn_pid == getpid()) {
+ /* notification is set for this descriptor, remove it */
+ (void) __signotify(SN_CANCEL, NULL, &mqhp->mq_sigid);
+ mqhp->mq_ntype = 0;
+ mqhp->mq_des = 0;
+ }
+
+ pthread_cleanup_push(mq_close_cleanup, mqdp);
+ if ((tcdp = mqdp->mqd_tcd) != NULL) {
+ mqdp->mqd_tcd = NULL;
+ del_sigev_mq(tcdp); /* possible cancellation point */
+ }
+ pthread_cleanup_pop(1); /* finish in the cleanup handler */
+
+ return (0);
+}
+
+int
+_mq_unlink(const char *path)
+{
+ int err;
+
+ if (__pos4obj_check(path) < 0)
+ return (-1);
+
+ if (__pos4obj_lock(path, MQ_LOCK_TYPE) < 0) {
+ return (-1);
+ }
+
+ err = __pos4obj_unlink(path, MQ_PERM_TYPE);
+
+ if (err == 0 || (err == -1 && errno == EEXIST)) {
+ errno = 0;
+ err = __pos4obj_unlink(path, MQ_DATA_TYPE);
+ }
+
+ if (__pos4obj_unlock(path, MQ_LOCK_TYPE) < 0)
+ return (-1);
+
+ return (err);
+
+}
+
+static int
+__mq_timedsend(mqd_t mqdes, const char *msg_ptr, size_t msg_len,
+ uint_t msg_prio, const timespec_t *timeout, int abs_rel)
+{
+ mqdes_t *mqdp = (mqdes_t *)mqdes;
+ mqhdr_t *mqhp;
+ int err;
+ int notify = 0;
+
+ /*
+	 * sem_*wait() acts as a cancellation point, if called.
+ * pthread_testcancel() ensures that cancellation takes place if
+ * there is a cancellation pending when mq_*send() is called.
+ */
+ pthread_testcancel();
+
+ if (!mq_is_valid(mqdp) || (mqdp->mqd_flags & FWRITE) == 0) {
+ errno = EBADF;
+ return (-1);
+ }
+
+ mqhp = mqdp->mqd_mq;
+
+ if (msg_prio >= mqhp->mq_maxprio) {
+ errno = EINVAL;
+ return (-1);
+ }
+ if (msg_len > mqhp->mq_maxsz) {
+ errno = EMSGSIZE;
+ return (-1);
+ }
+
+ if (mqdp->mqd_mqdn->mqdn_flags & O_NONBLOCK)
+ err = sem_trywait(&mqhp->mq_notfull);
+ else {
+ /*
+ * We might get cancelled here...
+ */
+ if (timeout == NULL)
+ err = sem_wait(&mqhp->mq_notfull);
+ else if (abs_rel == ABS_TIME)
+ err = sem_timedwait(&mqhp->mq_notfull, timeout);
+ else
+ err = sem_reltimedwait_np(&mqhp->mq_notfull, timeout);
+ }
+ if (err == -1) {
+ /*
+ * errno has been set to EAGAIN / EINTR / ETIMEDOUT
+ * by sem_*wait(), so we can just return.
+ */
+ return (-1);
+ }
+
+ /*
+ * By the time we're here, we know that we've got the capacity
+ * to add to the queue...now acquire the exclusive lock.
+ */
+ (void) mutex_lock(&mqhp->mq_exclusive);
+
+ /*
+ * Now determine if we want to kick the notification. POSIX
+ * requires that if a process has registered for notification,
+ * we must kick it when the queue makes an empty to non-empty
+ * transition, and there are no blocked receivers. Note that
+ * this mechanism does _not_ guarantee that the kicked process
+ * will be able to receive a message without blocking;
+ * another receiver could intervene in the meantime. Thus,
+ * the notification mechanism is inherently racy; all we can
+ * do is hope to minimize the window as much as possible.
+ * In general, we want to avoid kicking the notification when
+ * there are clearly receivers blocked. We'll determine if
+ * we want to kick the notification before the mq_putmsg(),
+ * but the actual signotify() won't be done until the message
+ * is on the queue.
+ */
+ if (mqhp->mq_sigid.sn_pid != 0) {
+ int nmessages, nblocked;
+
+ (void) sem_getvalue(&mqhp->mq_notempty, &nmessages);
+ (void) sem_getvalue(&mqhp->mq_rblocked, &nblocked);
+
+ if (nmessages == 0 && nblocked == 0)
+ notify = 1;
+ }
+
+ mq_putmsg(mqhp, msg_ptr, (ssize_t)msg_len, msg_prio);
+ (void) sem_post(&mqhp->mq_notempty);
+
+ if (notify) {
+ /* notify and also delete the registration */
+ (void) __signotify(SN_SEND, NULL, &mqhp->mq_sigid);
+ if (mqhp->mq_ntype == SIGEV_THREAD ||
+ mqhp->mq_ntype == SIGEV_PORT)
+ (void) sem_post(&mqhp->mq_spawner);
+ mqhp->mq_ntype = 0;
+ mqhp->mq_des = 0;
+ }
+
+ MQ_ASSERT_SEMVAL_LEQ(&mqhp->mq_notempty, ((int)mqhp->mq_maxmsg));
+ (void) mutex_unlock(&mqhp->mq_exclusive);
+
+ return (0);
+}
+
+int
+_mq_send(mqd_t mqdes, const char *msg_ptr, size_t msg_len, uint_t msg_prio)
+{
+ return (__mq_timedsend(mqdes, msg_ptr, msg_len, msg_prio,
+ NULL, ABS_TIME));
+}
+
+int
+_mq_timedsend(mqd_t mqdes, const char *msg_ptr, size_t msg_len,
+ uint_t msg_prio, const timespec_t *abs_timeout)
+{
+ return (__mq_timedsend(mqdes, msg_ptr, msg_len, msg_prio,
+ abs_timeout, ABS_TIME));
+}
+
+int
+_mq_reltimedsend_np(mqd_t mqdes, const char *msg_ptr, size_t msg_len,
+ uint_t msg_prio, const timespec_t *rel_timeout)
+{
+ return (__mq_timedsend(mqdes, msg_ptr, msg_len, msg_prio,
+ rel_timeout, REL_TIME));
+}
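+
+/*
+ * Usage sketch (illustrative only; mqd, buf and len are assumed to
+ * have been set up by the caller): sending with a five-second
+ * absolute timeout.  mq_timedsend() measures the timespec against
+ * CLOCK_REALTIME; the _np variant takes a relative interval instead.
+ *
+ *	#include <mqueue.h>
+ *	#include <time.h>
+ *	#include <errno.h>
+ *
+ *	struct timespec ts;
+ *
+ *	(void) clock_gettime(CLOCK_REALTIME, &ts);
+ *	ts.tv_sec += 5;
+ *	if (mq_timedsend(mqd, buf, len, 0, &ts) == -1 &&
+ *	    errno == ETIMEDOUT)
+ *		... the queue stayed full for the whole five seconds ...
+ */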
+
+static void
+decrement_rblocked(mqhdr_t *mqhp)
+{
+ int canstate;
+
+ (void) pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &canstate);
+ while (sem_wait(&mqhp->mq_rblocked) == -1)
+ continue;
+ (void) pthread_setcancelstate(canstate, NULL);
+}
+
+static ssize_t
+__mq_timedreceive(mqd_t mqdes, char *msg_ptr, size_t msg_len,
+ uint_t *msg_prio, const timespec_t *timeout, int abs_rel)
+{
+ mqdes_t *mqdp = (mqdes_t *)mqdes;
+ mqhdr_t *mqhp;
+ ssize_t msg_size;
+ int err;
+
+ /*
+	 * sem_*wait() acts as a cancellation point, if called.
+ * pthread_testcancel() ensures that cancellation takes place if
+ * there is a cancellation pending when mq_*receive() is called.
+ */
+ pthread_testcancel();
+
+ if (!mq_is_valid(mqdp) || (mqdp->mqd_flags & FREAD) == 0) {
+ errno = EBADF;
+ return (ssize_t)(-1);
+ }
+
+ mqhp = mqdp->mqd_mq;
+
+ if (msg_len < mqhp->mq_maxsz) {
+ errno = EMSGSIZE;
+ return (ssize_t)(-1);
+ }
+
+ /*
+ * The semaphoring scheme for mq_[timed]receive is a little hairy
+ * thanks to POSIX.1b's arcane notification mechanism. First,
+ * we try to take the common case and do a sem_trywait().
+ * If that doesn't work, and O_NONBLOCK hasn't been set,
+ * then note that we're going to sleep by incrementing the rblocked
+ * semaphore. We decrement that semaphore after waking up.
+ */
+ if (sem_trywait(&mqhp->mq_notempty) == -1) {
+ if ((mqdp->mqd_mqdn->mqdn_flags & O_NONBLOCK) != 0) {
+ /*
+ * errno has been set to EAGAIN or EINTR by
+ * sem_trywait(), so we can just return.
+ */
+ return (-1);
+ }
+ /*
+ * If we're here, then we're probably going to block...
+ * increment the rblocked semaphore. If we get
+ * cancelled, decrement_rblocked() will decrement it.
+ */
+ (void) sem_post(&mqhp->mq_rblocked);
+
+ pthread_cleanup_push(decrement_rblocked, mqhp);
+ if (timeout == NULL)
+ err = sem_wait(&mqhp->mq_notempty);
+ else if (abs_rel == ABS_TIME)
+ err = sem_timedwait(&mqhp->mq_notempty, timeout);
+ else
+ err = sem_reltimedwait_np(&mqhp->mq_notempty, timeout);
+ pthread_cleanup_pop(1);
+
+ if (err == -1) {
+ /*
+ * We took a signal or timeout while waiting
+ * on mq_notempty...
+ */
+ return (-1);
+ }
+ }
+
+ (void) mutex_lock(&mqhp->mq_exclusive);
+ msg_size = mq_getmsg(mqhp, msg_ptr, msg_prio);
+ (void) sem_post(&mqhp->mq_notfull);
+ MQ_ASSERT_SEMVAL_LEQ(&mqhp->mq_notfull, ((int)mqhp->mq_maxmsg));
+ (void) mutex_unlock(&mqhp->mq_exclusive);
+
+ return (msg_size);
+}
+
+ssize_t
+_mq_receive(mqd_t mqdes, char *msg_ptr, size_t msg_len, uint_t *msg_prio)
+{
+ return (__mq_timedreceive(mqdes, msg_ptr, msg_len, msg_prio,
+ NULL, ABS_TIME));
+}
+
+ssize_t
+_mq_timedreceive(mqd_t mqdes, char *msg_ptr, size_t msg_len,
+ uint_t *msg_prio, const timespec_t *abs_timeout)
+{
+ return (__mq_timedreceive(mqdes, msg_ptr, msg_len, msg_prio,
+ abs_timeout, ABS_TIME));
+}
+
+ssize_t
+_mq_reltimedreceive_np(mqd_t mqdes, char *msg_ptr, size_t msg_len,
+ uint_t *msg_prio, const timespec_t *rel_timeout)
+{
+ return (__mq_timedreceive(mqdes, msg_ptr, msg_len, msg_prio,
+ rel_timeout, REL_TIME));
+}
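+
+/*
+ * Usage sketch (illustrative only; mqd is an already-open
+ * descriptor).  The receive buffer must hold at least mq_msgsize
+ * bytes, or the EMSGSIZE check above fails the call:
+ *
+ *	#include <mqueue.h>
+ *	#include <stdlib.h>
+ *
+ *	struct mq_attr attr;
+ *	unsigned int prio;
+ *	char *buf;
+ *	ssize_t n;
+ *
+ *	(void) mq_getattr(mqd, &attr);
+ *	buf = malloc(attr.mq_msgsize);
+ *	n = mq_receive(mqd, buf, attr.mq_msgsize, &prio);
+ */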
+
+/*
+ * Only used below, in _mq_notify().
+ * We already have a spawner thread.
+ * Verify that the attributes match; cancel it if necessary.
+ */
+static int
+cancel_if_necessary(thread_communication_data_t *tcdp,
+ const struct sigevent *sigevp)
+{
+ int do_cancel = !_pthread_attr_equal(tcdp->tcd_attrp,
+ sigevp->sigev_notify_attributes);
+
+ if (do_cancel) {
+ /*
+ * Attributes don't match, cancel the spawner thread.
+ */
+ (void) pthread_cancel(tcdp->tcd_server_id);
+ } else {
+ /*
+ * Reuse the existing spawner thread with possibly
+ * changed notification function and value.
+ */
+ tcdp->tcd_notif.sigev_notify = SIGEV_THREAD;
+ tcdp->tcd_notif.sigev_signo = 0;
+ tcdp->tcd_notif.sigev_value = sigevp->sigev_value;
+ tcdp->tcd_notif.sigev_notify_function =
+ sigevp->sigev_notify_function;
+ }
+
+ return (do_cancel);
+}
+
+int
+_mq_notify(mqd_t mqdes, const struct sigevent *sigevp)
+{
+ mqdes_t *mqdp = (mqdes_t *)mqdes;
+ mqhdr_t *mqhp;
+ thread_communication_data_t *tcdp;
+ siginfo_t mq_siginfo;
+ struct sigevent sigevent;
+ struct stat64 statb;
+ port_notify_t *pn;
+ void *userval;
+ int rval = -1;
+ int ntype;
+ int port;
+
+ if (!mq_is_valid(mqdp)) {
+ errno = EBADF;
+ return (-1);
+ }
+
+ mqhp = mqdp->mqd_mq;
+
+ (void) mutex_lock(&mqhp->mq_exclusive);
+
+ if (sigevp == NULL) { /* remove notification */
+ if (mqhp->mq_des == (uintptr_t)mqdp &&
+ mqhp->mq_sigid.sn_pid == getpid()) {
+ /* notification is set for this descriptor, remove it */
+ (void) __signotify(SN_CANCEL, NULL, &mqhp->mq_sigid);
+ if ((tcdp = mqdp->mqd_tcd) != NULL) {
+ sig_mutex_lock(&tcdp->tcd_lock);
+ if (tcdp->tcd_msg_enabled) {
+ /* cancel the spawner thread */
+ tcdp = mqdp->mqd_tcd;
+ mqdp->mqd_tcd = NULL;
+ (void) pthread_cancel(
+ tcdp->tcd_server_id);
+ }
+ sig_mutex_unlock(&tcdp->tcd_lock);
+ }
+ mqhp->mq_ntype = 0;
+ mqhp->mq_des = 0;
+ } else {
+ /* notification is not set for this descriptor */
+ errno = EBUSY;
+ goto bad;
+ }
+ } else { /* register notification with this process */
+ switch (ntype = sigevp->sigev_notify) {
+ case SIGEV_THREAD:
+ userval = sigevp->sigev_value.sival_ptr;
+ port = -1;
+ break;
+ case SIGEV_PORT:
+ pn = sigevp->sigev_value.sival_ptr;
+ userval = pn->portnfy_user;
+ port = pn->portnfy_port;
+ if (fstat64(port, &statb) != 0 ||
+ !S_ISPORT(statb.st_mode)) {
+ errno = EBADF;
+ goto bad;
+ }
+ (void) memset(&sigevent, 0, sizeof (sigevent));
+ sigevent.sigev_notify = SIGEV_PORT;
+ sigevp = &sigevent;
+ break;
+ }
+ switch (ntype) {
+ case SIGEV_NONE:
+ mq_siginfo.si_signo = 0;
+ mq_siginfo.si_code = SI_MESGQ;
+ break;
+ case SIGEV_SIGNAL:
+ mq_siginfo.si_signo = sigevp->sigev_signo;
+ mq_siginfo.si_value = sigevp->sigev_value;
+ mq_siginfo.si_code = SI_MESGQ;
+ break;
+ case SIGEV_THREAD:
+ if ((tcdp = mqdp->mqd_tcd) != NULL &&
+ cancel_if_necessary(tcdp, sigevp))
+ mqdp->mqd_tcd = NULL;
+ /* FALLTHROUGH */
+ case SIGEV_PORT:
+ if ((tcdp = mqdp->mqd_tcd) == NULL) {
+ /* we must create a spawner thread */
+ tcdp = setup_sigev_handler(sigevp, MQ);
+ if (tcdp == NULL) {
+ errno = EBADF;
+ goto bad;
+ }
+ tcdp->tcd_msg_enabled = 0;
+ tcdp->tcd_msg_closing = 0;
+ tcdp->tcd_msg_avail = &mqhp->mq_spawner;
+ if (launch_spawner(tcdp) != 0) {
+ free_sigev_handler(tcdp);
+ goto bad;
+ }
+ mqdp->mqd_tcd = tcdp;
+ }
+ mq_siginfo.si_signo = 0;
+ mq_siginfo.si_code = SI_MESGQ;
+ break;
+ default:
+ errno = EINVAL;
+ goto bad;
+ }
+
+ /* register notification */
+ if (__signotify(SN_PROC, &mq_siginfo, &mqhp->mq_sigid) < 0)
+ goto bad;
+ mqhp->mq_ntype = ntype;
+ mqhp->mq_des = (uintptr_t)mqdp;
+ switch (ntype) {
+ case SIGEV_THREAD:
+ case SIGEV_PORT:
+ tcdp->tcd_port = port;
+ tcdp->tcd_msg_object = mqdp;
+ tcdp->tcd_msg_userval = userval;
+ sig_mutex_lock(&tcdp->tcd_lock);
+ tcdp->tcd_msg_enabled = ntype;
+ sig_mutex_unlock(&tcdp->tcd_lock);
+ (void) cond_broadcast(&tcdp->tcd_cv);
+ break;
+ }
+ }
+
+ rval = 0; /* success */
+bad:
+ (void) mutex_unlock(&mqhp->mq_exclusive);
+ return (rval);
+}
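+
+/*
+ * Usage sketch for SIGEV_THREAD notification (illustrative only; the
+ * callback name is invented).  A registration is consumed when it
+ * fires, so a persistent listener must re-register from the callback:
+ *
+ *	static struct sigevent sev;	visible to the callback
+ *	static mqd_t mqd;		opened elsewhere
+ *
+ *	static void
+ *	drained(union sigval sv)
+ *	{
+ *		... drain the queue with mq_receive() ...
+ *		(void) mq_notify(*(mqd_t *)sv.sival_ptr, &sev);
+ *	}
+ *
+ *	sev.sigev_notify = SIGEV_THREAD;
+ *	sev.sigev_notify_function = drained;
+ *	sev.sigev_notify_attributes = NULL;
+ *	sev.sigev_value.sival_ptr = &mqd;
+ *	(void) mq_notify(mqd, &sev);
+ */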
+
+int
+_mq_setattr(mqd_t mqdes, const struct mq_attr *mqstat, struct mq_attr *omqstat)
+{
+ mqdes_t *mqdp = (mqdes_t *)mqdes;
+ mqhdr_t *mqhp;
+ uint_t flag = 0;
+
+ if (!mq_is_valid(mqdp)) {
+ errno = EBADF;
+ return (-1);
+ }
+
+ /* store current attributes */
+ if (omqstat != NULL) {
+ int count;
+
+ mqhp = mqdp->mqd_mq;
+ omqstat->mq_flags = mqdp->mqd_mqdn->mqdn_flags;
+ omqstat->mq_maxmsg = (long)mqhp->mq_maxmsg;
+ omqstat->mq_msgsize = (long)mqhp->mq_maxsz;
+ (void) sem_getvalue(&mqhp->mq_notempty, &count);
+ omqstat->mq_curmsgs = count;
+ }
+
+ /* set description attributes */
+ if ((mqstat->mq_flags & O_NONBLOCK) != 0)
+ flag = FNONBLOCK;
+ mqdp->mqd_mqdn->mqdn_flags = flag;
+
+ return (0);
+}
+
+int
+_mq_getattr(mqd_t mqdes, struct mq_attr *mqstat)
+{
+ mqdes_t *mqdp = (mqdes_t *)mqdes;
+ mqhdr_t *mqhp;
+ int count;
+
+ if (!mq_is_valid(mqdp)) {
+ errno = EBADF;
+ return (-1);
+ }
+
+ mqhp = mqdp->mqd_mq;
+
+ mqstat->mq_flags = mqdp->mqd_mqdn->mqdn_flags;
+ mqstat->mq_maxmsg = (long)mqhp->mq_maxmsg;
+ mqstat->mq_msgsize = (long)mqhp->mq_maxsz;
+ (void) sem_getvalue(&mqhp->mq_notempty, &count);
+ mqstat->mq_curmsgs = count;
+ return (0);
+}
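+
+/*
+ * Usage sketch (illustrative only; mqd is an already-open
+ * descriptor): of the settable attributes, only the O_NONBLOCK flag
+ * is honored by mq_setattr() above; mq_maxmsg and mq_msgsize are
+ * fixed when the queue is created.
+ *
+ *	struct mq_attr newat, oldat;
+ *
+ *	newat.mq_flags = O_NONBLOCK;
+ *	(void) mq_setattr(mqd, &newat, &oldat);	now non-blocking
+ *	newat.mq_flags = 0;
+ *	(void) mq_setattr(mqd, &newat, NULL);	blocking again
+ */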
+
+/*
+ * Cleanup after fork1() in the child process.
+ */
+void
+postfork1_child_sigev_mq(void)
+{
+ thread_communication_data_t *tcdp;
+ mqdes_t *mqdp;
+
+ for (mqdp = mq_list; mqdp; mqdp = mqdp->mqd_next) {
+ if ((tcdp = mqdp->mqd_tcd) != NULL) {
+ mqdp->mqd_tcd = NULL;
+ tcd_teardown(tcdp);
+ }
+ }
+}
diff --git a/usr/src/lib/libc/port/rt/pos4obj.c b/usr/src/lib/libc/port/rt/pos4obj.c
new file mode 100644
index 0000000000..86f5a07595
--- /dev/null
+++ b/usr/src/lib/libc/port/rt/pos4obj.c
@@ -0,0 +1,482 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include "synonyms.h"
+#include "mtlib.h"
+#include <sys/types.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <pthread.h>
+#include <thread.h>
+#include <string.h>
+#include <dirent.h>
+#include <stdio.h>
+#include <dlfcn.h>
+#include <md5.h>
+#include "pos4obj.h"
+
+#define HASHSTRLEN 32
+
+static char *__pos4obj_name(const char *, const char *);
+static void __pos4obj_md5toa(unsigned char *, unsigned char *);
+static void __pos4obj_clean(char *);
+
+static char objroot[] = "/tmp/";
+static long int name_max = 0;
+
+int
+__open_nc(const char *path, int oflag, mode_t mode)
+{
+ int canstate, val;
+ struct stat64 statbuf;
+
+ /*
+ * Ensure path is not a symlink to somewhere else. This provides
+ * a modest amount of protection against easy security attacks.
+ */
+ if (lstat64(path, &statbuf) == 0) {
+ if (S_ISLNK(statbuf.st_mode)) {
+ errno = EINVAL;
+ return (-1);
+ }
+ }
+
+ (void) pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &canstate);
+ val = open64(path, oflag, mode);
+ (void) pthread_setcancelstate(canstate, &canstate);
+
+ return (val);
+}
+
+int
+__close_nc(int fildes)
+{
+ int canstate, val;
+
+ (void) pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &canstate);
+ val = close(fildes);
+ (void) pthread_setcancelstate(canstate, &canstate);
+
+ return (val);
+}
+
+/*
+ * This is to avoid loading libmd.so.1 unless we absolutely have to.
+ */
+typedef void (*md5_calc_t)(unsigned char *, unsigned char *, unsigned int);
+static void *md5_handle = NULL;
+static md5_calc_t real_md5_calc = NULL;
+static mutex_t md5_lock = DEFAULTMUTEX;
+
+static void
+load_md5_calc(void)
+{
+ lmutex_lock(&md5_lock);
+ if (real_md5_calc == NULL) {
+ md5_handle = dlopen("libmd.so.1", RTLD_LAZY);
+ if (md5_handle == NULL)
+ real_md5_calc = (md5_calc_t)(-1);
+ else {
+ real_md5_calc =
+ (md5_calc_t)dlsym(md5_handle, "md5_calc");
+ if (real_md5_calc == NULL) {
+ (void) dlclose(md5_handle);
+ md5_handle = NULL;
+ real_md5_calc = (md5_calc_t)(-1);
+ }
+ }
+ }
+ lmutex_unlock(&md5_lock);
+}
+
+static char *
+__pos4obj_name(const char *path, const char *type)
+{
+ int shortpath = 1;
+ int olderrno;
+ size_t len;
+ char *dfile;
+ unsigned char hashbuf[HASHSTRLEN + 1];
+ unsigned char md5_digest[MD5_DIGEST_LENGTH];
+
+ /*
+	 * If the path is name_max - strlen(type) characters or less,
+ * the name of the file to use will be the path prefixed by
+ * the type.
+ *
+ * In the special case where the path is longer than
+	 * name_max - strlen(type) characters, we create a string based on the
+ * MD5 hash of the path. We prefix that string with a '.' to
+ * make it obscure, and create a directory in objroot with
+ * that name. In that directory, we create a directory named
+ * after the type of object requested. Inside the type
+ * directory, the filename will be the path of the object. This
+ * prevents collisions in all namespaces.
+ *
+ * Example:
+ * Let objroot = "/tmp/", path = "/<longpath>", and type = ".MQD"
+ * Let the MD5 hash of "<longpath>" = "<hash>"
+ *
+ * The desired file is /tmp/.<hash>/.MQD/<longpath>
+ */
+
+ /*
+ * Do not include the leading '/' in the path length.
+ * Assumes __pos4obj_check(path) has already been called.
+ */
+ if ((strlen(path) - 1) > (name_max - strlen(type)))
+ shortpath = 0;
+
+ if (shortpath) {
+ /*
+		 * strlen(path) counts the leading slash, providing space for the NUL.
+ */
+ len = strlen(objroot) + strlen(type) + strlen(path);
+ } else {
+ /*
+ * Long path name. Add 3 for extra '/', '.' and '\0'
+ */
+ len = strlen(objroot) + HASHSTRLEN + strlen(type) +
+ strlen(path) + 3;
+ }
+
+ if ((dfile = malloc(len)) == NULL)
+ return (NULL);
+
+ (void) memset(dfile, 0, len);
+ (void) strcpy(dfile, objroot);
+
+ if (shortpath) {
+ (void) strcat(dfile, type);
+ (void) strcat(dfile, path + 1);
+ return (dfile);
+ }
+
+ /*
+ * If we can successfully load it, call md5_calc().
+	 * Otherwise (this "can't happen"), return NULL.
+ */
+ if (real_md5_calc == NULL)
+ load_md5_calc();
+ if (real_md5_calc == (md5_calc_t)(-1)) {
+ free(dfile);
+ return (NULL);
+ }
+
+ real_md5_calc(md5_digest, (unsigned char *)path + 1, strlen(path + 1));
+ __pos4obj_md5toa(hashbuf, md5_digest);
+ (void) strcat(dfile, ".");
+ (void) strcat(dfile, (const char *)hashbuf);
+
+ /*
+ * Errno must be preserved across the following calls to
+ * mkdir. This needs to be done to prevent incorrect error
+ * reporting in certain cases. When we attempt to open a
+ * non-existent object without the O_CREAT flag, it will
+ * always create a lock file first. The lock file is created
+ * and then the open is attempted, but fails with ENOENT. The
+ * lock file is then destroyed. In the following code path, we
+ * are finding the absolute path to the lock file after
+ * already having attempted the open (which set errno to
+ * ENOENT). The following calls to mkdir will return -1 and
+ * set errno to EEXIST, since the hash and type directories
+ * were created when the lock file was created. The correct
+ * errno is the ENOENT from the attempted open of the desired
+ * object.
+ */
+ olderrno = errno;
+
+ /*
+ * Create hash directory. Use 777 permissions so everyone can use it.
+ */
+ if (mkdir(dfile, S_IRWXU|S_IRWXG|S_IRWXO) == 0) {
+ if (chmod(dfile, S_IRWXU|S_IRWXG|S_IRWXO) == -1) {
+ free(dfile);
+ return (NULL);
+ }
+ } else {
+ if (errno != EEXIST) {
+ free(dfile);
+ return (NULL);
+ }
+ }
+
+ (void) strcat(dfile, "/");
+ (void) strcat(dfile, type);
+
+ /*
+ * Create directory for requested type. Use 777 perms so everyone
+ * can use it.
+ */
+ if (mkdir(dfile, S_IRWXU|S_IRWXG|S_IRWXO) == 0) {
+ if (chmod(dfile, S_IRWXU|S_IRWXG|S_IRWXO) == -1) {
+ free(dfile);
+ return (NULL);
+ }
+ } else {
+ if (errno != EEXIST) {
+ free(dfile);
+ return (NULL);
+ }
+ }
+
+ errno = olderrno;
+ (void) strcat(dfile, path);
+ return (dfile);
+}
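+
+/*
+ * A worked example of the short-path case above (all names invented):
+ * for path "/myq" and type ".MQD", the constructed name is
+ *
+ *	"/tmp/" + ".MQD" + "myq"  ->  "/tmp/.MQDmyq"
+ *
+ * that is, objroot, then the type prefix, then the object path with
+ * its leading '/' dropped.
+ */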
+
+/*
+ * Takes a 128-bit MD5 digest and transforms to a sequence of 32 ASCII
+ * characters. Output is the hexadecimal representation of the digest.
+ *
+ * The output buffer must be at least HASHSTRLEN + 1 characters
+ * long. HASHSTRLEN is the size of the MD5 digest (128 bits)
+ * divided by the number of bits used per char of output (4). The
+ * extra character at the end is for the NUL terminating character.
+ */
+
+static void
+__pos4obj_md5toa(unsigned char *dest, unsigned char *src)
+{
+ int i;
+ uint32_t *p;
+
+ /* LINTED pointer cast may result in improper alignment */
+ p = (uint32_t *)src;
+
+ for (i = 0; i < (MD5_DIGEST_LENGTH / 4); i++)
+ (void) snprintf((char *)dest + (i * 8), 9, "%.8x", *p++);
+
+ dest[HASHSTRLEN] = '\0';
+}
+
+/*
+ * This open function assumes that no simultaneous
+ * open/unlink operation is going on. The caller is expected
+ * to ensure that opens in O_CREAT mode happen atomically.
+ * It returns crflag as 1 if the file was created, else 0.
+ */
+int
+__pos4obj_open(const char *name, char *type, int oflag,
+ mode_t mode, int *crflag)
+{
+ int fd;
+ char *dfile;
+
+ errno = 0;
+ *crflag = 0;
+
+ if ((dfile = __pos4obj_name(name, type)) == NULL) {
+ return (-1);
+ }
+
+ if (!(oflag & O_CREAT)) {
+ if ((fd = __open_nc(dfile, oflag, mode)) == -1)
+ __pos4obj_clean(dfile);
+
+ free(dfile);
+ return (fd);
+ }
+
+ /*
+ * We need to make sure that crflag is set iff we actually create
+ * the file. We do this by or'ing in O_EXCL, and attempting an
+ * open. If that fails with an EEXIST, and O_EXCL wasn't specified
+ * by the caller, then the file seems to exist; we'll try an
+ * open with O_CREAT cleared. If that succeeds, then the file
+ * did indeed exist. If that fails with an ENOENT, however, the
+ * file was removed between the opens; we need to take another
+ * lap.
+ */
+ for (;;) {
+ if ((fd = __open_nc(dfile, (oflag | O_EXCL), mode)) == -1) {
+ if (errno == EEXIST && !(oflag & O_EXCL)) {
+ fd = __open_nc(dfile, oflag & ~O_CREAT, mode);
+
+ if (fd == -1 && errno == ENOENT)
+ continue;
+ break;
+ }
+ } else {
+ *crflag = 1;
+ }
+ break;
+ }
+
+ free(dfile);
+ return (fd);
+}
+
+
+int
+__pos4obj_unlink(const char *name, const char *type)
+{
+ int err;
+ char *dfile;
+
+ if ((dfile = __pos4obj_name(name, type)) == NULL) {
+ return (-1);
+ }
+
+ err = unlink(dfile);
+
+ __pos4obj_clean(dfile);
+
+ free(dfile);
+
+ return (err);
+}
+
+/*
+ * This function opens the lock file for a named object;
+ * the presence of this file in the file system is the lock
+ */
+int
+__pos4obj_lock(const char *name, const char *ltype)
+{
+ char *dfile;
+ int fd;
+ int limit = 64;
+
+ if ((dfile = __pos4obj_name(name, ltype)) == NULL) {
+ return (-1);
+ }
+
+ while (limit-- > 0) {
+ if ((fd = __open_nc(dfile, O_RDWR | O_CREAT | O_EXCL, 0666))
+ < 0) {
+ if (errno != EEXIST)
+ break;
+ (void) sleep(1);
+ continue;
+ }
+
+ (void) __close_nc(fd);
+ free(dfile);
+ return (1);
+ }
+
+ free(dfile);
+ return (-1);
+}
+
+/*
+ * Unlocks the file by unlinking it from the filesystem
+ */
+int
+__pos4obj_unlock(const char *path, const char *type)
+{
+ return (__pos4obj_unlink(path, type));
+}
+
+/*
+ * Removes unused hash and type directories that may exist in the specified path.
+ */
+static void
+__pos4obj_clean(char *path)
+{
+ char *p;
+ int olderrno;
+
+ /*
+ * path is either
+ * 1) /<objroot>/<type><path> or
+ * 2) /<objroot>/.<hash>/<type>/<path>
+ *
+ * In case 1, there is nothing to clean.
+ *
+ * Detect case 2 by looking for a '/' after /objroot/ and
+ * remove the two trailing directories, if empty.
+ */
+ if (strchr(path + strlen(objroot), '/') == NULL)
+ return;
+
+ /*
+ * Preserve errno across calls to rmdir. See block comment in
+ * __pos4obj_name() for explanation.
+ */
+ olderrno = errno;
+
+ if ((p = strrchr(path, '/')) == NULL)
+ return;
+ *p = '\0';
+
+ (void) rmdir(path);
+
+ if ((p = strrchr(path, '/')) == NULL)
+ return;
+ *p = '\0';
+
+ (void) rmdir(path);
+
+ errno = olderrno;
+}
+
+
+/*
+ * Check that path starts with a '/', does not contain another '/'
+ * within it, and is not longer than PATH_MAX or NAME_MAX
+ */
+int
+__pos4obj_check(const char *path)
+{
+ long int i;
+
+ /*
+ * This assumes that __pos4obj_check() is called before
+ * any of the other functions in this file
+ */
+ if (name_max == 0 || name_max == -1) {
+ name_max = pathconf(objroot, _PC_NAME_MAX);
+ if (name_max == -1)
+ return (-1);
+ }
+
+ if (*path++ != '/') {
+ errno = EINVAL;
+ return (-1);
+ }
+
+ for (i = 0; *path != '\0'; i++) {
+ if (*path++ == '/') {
+ errno = EINVAL;
+ return (-1);
+ }
+ }
+
+ if (i > PATH_MAX || i > name_max) {
+ errno = ENAMETOOLONG;
+ return (-1);
+ }
+
+ return (0);
+}
diff --git a/usr/src/lib/libc/port/rt/pos4obj.h b/usr/src/lib/libc/port/rt/pos4obj.h
new file mode 100644
index 0000000000..609a43f64c
--- /dev/null
+++ b/usr/src/lib/libc/port/rt/pos4obj.h
@@ -0,0 +1,78 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _POS4OBJ_H
+#define _POS4OBJ_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * pos4obj.h - Header file for POSIX.4 related object names
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* flags used to indicate current state of open */
+#define DFILE_CREATE 0x01
+#define DFILE_OPEN 0x02
+#define ALLOC_MEM 0x04
+#define DFILE_MMAP 0x08
+#define PFILE_CREATE 0x10
+#define NFILE_CREATE 0x20
+#define MQDNP_MMAP 0x40
+
+/* semaphore object types - used in constructing file name */
+#define SEM_DATA_TYPE ".SEMD"
+#define SEM_LOCK_TYPE ".SEML"
+
+/* message queue object types - used in constructing file name */
+#define MQ_DATA_TYPE ".MQD"
+#define MQ_PERM_TYPE ".MQP"
+#define MQ_DSCN_TYPE ".MQN"
+#define MQ_LOCK_TYPE ".MQL"
+
+/* shared memory object types - used in constructing file name */
+#define SHM_DATA_TYPE ".SHMD"
+#define SHM_LOCK_TYPE ".SHML"
+
+/* functions defined related to object names in POSIX.4 */
+extern int __pos4obj_lock(const char *, const char *);
+extern int __pos4obj_unlock(const char *, const char *);
+extern int __pos4obj_unlink(const char *, const char *);
+extern int __pos4obj_open(const char *, char *, int, mode_t, int *);
+extern int __pos4obj_check(const char *);
+
+/* non-cancelable file operations */
+int __open_nc(const char *, int, mode_t);
+int __close_nc(int);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _POS4OBJ_H */
diff --git a/usr/src/lib/libc/port/rt/sched.c b/usr/src/lib/libc/port/rt/sched.c
new file mode 100644
index 0000000000..58b793f2e2
--- /dev/null
+++ b/usr/src/lib/libc/port/rt/sched.c
@@ -0,0 +1,552 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include "synonyms.h"
+#include "mtlib.h"
+#include <sys/types.h>
+#include <sched.h>
+#include <errno.h>
+#include <limits.h>
+#include <unistd.h>
+#include <sys/priocntl.h>
+#include <sys/rtpriocntl.h>
+#include <sys/tspriocntl.h>
+#include <sys/rt.h>
+#include <sys/ts.h>
+#include <thread.h>
+#include <string.h>
+#include <stdlib.h>
+#include "rtsched.h"
+
+/*
+ * The following variables are used for caching information
+ * for priocntl scheduling classes.
+ */
+struct pcclass ts_class;
+struct pcclass rt_class;
+struct pcclass ia_class;
+struct pcclass sys_class;
+
+static rtdpent_t *rt_dptbl; /* RT class parameter table */
+
+typedef struct { /* type definition for generic class-specific parameters */
+ int pc_clparms[PC_CLINFOSZ];
+} pc_clparms_t;
+
+static int map_gp_to_rtpri(pri_t);
+
+/*
+ * cache priocntl information on scheduling classes by policy
+ */
+int
+get_info_by_policy(int policy)
+{
+ char *pccname;
+ struct pcclass *pccp;
+
+ if (policy < 0) {
+ errno = EINVAL;
+ return (-1);
+ }
+
+ switch (policy) {
+ case SCHED_FIFO:
+ case SCHED_RR:
+ pccp = &rt_class;
+ pccname = "RT";
+ break;
+ case SCHED_OTHER:
+ pccp = &ts_class;
+ pccname = "TS";
+ break;
+ case SCHED_SYS:
+ pccp = &sys_class;
+ pccname = "sys";
+ break;
+ case SCHED_IA:
+ pccp = &ia_class;
+ pccname = "IA";
+ break;
+ default:
+ return (policy);
+ }
+ if (pccp->pcc_state != 0) {
+ if (pccp->pcc_state < 0)
+ errno = ENOSYS;
+ return (pccp->pcc_state);
+ }
+
+ /* get class's info */
+ (void) strcpy(pccp->pcc_info.pc_clname, pccname);
+ if (policy == SCHED_SYS)
+ pccp->pcc_info.pc_cid = 0;
+ else if (priocntl(P_PID, 0, PC_GETCID, (caddr_t)&(pccp->pcc_info)) < 0)
+ return (-1);
+
+ if (policy == SCHED_FIFO || policy == SCHED_RR) {
+ pcadmin_t pcadmin;
+ rtadmin_t rtadmin;
+ size_t rtdpsize;
+
+ /* get RT class dispatch table in rt_dptbl */
+ pcadmin.pc_cid = rt_class.pcc_info.pc_cid;
+ pcadmin.pc_cladmin = (caddr_t)&rtadmin;
+ rtadmin.rt_cmd = RT_GETDPSIZE;
+ if (priocntl(P_PID, 0, PC_ADMIN, (caddr_t)&pcadmin) < 0)
+ return (-1);
+ rtdpsize = (size_t)(rtadmin.rt_ndpents * sizeof (rtdpent_t));
+ if (rt_dptbl == NULL &&
+ (rt_dptbl = lmalloc(rtdpsize)) == NULL) {
+ errno = EAGAIN;
+ return (-1);
+ }
+ rtadmin.rt_dpents = rt_dptbl;
+ rtadmin.rt_cmd = RT_GETDPTBL;
+ if (priocntl(P_PID, 0, PC_ADMIN, (caddr_t)&pcadmin) < 0)
+ return (-1);
+ pccp->pcc_primin = 0;
+ pccp->pcc_primax = ((rtinfo_t *)rt_class.pcc_info.pc_clinfo)->
+ rt_maxpri;
+ } else if (policy == SCHED_OTHER) {
+ pri_t prio;
+
+ prio = ((tsinfo_t *)ts_class.pcc_info.pc_clinfo)->ts_maxupri/3;
+ pccp->pcc_primin = -prio;
+ pccp->pcc_primax = prio;
+ } else {
+ /* non-RT scheduling class */
+ pcpri_t pcpri;
+
+ /* need RT class info before we can translate priorities */
+ if (get_info_by_policy(SCHED_FIFO) < 0)
+ return (-1);
+ /*
+		 * get the class's global priority range (min and max) and
+		 * translate it into RT priority levels (indices) via rt_dptbl.
+ */
+ pcpri.pc_cid = pccp->pcc_info.pc_cid;
+ if (priocntl(0, 0, PC_GETPRIRANGE, (caddr_t)&pcpri) < 0)
+ return (-1);
+ pccp->pcc_primax = map_gp_to_rtpri(pcpri.pc_clpmax);
+ pccp->pcc_primin = map_gp_to_rtpri(pcpri.pc_clpmin);
+ }
+
+ pccp->pcc_state = 1;
+ return (1);
+}
+
+/*
+ * Translate global scheduling priority to RT class's user priority.
+ * Use the gp values in the rt_dptbl to do a reverse mapping
+ * of a given gpri value relative to the index range of rt_dptbl.
+ */
+static int
+map_gp_to_rtpri(pri_t gpri)
+{
+ rtdpent_t *rtdp;
+ pri_t pri;
+
+ if (gpri <= rt_dptbl[rt_class.pcc_primin].rt_globpri) {
+		pri = gpri - rt_dptbl[rt_class.pcc_primin].rt_globpri +
+ rt_class.pcc_primin;
+ } else if (gpri >= rt_dptbl[rt_class.pcc_primax].rt_globpri) {
+		pri = gpri - rt_dptbl[rt_class.pcc_primax].rt_globpri +
+ rt_class.pcc_primax;
+ } else {
+ pri = rt_class.pcc_primin + 1;
+ for (rtdp = rt_dptbl+1; rtdp->rt_globpri < gpri; ++rtdp, ++pri)
+ ;
+ if (rtdp->rt_globpri > gpri)
+ --pri;
+ }
+
+ return (pri);
+}
+
+/*
+ * Translate RT class's user priority to global scheduling priority.
+ */
+pri_t
+map_rtpri_to_gp(pri_t pri)
+{
+ rtdpent_t *rtdp;
+ pri_t gpri;
+
+ if (rt_class.pcc_state == 0)
+ (void) get_info_by_policy(SCHED_FIFO);
+
+	/* The first case is the default; the other two are seldom taken */
+ if (pri <= rt_dptbl[rt_class.pcc_primin].rt_globpri) {
+ gpri = pri + rt_dptbl[rt_class.pcc_primin].rt_globpri -
+ rt_class.pcc_primin;
+ } else if (pri >= rt_dptbl[rt_class.pcc_primax].rt_globpri) {
+ gpri = pri + rt_dptbl[rt_class.pcc_primax].rt_globpri -
+ rt_class.pcc_primax;
+ } else {
+ gpri = rt_dptbl[rt_class.pcc_primin].rt_globpri + 1;
+ for (rtdp = rt_dptbl+1; rtdp->rt_globpri < pri; ++rtdp, ++gpri)
+ ;
+ if (rtdp->rt_globpri > pri)
+ --gpri;
+ }
+ return (gpri);
+}
+
+static int
+get_info_by_class(id_t classid)
+{
+ pcinfo_t pcinfo;
+
+ /* determine if we already know this classid */
+ if (rt_class.pcc_state > 0 && rt_class.pcc_info.pc_cid == classid)
+ return (1);
+ if (ts_class.pcc_state > 0 && ts_class.pcc_info.pc_cid == classid)
+ return (1);
+ if (sys_class.pcc_state > 0 && sys_class.pcc_info.pc_cid == classid)
+ return (1);
+ if (ia_class.pcc_state > 0 && ia_class.pcc_info.pc_cid == classid)
+ return (1);
+
+ pcinfo.pc_cid = classid;
+ if (priocntl(0, 0, PC_GETCLINFO, (caddr_t)&pcinfo) < 0) {
+ if (classid == 0) /* no kernel info for sys class */
+ return (get_info_by_policy(SCHED_SYS));
+ return (-1);
+ }
+
+ if (rt_class.pcc_state == 0 && strcmp(pcinfo.pc_clname, "RT") == 0)
+ return (get_info_by_policy(SCHED_FIFO));
+ if (ts_class.pcc_state == 0 && strcmp(pcinfo.pc_clname, "TS") == 0)
+ return (get_info_by_policy(SCHED_OTHER));
+ if (ia_class.pcc_state == 0 && strcmp(pcinfo.pc_clname, "IA") == 0)
+ return (get_info_by_policy(SCHED_IA));
+
+ return (1);
+}
+
+int
+sched_setparam(pid_t pid, const struct sched_param *param)
+{
+ pri_t prio = param->sched_priority;
+ pcparms_t pcparm;
+ tsparms_t *tsp;
+ tsinfo_t *tsi;
+ int scale;
+
+ if (pid < 0) {
+ errno = ESRCH;
+ return (-1);
+ }
+ if (pid == 0)
+ pid = P_MYID;
+
+ /* get process's current scheduling policy */
+ pcparm.pc_cid = PC_CLNULL;
+ if (priocntl(P_PID, pid, PC_GETPARMS, (caddr_t)&pcparm) == -1)
+ return (-1);
+ if (get_info_by_class(pcparm.pc_cid) < 0)
+ return (-1);
+
+ if (pcparm.pc_cid == rt_class.pcc_info.pc_cid) {
+ /* SCHED_FIFO or SCHED_RR policy */
+ if (prio < rt_class.pcc_primin || prio > rt_class.pcc_primax) {
+ errno = EINVAL;
+ return (-1);
+ }
+ ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs = RT_NOCHANGE;
+ ((rtparms_t *)pcparm.pc_clparms)->rt_pri = prio;
+ } else if (pcparm.pc_cid == ts_class.pcc_info.pc_cid) {
+ /* SCHED_OTHER policy */
+ tsi = (tsinfo_t *)ts_class.pcc_info.pc_clinfo;
+ scale = tsi->ts_maxupri;
+ tsp = (tsparms_t *)pcparm.pc_clparms;
+ tsp->ts_uprilim = tsp->ts_upri = -(scale * prio) / 20;
+ } else {
+ /*
+ * policy is not defined by POSIX.4.
+ * just pass parameter data through to priocntl.
+ * param should contain an image of class-specific parameters
+ * (after the sched_priority member).
+ */
+ *((pc_clparms_t *)pcparm.pc_clparms) =
+ *((pc_clparms_t *)(&(param->sched_priority)+1));
+ }
+
+ return ((int)priocntl(P_PID, pid, PC_SETPARMS, (caddr_t)&pcparm));
+}
+
+int
+sched_getparam(pid_t pid, struct sched_param *param)
+{
+ pcparms_t pcparm;
+ pri_t prio;
+ int scale;
+ tsinfo_t *tsi;
+
+ if (pid < 0) {
+ errno = ESRCH;
+ return (-1);
+ }
+ if (pid == 0)
+ pid = P_MYID;
+
+ pcparm.pc_cid = PC_CLNULL;
+ if (priocntl(P_PID, pid, PC_GETPARMS, (caddr_t)&pcparm) == -1)
+ return (-1);
+ if (get_info_by_class(pcparm.pc_cid) < 0)
+ return (-1);
+
+ if (pcparm.pc_cid == rt_class.pcc_info.pc_cid) {
+ param->sched_priority =
+ ((rtparms_t *)pcparm.pc_clparms)->rt_pri;
+ } else if (pcparm.pc_cid == ts_class.pcc_info.pc_cid) {
+ param->sched_nicelim =
+ ((tsparms_t *)pcparm.pc_clparms)->ts_uprilim;
+ prio = param->sched_nice =
+ ((tsparms_t *)pcparm.pc_clparms)->ts_upri;
+ tsi = (tsinfo_t *)ts_class.pcc_info.pc_clinfo;
+ scale = tsi->ts_maxupri;
+ if (scale == 0)
+ param->sched_priority = 0;
+ else
+ param->sched_priority = -(prio * 20) / scale;
+ } else {
+ /*
+ * policy is not defined by POSIX.4
+ * just return a copy of pcparams_t image in param.
+ */
+ *((pc_clparms_t *)(&(param->sched_priority)+1)) =
+ *((pc_clparms_t *)pcparm.pc_clparms);
+ param->sched_priority =
+ sched_get_priority_min((int)(pcparm.pc_cid + _SCHED_NEXT));
+ }
+
+ return (0);
+}
+
+int
+sched_setscheduler(pid_t pid, int policy, const struct sched_param *param)
+{
+ pri_t prio = param->sched_priority;
+ pcparms_t pcparm;
+ int oldpolicy;
+ tsinfo_t *tsi;
+ tsparms_t *tsp;
+ int scale;
+
+ if ((oldpolicy = sched_getscheduler(pid)) < 0)
+ return (-1);
+
+ if (pid == 0)
+ pid = P_MYID;
+
+ if (get_info_by_policy(policy) < 0) {
+ errno = EINVAL;
+ return (-1);
+ }
+
+ switch (policy) {
+ case SCHED_FIFO:
+ case SCHED_RR:
+ if (prio < rt_class.pcc_primin || prio > rt_class.pcc_primax) {
+ errno = EINVAL;
+ return (-1);
+ }
+ pcparm.pc_cid = rt_class.pcc_info.pc_cid;
+ ((rtparms_t *)pcparm.pc_clparms)->rt_pri = prio;
+ ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs =
+ (policy == SCHED_RR ? RT_TQDEF : RT_TQINF);
+ break;
+
+ case SCHED_OTHER:
+ pcparm.pc_cid = ts_class.pcc_info.pc_cid;
+ tsi = (tsinfo_t *)ts_class.pcc_info.pc_clinfo;
+ scale = tsi->ts_maxupri;
+ tsp = (tsparms_t *)pcparm.pc_clparms;
+ tsp->ts_uprilim = tsp->ts_upri = -(scale * prio) / 20;
+ break;
+
+ default:
+ switch (policy) {
+ case SCHED_SYS:
+ pcparm.pc_cid = sys_class.pcc_info.pc_cid;
+ break;
+ case SCHED_IA:
+ pcparm.pc_cid = ia_class.pcc_info.pc_cid;
+ break;
+ default:
+ pcparm.pc_cid = policy - _SCHED_NEXT;
+ break;
+ }
+ /*
+ * policy is not defined by POSIX.4.
+ * just pass parameter data through to priocntl.
+ * param should contain an image of class-specific parameters
+ * (after the sched_priority member).
+ */
+ *((pc_clparms_t *)pcparm.pc_clparms) =
+		    *((pc_clparms_t *)(&(param->sched_priority)+1));
+ }
+
+ /* setting scheduling policy & parameters for the process */
+ if (priocntl(P_PID, pid, PC_SETPARMS, (caddr_t)&pcparm) == -1)
+ return (-1);
+
+ return (oldpolicy);
+}
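+
+/*
+ * Usage sketch (illustrative only; moving a process into a real-time
+ * class normally requires sufficient privileges):
+ *
+ *	#include <sched.h>
+ *	#include <stdio.h>
+ *
+ *	struct sched_param sp;
+ *
+ *	sp.sched_priority = sched_get_priority_max(SCHED_FIFO);
+ *	if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1)
+ *		perror("sched_setscheduler");
+ *
+ * A pid of 0 names the caller, via the P_MYID substitution above.
+ */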
+
+int
+sched_getscheduler(pid_t pid)
+{
+ pcparms_t pcparm;
+ int policy;
+
+ if (pid < 0) {
+ errno = ESRCH;
+ return (-1);
+ }
+ if (pid == 0)
+ pid = P_MYID;
+
+ /* get scheduling policy & parameters for the process */
+ pcparm.pc_cid = PC_CLNULL;
+ if (priocntl(P_PID, pid, PC_GETPARMS, (caddr_t)&pcparm) == -1)
+ return (-1);
+ if (get_info_by_class(pcparm.pc_cid) < 0)
+ return (-1);
+
+ if (pcparm.pc_cid == rt_class.pcc_info.pc_cid)
+ policy = ((((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs ==
+ RT_TQINF ? SCHED_FIFO : SCHED_RR));
+ else if (pcparm.pc_cid == ts_class.pcc_info.pc_cid)
+ policy = SCHED_OTHER;
+ else if (pcparm.pc_cid == sys_class.pcc_info.pc_cid)
+ policy = SCHED_SYS;
+ else if (pcparm.pc_cid == ia_class.pcc_info.pc_cid)
+ policy = SCHED_IA;
+ else {
+ /*
+ * policy is not defined by POSIX.4
+ * return a unique dot4 policy id.
+ */
+ policy = (int)(_SCHED_NEXT + pcparm.pc_cid);
+ }
+
+ return (policy);
+}
+
+int
+sched_yield(void)
+{
+ thr_yield();
+ return (0);
+}
+
+int
+sched_get_priority_max(int policy)
+{
+ pcpri_t pcpri;
+
+ if (get_info_by_policy(policy) < 0)
+ return (-1);
+
+ if (policy == SCHED_FIFO || policy == SCHED_RR)
+ return (rt_class.pcc_primax);
+ else if (policy == SCHED_OTHER)
+ return (ts_class.pcc_primax);
+ else if (policy == SCHED_SYS)
+ return (sys_class.pcc_primax);
+ else if (policy == SCHED_IA)
+ return (ia_class.pcc_primax);
+ else { /* policy not in POSIX.4 */
+ pcpri.pc_cid = policy - _SCHED_NEXT;
+ if (priocntl(0, 0, PC_GETPRIRANGE, (caddr_t)&pcpri) == 0)
+ return (map_gp_to_rtpri(pcpri.pc_clpmax));
+ }
+
+ errno = EINVAL;
+ return (-1);
+}
+
+int
+sched_get_priority_min(int policy)
+{
+ pcpri_t pcpri;
+
+ if (get_info_by_policy(policy) < 0)
+ return (-1);
+
+ if (policy == SCHED_FIFO || policy == SCHED_RR)
+ return (rt_class.pcc_primin);
+ else if (policy == SCHED_OTHER)
+ return (ts_class.pcc_primin);
+ else if (policy == SCHED_SYS)
+ return (sys_class.pcc_primin);
+ else if (policy == SCHED_IA)
+ return (ia_class.pcc_primin);
+ else { /* policy not in POSIX.4 */
+ pcpri.pc_cid = policy - _SCHED_NEXT;
+ if (priocntl(0, 0, PC_GETPRIRANGE, (caddr_t)&pcpri) == 0)
+ return (map_gp_to_rtpri(pcpri.pc_clpmin));
+ }
+
+ errno = EINVAL;
+ return (-1);
+}
+
+int
+sched_rr_get_interval(pid_t pid, timespec_t *interval)
+{
+ pcparms_t pcparm;
+
+ if (pid < 0) {
+ errno = ESRCH;
+ return (-1);
+ }
+ if (pid == 0)
+ pid = P_MYID;
+
+ if (get_info_by_policy(SCHED_RR) < 0)
+ return (-1);
+
+ pcparm.pc_cid = PC_CLNULL;
+ if (priocntl(P_PID, pid, PC_GETPARMS, (caddr_t)&pcparm) == -1)
+ return (-1);
+
+ if (pcparm.pc_cid == rt_class.pcc_info.pc_cid &&
+ (((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs != RT_TQINF)) {
+ /* SCHED_RR */
+ interval->tv_sec = ((rtparms_t *)pcparm.pc_clparms)->rt_tqsecs;
+ interval->tv_nsec =
+ ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs;
+ return (0);
+ }
+
+ errno = EINVAL;
+ return (-1);
+}
diff --git a/usr/src/lib/libc/port/rt/sem.c b/usr/src/lib/libc/port/rt/sem.c
new file mode 100644
index 0000000000..af3bdcc06a
--- /dev/null
+++ b/usr/src/lib/libc/port/rt/sem.c
@@ -0,0 +1,367 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#pragma weak sem_open = _sem_open
+#pragma weak sem_close = _sem_close
+#pragma weak sem_unlink = _sem_unlink
+#pragma weak sem_init = _sem_init
+#pragma weak sem_destroy = _sem_destroy
+#pragma weak sem_wait = _sem_wait
+#pragma weak sem_timedwait = _sem_timedwait
+#pragma weak sem_reltimedwait_np = _sem_reltimedwait_np
+#pragma weak sem_trywait = _sem_trywait
+#pragma weak sem_post = _sem_post
+#pragma weak sem_getvalue = _sem_getvalue
+
+#include "synonyms.h"
+#include "mtlib.h"
+#include <sys/types.h>
+#include <semaphore.h>
+#include <synch.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <thread.h>
+#include "pos4obj.h"
+
+typedef struct semaddr {
+ struct semaddr *sad_next; /* next in the link */
+ char sad_name[PATH_MAX + 1]; /* name of sem object */
+ sem_t *sad_addr; /* mmapped address of semaphore */
+ ino64_t sad_inode; /* inode # of the mmapped file */
+} semaddr_t;
+
+static long semvaluemax = 0;
+static semaddr_t *semheadp = NULL;
+static mutex_t semlock = DEFAULTMUTEX;
+
+sem_t *
+_sem_open(const char *path, int oflag, /* mode_t mode, int value */ ...)
+{
+ va_list ap;
+ mode_t crmode = 0;
+ sem_t *sem = NULL;
+ struct stat64 statbuf;
+ semaddr_t *next = NULL;
+ int fd = 0;
+ int error = 0;
+ int cr_flag = 0;
+ uint_t value = 0;
+
+ if (__pos4obj_check(path) == -1)
+ return (SEM_FAILED);
+
+ /* acquire semaphore lock to have atomic operation */
+ if (__pos4obj_lock(path, SEM_LOCK_TYPE) < 0)
+ return (SEM_FAILED);
+
+ /* modify oflag to have RDWR and filter CREATE mode only */
+ oflag = (oflag & (O_CREAT|O_EXCL)) | (O_RDWR);
+ if (oflag & O_CREAT) {
+ if (semvaluemax == 0 &&
+ (semvaluemax = _sysconf(_SC_SEM_VALUE_MAX)) <= 0)
+ semvaluemax = -1;
+ va_start(ap, oflag);
+ crmode = va_arg(ap, mode_t);
+ value = va_arg(ap, uint_t);
+ va_end(ap);
+		/* check that value does not exceed the max for a named semaphore */
+ if (semvaluemax < 0 ||
+ (ulong_t)value > (ulong_t)semvaluemax) {
+ errno = EINVAL;
+ goto out;
+ }
+ }
+
+ errno = 0;
+
+ if ((fd = __pos4obj_open(path, SEM_DATA_TYPE,
+ oflag, crmode, &cr_flag)) < 0)
+ goto out;
+
+ if (cr_flag)
+ cr_flag = DFILE_CREATE | DFILE_OPEN;
+ else
+ cr_flag = DFILE_OPEN;
+
+ /* find out inode # for the opened file */
+ if (fstat64(fd, &statbuf) < 0)
+ goto out;
+
+	/* if we created it, reserve the full size of the file */
+ if ((cr_flag & DFILE_CREATE) != 0) {
+ if (ftruncate64(fd, (off64_t)sizeof (sem_t)) < 0)
+ goto out;
+ } else {
+		/*
+		 * if this semaphore has already been opened, the inode
+		 * number will match; return the same semaphore address
+		 */
+ lmutex_lock(&semlock);
+ for (next = semheadp; next != NULL; next = next->sad_next) {
+ if (statbuf.st_ino == next->sad_inode &&
+ strcmp(path, next->sad_name) == 0) {
+ (void) __close_nc(fd);
+ lmutex_unlock(&semlock);
+ (void) __pos4obj_unlock(path, SEM_LOCK_TYPE);
+ return (next->sad_addr);
+ }
+ }
+ lmutex_unlock(&semlock);
+ }
+
+
+ /* new sem descriptor to be allocated and new address to be mapped */
+ if ((next = malloc(sizeof (semaddr_t))) == NULL) {
+ errno = ENOMEM;
+ goto out;
+ }
+ cr_flag |= ALLOC_MEM;
+
+ /* LINTED */
+ sem = (sem_t *)mmap64(NULL, sizeof (sem_t), PROT_READ|PROT_WRITE,
+ MAP_SHARED, fd, (off64_t)0);
+ (void) __close_nc(fd);
+ cr_flag &= ~DFILE_OPEN;
+ if (sem == MAP_FAILED)
+ goto out;
+ cr_flag |= DFILE_MMAP;
+
+ /* if created, initialize */
+ if (cr_flag & DFILE_CREATE) {
+ error = sema_init((sema_t *)sem, value, USYNC_PROCESS, 0);
+ if (error) {
+ errno = error;
+ goto out;
+ }
+ }
+
+ if (__pos4obj_unlock(path, SEM_LOCK_TYPE) == 0) {
+ /* add to the list pointed by semheadp */
+ lmutex_lock(&semlock);
+ next->sad_next = semheadp;
+ semheadp = next;
+ next->sad_addr = sem;
+ next->sad_inode = statbuf.st_ino;
+ (void) strcpy(next->sad_name, path);
+ lmutex_unlock(&semlock);
+ return (sem);
+ }
+ /* fall into the error case */
+out:
+ error = errno;
+ if ((cr_flag & DFILE_OPEN) != 0)
+ (void) __close_nc(fd);
+ if ((cr_flag & DFILE_CREATE) != 0)
+ (void) __pos4obj_unlink(path, SEM_DATA_TYPE);
+ if ((cr_flag & ALLOC_MEM) != 0)
+ free(next);
+ if ((cr_flag & DFILE_MMAP) != 0)
+ (void) munmap((caddr_t)sem, sizeof (sem_t));
+ (void) __pos4obj_unlock(path, SEM_LOCK_TYPE);
+ errno = error;
+ return (SEM_FAILED);
+}
+
+int
+_sem_close(sem_t *sem)
+{
+ semaddr_t **next;
+ semaddr_t *freeit;
+
+ lmutex_lock(&semlock);
+ for (next = &semheadp; (freeit = *next) != NULL;
+ next = &(freeit->sad_next)) {
+ if (freeit->sad_addr == sem) {
+ *next = freeit->sad_next;
+ lmutex_unlock(&semlock);
+ free(freeit);
+ return (munmap((caddr_t)sem, sizeof (sem_t)));
+ }
+ }
+ lmutex_unlock(&semlock);
+ errno = EINVAL;
+ return (-1);
+}
+
+int
+_sem_unlink(const char *path)
+{
+ int error;
+ int oerrno;
+
+ if (__pos4obj_check(path) < 0)
+ return (-1);
+
+ if (__pos4obj_lock(path, SEM_LOCK_TYPE) < 0)
+ return (-1);
+
+ error = __pos4obj_unlink(path, SEM_DATA_TYPE);
+
+ oerrno = errno;
+
+ (void) __pos4obj_unlock(path, SEM_LOCK_TYPE);
+
+ errno = oerrno;
+
+ return (error);
+}
+
+/*
+ * SUSV3 requires ("shall fail") an EINVAL failure for operations
+ * on invalid semaphores, including uninitialized unnamed semaphores.
+ * The best we can do is check that the magic number is correct.
+ * This is not perfect, but it allows the test suite to pass.
+ * (Standards bodies are filled with fools and idiots.)
+ */
+static int
+sem_invalid(sem_t *sem)
+{
+ if (sem->sem_magic != SEMA_MAGIC) {
+ errno = EINVAL;
+ return (-1);
+ }
+ return (0);
+}
+
+int
+_sem_init(sem_t *sem, int pshared, uint_t value)
+{
+ int error;
+
+ if ((error = sema_init((sema_t *)sem, value,
+ pshared ? USYNC_PROCESS : USYNC_THREAD, NULL)) != 0) {
+ errno = error;
+ return (-1);
+ }
+ return (0);
+}
+
+int
+_sem_destroy(sem_t *sem)
+{
+ int error;
+
+ if (sem_invalid(sem))
+ return (-1);
+ if ((error = sema_destroy((sema_t *)sem)) != 0) {
+ errno = error;
+ return (-1);
+ }
+ return (0);
+}
+
+int
+_sem_post(sem_t *sem)
+{
+ int error;
+
+ if (sem_invalid(sem))
+ return (-1);
+ if ((error = sema_post((sema_t *)sem)) != 0) {
+ errno = error;
+ return (-1);
+ }
+ return (0);
+}
+
+int
+_sem_wait(sem_t *sem)
+{
+ int error;
+
+ if (sem_invalid(sem))
+ return (-1);
+ if ((error = sema_wait((sema_t *)sem)) != 0) {
+ errno = error;
+ return (-1);
+ }
+ return (0);
+}
+
+int
+_sem_timedwait(sem_t *sem, const timespec_t *abstime)
+{
+ int error;
+
+ if (sem_invalid(sem))
+ return (-1);
+ if ((error = sema_timedwait((sema_t *)sem, abstime)) != 0) {
+ if (error == ETIME)
+ error = ETIMEDOUT;
+ errno = error;
+ return (-1);
+ }
+ return (0);
+}
+
+int
+_sem_reltimedwait_np(sem_t *sem, const timespec_t *reltime)
+{
+ int error;
+
+ if (sem_invalid(sem))
+ return (-1);
+ if ((error = sema_reltimedwait((sema_t *)sem, reltime)) != 0) {
+ if (error == ETIME)
+ error = ETIMEDOUT;
+ errno = error;
+ return (-1);
+ }
+ return (0);
+}
+
+int
+_sem_trywait(sem_t *sem)
+{
+ int error;
+
+ if (sem_invalid(sem))
+ return (-1);
+ if ((error = sema_trywait((sema_t *)sem)) != 0) {
+ if (error == EBUSY)
+ error = EAGAIN;
+ errno = error;
+ return (-1);
+ }
+ return (0);
+}
+
+int
+_sem_getvalue(sem_t *sem, int *sval)
+{
+ if (sem_invalid(sem))
+ return (-1);
+ *sval = (int)sem->sem_count;
+ return (0);
+}
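A usage sketch for the named-semaphore interfaces implemented above; the "/demo" name is arbitrary, and with O_CREAT the mode and initial-value arguments follow, exactly as parsed by _sem_open():

#include <semaphore.h>
#include <fcntl.h>
#include <stdio.h>

int
main(void)
{
	sem_t *sem;

	if ((sem = sem_open("/demo", O_CREAT, 0644, 1)) == SEM_FAILED) {
		perror("sem_open");
		return (1);
	}
	if (sem_wait(sem) == 0) {
		/* ... critical section shared across processes ... */
		(void) sem_post(sem);
	}
	(void) sem_close(sem);
	(void) sem_unlink("/demo");
	return (0);
}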
diff --git a/usr/src/lib/libc/port/rt/shm.c b/usr/src/lib/libc/port/rt/shm.c
new file mode 100644
index 0000000000..53c59d9424
--- /dev/null
+++ b/usr/src/lib/libc/port/rt/shm.c
@@ -0,0 +1,95 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include "synonyms.h"
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <errno.h>
+#include "pos4obj.h"
+
+int
+shm_open(const char *path, int oflag, mode_t mode)
+{
+ int crflag;
+ int fd;
+ int flags;
+
+ if (__pos4obj_check(path) == -1)
+ return (-1);
+
+	/* acquire the semaphore lock to make the operation atomic */
+ if (__pos4obj_lock(path, SHM_LOCK_TYPE) < 0)
+ return (-1);
+
+ fd = __pos4obj_open(path, SHM_DATA_TYPE, oflag, mode, &crflag);
+
+ if (fd < 0) {
+ (void) __pos4obj_unlock(path, SHM_LOCK_TYPE);
+ return (-1);
+ }
+
+ if ((flags = fcntl(fd, F_GETFD)) < 0 ||
+ fcntl(fd, F_SETFD, flags | FD_CLOEXEC) < 0) {
+ (void) __pos4obj_unlock(path, SHM_LOCK_TYPE);
+ (void) __close_nc(fd);
+ return (-1);
+ }
+
+	/* release the semaphore lock */
+ if (__pos4obj_unlock(path, SHM_LOCK_TYPE) < 0) {
+ (void) __close_nc(fd);
+ return (-1);
+ }
+
+ return (fd);
+}
+
+int
+shm_unlink(const char *path)
+{
+ int oerrno;
+ int err;
+
+ if (__pos4obj_check(path) < 0)
+ return (-1);
+
+ if (__pos4obj_lock(path, SHM_LOCK_TYPE) < 0)
+ return (-1);
+
+ err = __pos4obj_unlink(path, SHM_DATA_TYPE);
+
+ oerrno = errno;
+
+ (void) __pos4obj_unlock(path, SHM_LOCK_TYPE);
+
+ errno = oerrno;
+ return (err);
+
+}
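A typical consumer of shm_open(), sketched with an arbitrary "/demo_shm" object name: size the object with ftruncate(), map it, and unlink it when done:

#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>

int
main(void)
{
	void *p;
	int fd;

	if ((fd = shm_open("/demo_shm", O_CREAT | O_RDWR, 0644)) < 0) {
		perror("shm_open");
		return (1);
	}
	if (ftruncate(fd, 4096) < 0 ||
	    (p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
	    MAP_SHARED, fd, 0)) == MAP_FAILED) {
		perror("shm setup");
		return (1);
	}
	*(int *)p = 42;		/* visible to every process that maps it */
	(void) munmap(p, 4096);
	(void) close(fd);
	(void) shm_unlink("/demo_shm");
	return (0);
}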
diff --git a/usr/src/lib/libc/port/rt/sigev_thread.c b/usr/src/lib/libc/port/rt/sigev_thread.c
new file mode 100644
index 0000000000..0ab6eaccdf
--- /dev/null
+++ b/usr/src/lib/libc/port/rt/sigev_thread.c
@@ -0,0 +1,715 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include "synonyms.h"
+#include "thr_uberdata.h"
+#include <sys/types.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <thread.h>
+#include <pthread.h>
+#include <synch.h>
+#include <port.h>
+#include <signal.h>
+#include <stdio.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <string.h>
+#include <sys/aiocb.h>
+#include <time.h>
+#include <signal.h>
+#include <fcntl.h>
+#include "sigev_thread.h"
+
+/*
+ * There is but one spawner for all aio operations.
+ */
+thread_communication_data_t *sigev_aio_tcd = NULL;
+
+/*
+ * Set non-zero via _RT_DEBUG to enable debugging printf's.
+ */
+static int _rt_debug = 0;
+
+void
+init_sigev_thread(void)
+{
+ char *ldebug;
+
+ if ((ldebug = getenv("_RT_DEBUG")) != NULL)
+ _rt_debug = atoi(ldebug);
+}
+
+/*
+ * Routine to print debug messages:
+ * If _rt_debug is set, printf the debug message to stderr
+ * with an appropriate prefix.
+ */
+/*PRINTFLIKE1*/
+static void
+dprintf(const char *format, ...)
+{
+ if (_rt_debug) {
+ va_list alist;
+
+ va_start(alist, format);
+ flockfile(stderr);
+ (void) fputs("DEBUG: ", stderr);
+ (void) vfprintf(stderr, format, alist);
+ funlockfile(stderr);
+ va_end(alist);
+ }
+}
+
+/*
+ * The notify_thread() function can be used as the start function of a new
+ * thread but it is normally called from notifier(), below, in the context
+ * of a thread pool worker thread. It is used as the start function of a
+ * new thread only when individual pthread attributes differ from those
+ * that are common to all workers. This only occurs in the AIO case.
+ */
+static void *
+notify_thread(void *arg)
+{
+ sigev_thread_data_t *stdp = arg;
+ void (*function)(union sigval) = stdp->std_func;
+ union sigval argument = stdp->std_arg;
+
+ lfree(stdp, sizeof (*stdp));
+ function(argument);
+ return (NULL);
+}
+
+/*
+ * Thread pool interface to call the user-supplied notification function.
+ */
+static void
+notifier(void *arg)
+{
+ (void) notify_thread(arg);
+}
+
+/*
+ * This routine adds a new work request, described by function
+ * and argument, to the list of outstanding jobs.
+ * It returns 0 on success; a nonzero value indicates an error.
+ */
+static int
+sigev_add_work(thread_communication_data_t *tcdp,
+ void (*function)(union sigval), union sigval argument)
+{
+ tpool_t *tpool = tcdp->tcd_poolp;
+ sigev_thread_data_t *stdp;
+
+ if (tpool == NULL)
+ return (EINVAL);
+ if ((stdp = lmalloc(sizeof (*stdp))) == NULL)
+ return (errno);
+ stdp->std_func = function;
+ stdp->std_arg = argument;
+ if (tpool_dispatch(tpool, notifier, stdp) != 0) {
+ lfree(stdp, sizeof (*stdp));
+ return (errno);
+ }
+ return (0);
+}
+
+static void
+sigev_destroy_pool(thread_communication_data_t *tcdp)
+{
+ if (tcdp->tcd_poolp != NULL)
+ tpool_abandon(tcdp->tcd_poolp);
+ tcdp->tcd_poolp = NULL;
+
+ if (tcdp->tcd_subsystem == MQ) {
+ /*
+ * synchronize with del_sigev_mq()
+ */
+ sig_mutex_lock(&tcdp->tcd_lock);
+ tcdp->tcd_server_id = 0;
+ if (tcdp->tcd_msg_closing) {
+ (void) cond_broadcast(&tcdp->tcd_cv);
+ sig_mutex_unlock(&tcdp->tcd_lock);
+ return; /* del_sigev_mq() will free the tcd */
+ }
+ sig_mutex_unlock(&tcdp->tcd_lock);
+ }
+
+ /*
+ * now delete everything
+ */
+ free_sigev_handler(tcdp);
+}
+
+/*
+ * timer_spawner(), mqueue_spawner(), and aio_spawner() are the main
+ * functions for the daemon threads that get the event(s) for the
+ * respective SIGEV_THREAD subsystems. There is one timer spawner for
+ * each timer_create(), one mqueue spawner for every mq_open(), and
+ * exactly one aio spawner for all aio requests. These spawners add
+ * work requests to be done by a pool of daemon worker threads. In case
+ * the event requires creation of a worker thread with different pthread
+ * attributes than those from the pool of workers, a new daemon thread
+ * with these attributes is spawned apart from the pool of workers.
+ * If the spawner fails to add work or fails to create an additional
+ * thread for lack of resources, it puts the event back into the
+ * kernel queue and retries some time later.
+ */
+
+void *
+timer_spawner(void *arg)
+{
+ thread_communication_data_t *tcdp = (thread_communication_data_t *)arg;
+ port_event_t port_event;
+
+ /* destroy the pool if we are cancelled */
+ pthread_cleanup_push(sigev_destroy_pool, tcdp);
+
+ for (;;) {
+ if (port_get(tcdp->tcd_port, &port_event, NULL) != 0) {
+ dprintf("port_get on port %d failed with %d <%s>\n",
+ tcdp->tcd_port, errno, strerror(errno));
+ break;
+ }
+ switch (port_event.portev_source) {
+ case PORT_SOURCE_TIMER:
+ break;
+ case PORT_SOURCE_ALERT:
+ if (port_event.portev_events != SIGEV_THREAD_TERM)
+ errno = EPROTO;
+ goto out;
+ default:
+ dprintf("port_get on port %d returned %u "
+ "(not PORT_SOURCE_TIMER)\n",
+ tcdp->tcd_port, port_event.portev_source);
+ errno = EPROTO;
+ goto out;
+ }
+
+ tcdp->tcd_overruns = port_event.portev_events - 1;
+ if (sigev_add_work(tcdp,
+ tcdp->tcd_notif.sigev_notify_function,
+ tcdp->tcd_notif.sigev_value) != 0)
+ break;
+ /* wait until job is done before looking for another */
+ tpool_wait(tcdp->tcd_poolp);
+ }
+out:
+ pthread_cleanup_pop(1);
+ return (NULL);
+}
+
+void *
+mqueue_spawner(void *arg)
+{
+ thread_communication_data_t *tcdp = (thread_communication_data_t *)arg;
+ int ret = 0;
+ int ntype;
+ void (*function)(union sigval);
+ union sigval argument;
+
+ /* destroy the pool if we are cancelled */
+ pthread_cleanup_push(sigev_destroy_pool, tcdp);
+
+ while (ret == 0) {
+ sig_mutex_lock(&tcdp->tcd_lock);
+ pthread_cleanup_push(sig_mutex_unlock, &tcdp->tcd_lock);
+ while ((ntype = tcdp->tcd_msg_enabled) == 0)
+ (void) sig_cond_wait(&tcdp->tcd_cv, &tcdp->tcd_lock);
+ pthread_cleanup_pop(1);
+
+ while (sem_wait(tcdp->tcd_msg_avail) == -1)
+ continue;
+
+ sig_mutex_lock(&tcdp->tcd_lock);
+ tcdp->tcd_msg_enabled = 0;
+ sig_mutex_unlock(&tcdp->tcd_lock);
+
+ /* ASSERT(ntype == SIGEV_THREAD || ntype == SIGEV_PORT); */
+ if (ntype == SIGEV_THREAD) {
+ function = tcdp->tcd_notif.sigev_notify_function;
+ argument.sival_ptr = tcdp->tcd_msg_userval;
+ ret = sigev_add_work(tcdp, function, argument);
+ } else { /* ntype == SIGEV_PORT */
+ ret = _port_dispatch(tcdp->tcd_port, 0, PORT_SOURCE_MQ,
+ 0, (uintptr_t)tcdp->tcd_msg_object,
+ tcdp->tcd_msg_userval);
+ }
+ }
+ sig_mutex_unlock(&tcdp->tcd_lock);
+
+ pthread_cleanup_pop(1);
+ return (NULL);
+}
+
+void *
+aio_spawner(void *arg)
+{
+ thread_communication_data_t *tcdp = (thread_communication_data_t *)arg;
+ int error = 0;
+ void (*function)(union sigval);
+ union sigval argument;
+ port_event_t port_event;
+ struct sigevent *sigevp;
+ timespec_t delta;
+ pthread_attr_t *attrp;
+
+ /* destroy the pool if we are cancelled */
+ pthread_cleanup_push(sigev_destroy_pool, tcdp);
+
+ while (error == 0) {
+ if (port_get(tcdp->tcd_port, &port_event, NULL) != 0) {
+ error = errno;
+ dprintf("port_get on port %d failed with %d <%s>\n",
+ tcdp->tcd_port, error, strerror(error));
+ break;
+ }
+ switch (port_event.portev_source) {
+ case PORT_SOURCE_AIO:
+ break;
+ case PORT_SOURCE_ALERT:
+ if (port_event.portev_events != SIGEV_THREAD_TERM)
+ errno = EPROTO;
+ goto out;
+ default:
+ dprintf("port_get on port %d returned %u "
+ "(not PORT_SOURCE_AIO)\n",
+ tcdp->tcd_port, port_event.portev_source);
+ errno = EPROTO;
+ goto out;
+ }
+ argument.sival_ptr = port_event.portev_user;
+ switch (port_event.portev_events) {
+ case AIOLIO:
+#if !defined(_LP64)
+ case AIOLIO64:
+#endif
+ sigevp = (struct sigevent *)port_event.portev_object;
+ function = sigevp->sigev_notify_function;
+ attrp = sigevp->sigev_notify_attributes;
+ break;
+ case AIOAREAD:
+ case AIOAWRITE:
+ case AIOFSYNC:
+ {
+ aiocb_t *aiocbp =
+ (aiocb_t *)port_event.portev_object;
+ function = aiocbp->aio_sigevent.sigev_notify_function;
+ attrp = aiocbp->aio_sigevent.sigev_notify_attributes;
+ break;
+ }
+#if !defined(_LP64)
+ case AIOAREAD64:
+ case AIOAWRITE64:
+ case AIOFSYNC64:
+ {
+ aiocb64_t *aiocbp =
+ (aiocb64_t *)port_event.portev_object;
+ function = aiocbp->aio_sigevent.sigev_notify_function;
+ attrp = aiocbp->aio_sigevent.sigev_notify_attributes;
+ break;
+ }
+#endif
+ default:
+ function = NULL;
+ attrp = NULL;
+ break;
+ }
+
+ if (function == NULL)
+ error = EINVAL;
+ else if (_pthread_attr_equal(attrp, tcdp->tcd_attrp))
+ error = sigev_add_work(tcdp, function, argument);
+ else {
+ /*
+ * The attributes don't match.
+ * Spawn a thread with the non-matching attributes.
+ */
+ pthread_attr_t local_attr;
+ sigev_thread_data_t *stdp;
+
+ if ((stdp = lmalloc(sizeof (*stdp))) == NULL)
+ error = ENOMEM;
+ else
+ error = _pthread_attr_clone(&local_attr, attrp);
+
+ if (error == 0) {
+ (void) pthread_attr_setdetachstate(
+ &local_attr, PTHREAD_CREATE_DETACHED);
+ (void) _pthread_attr_setdaemonstate_np(
+ &local_attr, PTHREAD_CREATE_DAEMON_NP);
+ stdp->std_func = function;
+ stdp->std_arg = argument;
+ error = pthread_create(NULL, &local_attr,
+ notify_thread, stdp);
+ (void) pthread_attr_destroy(&local_attr);
+ }
+ if (error && stdp != NULL)
+ lfree(stdp, sizeof (*stdp));
+ }
+
+ if (error) {
+ dprintf("Cannot add work, error=%d <%s>.\n",
+ error, strerror(error));
+ if (error == EAGAIN || error == ENOMEM) {
+ /* (Temporary) no resources are available. */
+ if (_port_dispatch(tcdp->tcd_port, 0,
+ PORT_SOURCE_AIO, port_event.portev_events,
+ port_event.portev_object,
+ port_event.portev_user) != 0)
+ break;
+ error = 0;
+ delta.tv_sec = 0;
+ delta.tv_nsec = NANOSEC / 20; /* 50 msec */
+ (void) nanosleep(&delta, NULL);
+ }
+ }
+ }
+out:
+ pthread_cleanup_pop(1);
+ return (NULL);
+}
+
+/*
+ * Allocate a thread_communication_data_t block.
+ */
+static thread_communication_data_t *
+alloc_sigev_handler(subsystem_t caller)
+{
+ thread_communication_data_t *tcdp;
+
+ if ((tcdp = lmalloc(sizeof (*tcdp))) != NULL) {
+ tcdp->tcd_subsystem = caller;
+ tcdp->tcd_port = -1;
+ (void) mutex_init(&tcdp->tcd_lock, USYNC_THREAD, NULL);
+ (void) cond_init(&tcdp->tcd_cv, USYNC_THREAD, NULL);
+ }
+ return (tcdp);
+}
+
+/*
+ * Free a thread_communication_data_t block.
+ */
+void
+free_sigev_handler(thread_communication_data_t *tcdp)
+{
+ if (tcdp->tcd_attrp) {
+ (void) pthread_attr_destroy(tcdp->tcd_attrp);
+ tcdp->tcd_attrp = NULL;
+ }
+ (void) memset(&tcdp->tcd_notif, 0, sizeof (tcdp->tcd_notif));
+
+ switch (tcdp->tcd_subsystem) {
+ case TIMER:
+ case AIO:
+ if (tcdp->tcd_port >= 0)
+ (void) close(tcdp->tcd_port);
+ break;
+ case MQ:
+ tcdp->tcd_msg_avail = NULL;
+ tcdp->tcd_msg_object = NULL;
+ tcdp->tcd_msg_userval = NULL;
+ tcdp->tcd_msg_enabled = 0;
+ break;
+ }
+
+ lfree(tcdp, sizeof (*tcdp));
+}
+
+/*
+ * Initialize data structure and create the port.
+ */
+thread_communication_data_t *
+setup_sigev_handler(const struct sigevent *sigevp, subsystem_t caller)
+{
+ thread_communication_data_t *tcdp;
+ int error;
+
+ if (sigevp == NULL) {
+ errno = EINVAL;
+ return (NULL);
+ }
+
+ if ((tcdp = alloc_sigev_handler(caller)) == NULL) {
+ errno = ENOMEM;
+ return (NULL);
+ }
+
+ if (sigevp->sigev_notify_attributes == NULL)
+ tcdp->tcd_attrp = NULL; /* default attributes */
+ else {
+ /*
+ * We cannot just copy the sigevp->sigev_notify_attributes
+ * pointer. We need to initialize a new pthread_attr_t
+ * structure with the values from the user-supplied
+ * pthread_attr_t.
+ */
+ tcdp->tcd_attrp = &tcdp->tcd_user_attr;
+ error = _pthread_attr_clone(tcdp->tcd_attrp,
+ sigevp->sigev_notify_attributes);
+ if (error) {
+ tcdp->tcd_attrp = NULL;
+ free_sigev_handler(tcdp);
+ errno = error;
+ return (NULL);
+ }
+ }
+ tcdp->tcd_notif = *sigevp;
+ tcdp->tcd_notif.sigev_notify_attributes = tcdp->tcd_attrp;
+
+ if (caller == TIMER || caller == AIO) {
+ if ((tcdp->tcd_port = port_create()) < 0 ||
+ fcntl(tcdp->tcd_port, FD_CLOEXEC) == -1) {
+ free_sigev_handler(tcdp);
+ errno = EBADF;
+ return (NULL);
+ }
+ }
+ return (tcdp);
+}
+
+/*
+ * Create a thread pool and launch the spawner.
+ */
+int
+launch_spawner(thread_communication_data_t *tcdp)
+{
+ int ret;
+ int maxworkers;
+ void *(*spawner)(void *);
+ sigset_t set;
+ sigset_t oset;
+
+ switch (tcdp->tcd_subsystem) {
+ case TIMER:
+ spawner = timer_spawner;
+ maxworkers = 1;
+ break;
+ case MQ:
+ spawner = mqueue_spawner;
+ maxworkers = 1;
+ break;
+ case AIO:
+ spawner = aio_spawner;
+ maxworkers = 100;
+ break;
+ default:
+ return (-1);
+ }
+ tcdp->tcd_poolp = tpool_create(1, maxworkers, 20,
+ tcdp->tcd_notif.sigev_notify_attributes);
+ if (tcdp->tcd_poolp == NULL)
+ return (-1);
+ /* create the spawner with all signals blocked */
+ (void) sigfillset(&set);
+ (void) thr_sigsetmask(SIG_SETMASK, &set, &oset);
+ ret = thr_create(NULL, 0, spawner, tcdp,
+ THR_DETACHED | THR_DAEMON, &tcdp->tcd_server_id);
+ (void) thr_sigsetmask(SIG_SETMASK, &oset, NULL);
+ if (ret != 0) {
+ tpool_destroy(tcdp->tcd_poolp);
+ tcdp->tcd_poolp = NULL;
+ return (-1);
+ }
+ return (0);
+}
+
+/*
+ * Delete the data associated with the sigev_thread timer, if timer is
+ * associated with such a notification option.
+ * Destroy the timer_spawner thread.
+ */
+int
+del_sigev_timer(timer_t timer)
+{
+ int rc = 0;
+ thread_communication_data_t *tcdp;
+
+ if ((uint_t)timer < timer_max && (tcdp = timer_tcd[timer]) != NULL) {
+ sig_mutex_lock(&tcdp->tcd_lock);
+ if (tcdp->tcd_port >= 0) {
+ if ((rc = port_alert(tcdp->tcd_port,
+ PORT_ALERT_SET, SIGEV_THREAD_TERM, NULL)) == 0) {
+ dprintf("del_sigev_timer(%d) OK.\n", timer);
+ }
+ }
+ timer_tcd[timer] = NULL;
+ sig_mutex_unlock(&tcdp->tcd_lock);
+ }
+ return (rc);
+}
+
+int
+sigev_timer_getoverrun(timer_t timer)
+{
+ thread_communication_data_t *tcdp;
+
+ if ((uint_t)timer < timer_max && (tcdp = timer_tcd[timer]) != NULL)
+ return (tcdp->tcd_overruns);
+ return (0);
+}
+
+static void
+del_sigev_mq_cleanup(thread_communication_data_t *tcdp)
+{
+ sig_mutex_unlock(&tcdp->tcd_lock);
+ free_sigev_handler(tcdp);
+}
+
+/*
+ * Delete the data associated with the sigev_thread message queue,
+ * if the message queue is associated with such a notification option.
+ * Destroy the mqueue_spawner thread.
+ */
+void
+del_sigev_mq(thread_communication_data_t *tcdp)
+{
+ pthread_t server_id;
+ int rc;
+
+ sig_mutex_lock(&tcdp->tcd_lock);
+
+ server_id = tcdp->tcd_server_id;
+ tcdp->tcd_msg_closing = 1;
+ if ((rc = pthread_cancel(server_id)) != 0) { /* "can't happen" */
+ sig_mutex_unlock(&tcdp->tcd_lock);
+		dprintf("Failed to cancel %u with error %d <%s>.\n",
+ server_id, rc, strerror(rc));
+ return;
+ }
+
+ /*
+ * wait for sigev_destroy_pool() to finish
+ */
+ pthread_cleanup_push(del_sigev_mq_cleanup, tcdp);
+ while (tcdp->tcd_server_id == server_id)
+ (void) sig_cond_wait(&tcdp->tcd_cv, &tcdp->tcd_lock);
+ pthread_cleanup_pop(1);
+}
+
+/*
+ * POSIX aio:
+ * If the notification type is SIGEV_THREAD, set up
+ * the port number for notifications. Create the
+ * thread pool and launch the spawner if necessary.
+ * If the notification type is not SIGEV_THREAD, do nothing.
+ */
+int
+_aio_sigev_thread_init(struct sigevent *sigevp)
+{
+ static mutex_t sigev_aio_lock = DEFAULTMUTEX;
+ static cond_t sigev_aio_cv = DEFAULTCV;
+ static int sigev_aio_busy = 0;
+
+ thread_communication_data_t *tcdp;
+ int port;
+ int rc = 0;
+
+ if (sigevp == NULL ||
+ sigevp->sigev_notify != SIGEV_THREAD ||
+ sigevp->sigev_notify_function == NULL)
+ return (0);
+
+ lmutex_lock(&sigev_aio_lock);
+ while (sigev_aio_busy)
+ (void) _cond_wait(&sigev_aio_cv, &sigev_aio_lock);
+ if ((tcdp = sigev_aio_tcd) != NULL)
+ port = tcdp->tcd_port;
+ else {
+ sigev_aio_busy = 1;
+ lmutex_unlock(&sigev_aio_lock);
+
+ tcdp = setup_sigev_handler(sigevp, AIO);
+ if (tcdp == NULL) {
+ port = -1;
+ rc = -1;
+ } else if (launch_spawner(tcdp) != 0) {
+ free_sigev_handler(tcdp);
+ tcdp = NULL;
+ port = -1;
+ rc = -1;
+ } else {
+ port = tcdp->tcd_port;
+ }
+
+ lmutex_lock(&sigev_aio_lock);
+ sigev_aio_tcd = tcdp;
+ sigev_aio_busy = 0;
+ (void) cond_broadcast(&sigev_aio_cv);
+ }
+ lmutex_unlock(&sigev_aio_lock);
+ sigevp->sigev_signo = port;
+ return (rc);
+}
+
+int
+_aio_sigev_thread(aiocb_t *aiocbp)
+{
+ if (aiocbp == NULL)
+ return (0);
+ return (_aio_sigev_thread_init(&aiocbp->aio_sigevent));
+}
+
+#if !defined(_LP64)
+int
+_aio_sigev_thread64(aiocb64_t *aiocbp)
+{
+ if (aiocbp == NULL)
+ return (0);
+ return (_aio_sigev_thread_init(&aiocbp->aio_sigevent));
+}
+#endif
+
+/*
+ * Cleanup POSIX aio after fork1() in the child process.
+ */
+void
+postfork1_child_sigev_aio(void)
+{
+ thread_communication_data_t *tcdp;
+
+ if ((tcdp = sigev_aio_tcd) != NULL) {
+ sigev_aio_tcd = NULL;
+ tcd_teardown(tcdp);
+ }
+}
+
+/*
+ * Utility function for the various postfork1_child_sigev_*() functions.
+ * Clean up the tcdp data structure and close the port.
+ */
+void
+tcd_teardown(thread_communication_data_t *tcdp)
+{
+ if (tcdp->tcd_poolp != NULL)
+ tpool_abandon(tcdp->tcd_poolp);
+ tcdp->tcd_poolp = NULL;
+ tcdp->tcd_server_id = 0;
+ free_sigev_handler(tcdp);
+}
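The machinery above is what backs SIGEV_THREAD notifications; a standalone timer sketch (on releases where these interfaces lived in librt, link with -lrt):

#include <signal.h>
#include <time.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

static void
expired(union sigval sv)
{
	/* runs on a thread-pool worker dispatched via timer_spawner() */
	(void) write(STDOUT_FILENO, "tick\n", 5);
}

int
main(void)
{
	struct sigevent sev;
	struct itimerspec its;
	timer_t tid;

	(void) memset(&sev, 0, sizeof (sev));
	sev.sigev_notify = SIGEV_THREAD;
	sev.sigev_notify_function = expired;
	sev.sigev_notify_attributes = NULL;	/* default pool attributes */
	if (timer_create(CLOCK_REALTIME, &sev, &tid) != 0) {
		perror("timer_create");
		return (1);
	}
	its.it_value.tv_sec = 1;
	its.it_value.tv_nsec = 0;
	its.it_interval = its.it_value;		/* fire once per second */
	(void) timer_settime(tid, 0, &its, NULL);
	(void) sleep(5);
	(void) timer_delete(tid);
	return (0);
}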
diff --git a/usr/src/lib/libc/port/rt/sigev_thread.h b/usr/src/lib/libc/port/rt/sigev_thread.h
new file mode 100644
index 0000000000..943cb8ab23
--- /dev/null
+++ b/usr/src/lib/libc/port/rt/sigev_thread.h
@@ -0,0 +1,117 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SIGEV_THREAD_H
+#define _SIGEV_THREAD_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <signal.h>
+#include <port.h>
+#include <mqueue.h>
+#include <time.h>
+#include <limits.h>
+#include <semaphore.h>
+#include <thread_pool.h>
+
+#define SIGEV_THREAD_TERM 1
+
+typedef enum {TIMER = 1, MQ, AIO} subsystem_t; /* Calling sub-system */
+
+typedef struct {
+ void (*std_func)(union sigval); /* User-defined notification function */
+ union sigval std_arg; /* Parameter of user-defined notification fct */
+} sigev_thread_data_t;
+
+typedef struct thread_communication_data {
+ struct thread_communication_data *tcd_next;
+ struct sigevent tcd_notif; /* encapsulates usr fct and usr vals */
+ pthread_attr_t tcd_user_attr; /* copy of caller's attributes */
+ pthread_attr_t *tcd_attrp; /* NULL if caller passed NULL */
+ int tcd_port; /* port this spawner is controlling */
+ thread_t tcd_server_id; /* thread id of server thread */
+ subsystem_t tcd_subsystem; /* event generating subsystem */
+ tpool_t *tcd_poolp; /* worker thread pool */
+ /* for creation/termination synchronization protocol */
+ mutex_t tcd_lock;
+ cond_t tcd_cv;
+ /* subsystem-specific data */
+ union {
+ struct {
+ int overruns; /* number of overruns */
+ } timer;
+ struct {
+ int msg_enabled; /* notification enabled */
+ int msg_closing; /* mq_close() is waiting */
+ sem_t *msg_avail; /* wait for message available */
+ void *msg_object; /* mqd_t */
+ void *msg_userval; /* notification user value */
+ } mqueue;
+ } tcd_object;
+} thread_communication_data_t;
+
+#define tcd_overruns tcd_object.timer.overruns
+
+#define tcd_msg_enabled tcd_object.mqueue.msg_enabled
+#define tcd_msg_closing tcd_object.mqueue.msg_closing
+#define tcd_msg_avail tcd_object.mqueue.msg_avail
+#define tcd_msg_object tcd_object.mqueue.msg_object
+#define tcd_msg_userval tcd_object.mqueue.msg_userval
+
+/* Generic functions common to all entities */
+extern thread_communication_data_t *setup_sigev_handler(
+ const struct sigevent *, subsystem_t);
+extern void free_sigev_handler(thread_communication_data_t *);
+extern int launch_spawner(thread_communication_data_t *);
+extern void tcd_teardown(thread_communication_data_t *);
+
+/* Additional functions for different entities */
+extern void *timer_spawner(void *);
+extern int del_sigev_timer(timer_t);
+extern int sigev_timer_getoverrun(timer_t);
+extern void *mqueue_spawner(void *);
+extern void del_sigev_mq(thread_communication_data_t *);
+extern void *aio_spawner(void *);
+
+/* Private interfaces elsewhere in libc */
+extern int _pthread_attr_clone(pthread_attr_t *, const pthread_attr_t *);
+extern int _pthread_attr_equal(const pthread_attr_t *, const pthread_attr_t *);
+extern int _port_dispatch(int, int, int, int, uintptr_t, void *);
+
+extern thread_communication_data_t *sigev_aio_tcd;
+
+extern int timer_max;
+extern thread_communication_data_t **timer_tcd;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SIGEV_THREAD_H */
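For the mqueue member of the union above, the application-side counterpart is mq_notify() with SIGEV_THREAD; a hedged sketch (queue name and sizes are arbitrary, and POSIX requires re-registering after each notification):

#include <mqueue.h>
#include <signal.h>
#include <string.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static mqd_t mqd;

static void
msg_ready(union sigval sv)
{
	char buf[128];		/* must be at least mq_msgsize */

	(void) mq_receive(mqd, buf, sizeof (buf), NULL);
	/* re-arm with another mq_notify() here if desired */
}

int
main(void)
{
	struct mq_attr attr;
	struct sigevent sev;

	attr.mq_flags = 0;
	attr.mq_maxmsg = 4;
	attr.mq_msgsize = 128;
	attr.mq_curmsgs = 0;
	mqd = mq_open("/demo_mq", O_CREAT | O_RDWR, 0644, &attr);
	if (mqd == (mqd_t)-1) {
		perror("mq_open");
		return (1);
	}
	(void) memset(&sev, 0, sizeof (sev));
	sev.sigev_notify = SIGEV_THREAD;
	sev.sigev_notify_function = msg_ready;
	if (mq_notify(mqd, &sev) != 0)
		perror("mq_notify");
	(void) pause();	/* an mq_send() from elsewhere triggers msg_ready() */
	return (0);
}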
diff --git a/usr/src/lib/libc/port/sys/fsync.c b/usr/src/lib/libc/port/sys/fsync.c
index d6827f60f3..f727d5914f 100644
--- a/usr/src/lib/libc/port/sys/fsync.c
+++ b/usr/src/lib/libc/port/sys/fsync.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -19,8 +18,9 @@
*
* CDDL HEADER END
*/
+
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -31,15 +31,20 @@
/*
* fsync(int fd)
- *
+ * fdatasync(int fd)
*/
#include "synonyms.h"
-#include <sys/types.h>
#include "libc.h"
-#include "sys/file.h"
+#include <sys/file.h>
int
_fsync(int fd)
{
return (__fdsync(fd, FSYNC));
}
+
+int
+fdatasync(int fd)
+{
+ return (__fdsync(fd, FDSYNC));
+}
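fdatasync() passes FDSYNC where fsync() passes FSYNC, so only the file data (not all metadata) is forced out; a hedged sketch of a durable-append helper (hypothetical name):

#include <unistd.h>

int
append_durable(int fd, const void *buf, size_t len)
{
	if (write(fd, buf, len) != (ssize_t)len)
		return (-1);
	/* data only; use fsync(fd) to force metadata out as well */
	return (fdatasync(fd));
}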
diff --git a/usr/src/lib/libc/port/sys/sigstack.c b/usr/src/lib/libc/port/sys/sigstack.c
index cf4335f2a2..9f34b2386a 100644
--- a/usr/src/lib/libc/port/sys/sigstack.c
+++ b/usr/src/lib/libc/port/sys/sigstack.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -30,6 +29,8 @@
#pragma ident "%Z%%M% %I% %E% SMI"
+#pragma weak sigstack = _sigstack
+
#include "synonyms.h"
#include <sys/types.h>
#include <sys/ucontext.h>
diff --git a/usr/src/lib/libc/port/threads/assfail.c b/usr/src/lib/libc/port/threads/assfail.c
index e64aaa87a5..989a36923a 100644
--- a/usr/src/lib/libc/port/threads/assfail.c
+++ b/usr/src/lib/libc/port/threads/assfail.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -19,8 +18,9 @@
*
* CDDL HEADER END
*/
+
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -91,8 +91,8 @@ Abort(const char *msg)
* Write a panic message w/o grabbing any locks other than assert_lock.
* We have no idea what locks are held at this point.
*/
-void
-thr_panic(const char *why)
+static void
+common_panic(const char *head, const char *why)
{
char msg[400]; /* no panic() message in the library is this long */
ulwp_t *self;
@@ -103,7 +103,7 @@ thr_panic(const char *why)
(void) _private_lwp_mutex_lock(&assert_lock);
(void) _private_memset(msg, 0, sizeof (msg));
- (void) strcpy(msg, "*** libc thread failure: ");
+ (void) strcpy(msg, head);
len1 = strlen(msg);
len2 = strlen(why);
if (len1 + len2 >= sizeof (msg))
@@ -116,6 +116,18 @@ thr_panic(const char *why)
Abort(msg);
}
+void
+thr_panic(const char *why)
+{
+ common_panic("*** libc thread failure: ", why);
+}
+
+void
+aio_panic(const char *why)
+{
+ common_panic("*** libc aio system failure: ", why);
+}
+
/*
* Utility function for converting a long integer to a string, avoiding stdio.
* 'base' must be one of 10 or 16
@@ -370,7 +382,8 @@ thread_error(const char *msg)
* We use __assfail() because the libc __assert() calls
* gettext() which calls malloc() which grabs a mutex.
* We do everything without calling standard i/o.
- * _assfail() is an exported function, __assfail() is private to libc.
+ * assfail() and _assfail() are exported functions;
+ * __assfail() is private to libc.
*/
#pragma weak _assfail = __assfail
void
@@ -416,3 +429,17 @@ __assfail(const char *assertion, const char *filename, int line_num)
*/
Abort(buf);
}
+
+/*
+ * We define and export this version of assfail() just because libaio
+ * used to define and export it, needlessly. Now that libaio is folded
+ * into libc, we need to continue this for ABI/version reasons.
+ * We don't use "#pragma weak assfail __assfail" in order to avoid
+ * warnings from the check_fnames utility at build time for libraries
+ * that define their own version of assfail().
+ */
+void
+assfail(const char *assertion, const char *filename, int line_num)
+{
+ __assfail(assertion, filename, line_num);
+}
diff --git a/usr/src/lib/libc/port/threads/pthr_attr.c b/usr/src/lib/libc/port/threads/pthr_attr.c
index 865c573dd0..bcae664e13 100644
--- a/usr/src/lib/libc/port/threads/pthr_attr.c
+++ b/usr/src/lib/libc/port/threads/pthr_attr.c
@@ -88,7 +88,6 @@ _pthread_attr_destroy(pthread_attr_t *attr)
/*
* _pthread_attr_clone: make a copy of a pthread_attr_t.
- * This is a consolidation-private interface, for librt.
*/
int
_pthread_attr_clone(pthread_attr_t *attr, const pthread_attr_t *old_attr)
@@ -231,7 +230,7 @@ _pthread_attr_getdetachstate(const pthread_attr_t *attr, int *detachstate)
/*
* pthread_attr_setdaemonstate_np: sets the daemon state to DAEMON or NONDAEMON.
* PTHREAD_CREATE_DAEMON is equivalent to thr_create(THR_DAEMON).
- * For now, this is a consolidation-private interface for librt.
+ * For now, this is a private interface in libc.
*/
int
_pthread_attr_setdaemonstate_np(pthread_attr_t *attr, int daemonstate)
@@ -249,7 +248,7 @@ _pthread_attr_setdaemonstate_np(pthread_attr_t *attr, int daemonstate)
/*
* pthread_attr_getdaemonstate_np: gets the daemon state.
- * For now, this is a consolidation-private interface for librt.
+ * For now, this is a private interface in libc.
*/
int
_pthread_attr_getdaemonstate_np(const pthread_attr_t *attr, int *daemonstate)
diff --git a/usr/src/lib/libc/port/threads/pthread.c b/usr/src/lib/libc/port/threads/pthread.c
index 5838a5aff7..2215647391 100644
--- a/usr/src/lib/libc/port/threads/pthread.c
+++ b/usr/src/lib/libc/port/threads/pthread.c
@@ -84,7 +84,7 @@ _pthread_create(pthread_t *thread, const pthread_attr_t *attr,
return (EINVAL);
mapped = 1;
mappedpri = priority;
- priority = _map_rtpri_to_gp(priority);
+ priority = map_rtpri_to_gp(priority);
ASSERT(priority >= THREAD_MIN_PRIORITY &&
priority <= THREAD_MAX_PRIORITY);
}
@@ -236,7 +236,7 @@ _thread_setschedparam_main(pthread_t tid, int policy,
}
mapped = 1;
mappedprio = prio;
- prio = _map_rtpri_to_gp(prio);
+ prio = map_rtpri_to_gp(prio);
ASSERT(prio >= THREAD_MIN_PRIORITY &&
prio <= THREAD_MAX_PRIORITY);
}
diff --git a/usr/src/lib/libc/port/threads/rtsched.c b/usr/src/lib/libc/port/threads/rtsched.c
index 60d3357655..a85118dc5c 100644
--- a/usr/src/lib/libc/port/threads/rtsched.c
+++ b/usr/src/lib/libc/port/threads/rtsched.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -19,8 +18,9 @@
*
* CDDL HEADER END
*/
+
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -39,9 +39,6 @@
* The following variables are used for caching information
 * for priocntl TS and RT scheduling classes.
*/
-struct pcclass ts_class, rt_class;
-
-static rtdpent_t *rt_dptbl; /* RT class parameter table */
static int rt_rrmin;
static int rt_rrmax;
static int rt_fifomin;
@@ -50,87 +47,6 @@ static int rt_othermin;
static int rt_othermax;
/*
- * Get the RT class parameter table
- */
-static void
-_get_rt_dptbl()
-{
- struct pcclass *pccp;
- pcadmin_t pcadmin;
- rtadmin_t rtadmin;
- size_t rtdpsize;
-
- pccp = &ts_class;
- /* get class's info */
- (void) strcpy(pccp->pcc_info.pc_clname, "TS");
- if (priocntl(P_PID, 0, PC_GETCID, (caddr_t)&(pccp->pcc_info)) < 0)
- goto out;
-
- pccp = &rt_class;
- /* get class's info */
- (void) strcpy(pccp->pcc_info.pc_clname, "RT");
- if (priocntl(P_PID, 0, PC_GETCID, (caddr_t)&(pccp->pcc_info)) < 0)
- goto out;
-
- /* get RT class dispatch table in rt_dptbl */
- pcadmin.pc_cid = rt_class.pcc_info.pc_cid;
- pcadmin.pc_cladmin = (caddr_t)&rtadmin;
- rtadmin.rt_cmd = RT_GETDPSIZE;
- if (priocntl(P_PID, 0, PC_ADMIN, (caddr_t)&pcadmin) < 0)
- goto out;
- rtdpsize = rtadmin.rt_ndpents * sizeof (rtdpent_t);
- if (rt_dptbl == NULL && (rt_dptbl = lmalloc(rtdpsize)) == NULL)
- goto out;
- rtadmin.rt_dpents = rt_dptbl;
- rtadmin.rt_cmd = RT_GETDPTBL;
- if (priocntl(P_PID, 0, PC_ADMIN, (caddr_t)&pcadmin) < 0)
- goto out;
- pccp->pcc_primin = 0;
- pccp->pcc_primax = ((rtinfo_t *)rt_class.pcc_info.pc_clinfo)->rt_maxpri;
- return;
-out:
- thr_panic("get_rt_dptbl failed");
-}
-
-/*
- * Translate RT class's user priority to global scheduling priority.
- * This is for priorities coming from librt.
- */
-pri_t
-_map_rtpri_to_gp(pri_t pri)
-{
- static mutex_t map_lock = DEFAULTMUTEX;
- static int mapped = 0;
- rtdpent_t *rtdp;
- pri_t gpri;
-
- if (!mapped) {
- lmutex_lock(&map_lock);
- if (!mapped) { /* do this only once */
- _get_rt_dptbl();
- mapped = 1;
- }
- lmutex_unlock(&map_lock);
- }
-
- /* First case is the default case, other two are seldomly taken */
- if (pri <= rt_dptbl[rt_class.pcc_primin].rt_globpri) {
- gpri = pri + rt_dptbl[rt_class.pcc_primin].rt_globpri -
- rt_class.pcc_primin;
- } else if (pri >= rt_dptbl[rt_class.pcc_primax].rt_globpri) {
- gpri = pri + rt_dptbl[rt_class.pcc_primax].rt_globpri -
- rt_class.pcc_primax;
- } else {
- gpri = rt_dptbl[rt_class.pcc_primin].rt_globpri + 1;
- for (rtdp = rt_dptbl+1; rtdp->rt_globpri < pri; ++rtdp, ++gpri)
- ;
- if (rtdp->rt_globpri > pri)
- --gpri;
- }
- return (gpri);
-}
-
-/*
* Set the RT priority/policy of a lwp/thread.
*/
int
@@ -175,30 +91,16 @@ _thrp_setlwpprio(lwpid_t lwpid, int policy, int pri)
static void
_init_rt_prio_ranges()
{
- pcinfo_t info;
-
- (void) strcpy(info.pc_clname, "RT");
- if (priocntl(P_PID, 0, PC_GETCID, (caddr_t)&info) == -1L)
- rt_fifomin = rt_rrmin = rt_fifomax = rt_rrmax = 0;
- else {
- rtinfo_t *rtinfop = (rtinfo_t *)info.pc_clinfo;
- rt_fifomin = rt_rrmin = 0;
- rt_fifomax = rt_rrmax = rtinfop->rt_maxpri;
- }
-
- (void) strcpy(info.pc_clname, "TS");
- if (priocntl(P_PID, 0, PC_GETCID, (caddr_t)&info) == -1L)
- rt_othermin = rt_othermax = 0;
- else {
- tsinfo_t *tsinfop = (tsinfo_t *)info.pc_clinfo;
- pri_t pri = tsinfop->ts_maxupri / 3;
- rt_othermin = -pri;
- rt_othermax = pri;
- }
+ rt_rrmin = sched_get_priority_min(SCHED_RR);
+ rt_rrmax = sched_get_priority_max(SCHED_RR);
+ rt_fifomin = sched_get_priority_min(SCHED_FIFO);
+ rt_fifomax = sched_get_priority_max(SCHED_FIFO);
+ rt_othermin = sched_get_priority_min(SCHED_OTHER);
+ rt_othermax = sched_get_priority_max(SCHED_OTHER);
}
/*
- * Validate priorities from librt.
+ * Validate priorities.
*/
int
_validate_rt_prio(int policy, int pri)
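With _init_rt_prio_ranges() now built on sched_get_priority_min()/sched_get_priority_max(), applications can query the same ranges directly:

#include <sched.h>
#include <stdio.h>

int
main(void)
{
	(void) printf("SCHED_FIFO: %d..%d\n",
	    sched_get_priority_min(SCHED_FIFO),
	    sched_get_priority_max(SCHED_FIFO));
	(void) printf("SCHED_RR:   %d..%d\n",
	    sched_get_priority_min(SCHED_RR),
	    sched_get_priority_max(SCHED_RR));
	return (0);
}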
diff --git a/usr/src/lib/libc/port/threads/scalls.c b/usr/src/lib/libc/port/threads/scalls.c
index b3287040f1..67a2a6341f 100644
--- a/usr/src/lib/libc/port/threads/scalls.c
+++ b/usr/src/lib/libc/port/threads/scalls.c
@@ -206,7 +206,7 @@ _fork1(void)
self->ul_siginfo.si_signo = 0;
udp->pid = _private_getpid();
/* reset the library's data structures to reflect one thread */
- _postfork1_child();
+ postfork1_child();
restore_signals(self);
_postfork_child_handler();
} else {
@@ -375,8 +375,8 @@ _forkall(void)
}
/*
- * Externally-callable cancellation prologue and epilogue
- * functions, for cancellation points outside of libc.
+ * Cancellation prologue and epilogue functions,
+ * for cancellation points too complex to include here.
*/
void
_cancel_prologue(void)
@@ -504,13 +504,14 @@ __xpg4_putpmsg(int fd, const struct strbuf *ctlptr,
PERFORM(_putpmsg(fd, ctlptr, dataptr, band, flags|MSG_XPG4))
}
+#pragma weak nanosleep = _nanosleep
int
-__nanosleep(const timespec_t *rqtp, timespec_t *rmtp)
+_nanosleep(const timespec_t *rqtp, timespec_t *rmtp)
{
int error;
PROLOGUE
- error = ___nanosleep(rqtp, rmtp);
+ error = __nanosleep(rqtp, rmtp);
EPILOGUE
if (error) {
errno = error;
@@ -519,8 +520,9 @@ __nanosleep(const timespec_t *rqtp, timespec_t *rmtp)
return (0);
}
+#pragma weak clock_nanosleep = _clock_nanosleep
int
-__clock_nanosleep(clockid_t clock_id, int flags,
+_clock_nanosleep(clockid_t clock_id, int flags,
const timespec_t *rqtp, timespec_t *rmtp)
{
timespec_t reltime;
@@ -550,7 +552,7 @@ __clock_nanosleep(clockid_t clock_id, int flags,
}
restart:
PROLOGUE
- error = ___nanosleep(&reltime, rmtp);
+ error = __nanosleep(&reltime, rmtp);
EPILOGUE
if (error == 0 && clock_id == CLOCK_HIGHRES) {
/*
@@ -607,7 +609,7 @@ _sleep(unsigned int sec)
ts.tv_sec = (time_t)sec;
ts.tv_nsec = 0;
PROLOGUE
- error = ___nanosleep(&ts, &tsr);
+ error = __nanosleep(&ts, &tsr);
EPILOGUE
if (error == EINTR) {
rem = (unsigned int)tsr.tv_sec;
@@ -626,7 +628,7 @@ _usleep(useconds_t usec)
ts.tv_sec = usec / MICROSEC;
ts.tv_nsec = (long)(usec % MICROSEC) * 1000;
PROLOGUE
- (void) ___nanosleep(&ts, NULL);
+ (void) __nanosleep(&ts, NULL);
EPILOGUE
return (0);
}
@@ -634,9 +636,11 @@ _usleep(useconds_t usec)
int
close(int fildes)
{
+ extern void _aio_close(int);
extern int _close(int);
int rv;
+ _aio_close(fildes);
PERFORM(_close(fildes))
}
@@ -856,17 +860,17 @@ _pollsys(struct pollfd *fds, nfds_t nfd, const timespec_t *timeout,
return (rv);
}
+#pragma weak sigtimedwait = _sigtimedwait
int
-__sigtimedwait(const sigset_t *set, siginfo_t *infop,
- const timespec_t *timeout)
+_sigtimedwait(const sigset_t *set, siginfo_t *infop, const timespec_t *timeout)
{
- extern int ___sigtimedwait(const sigset_t *, siginfo_t *,
+ extern int __sigtimedwait(const sigset_t *, siginfo_t *,
const timespec_t *);
siginfo_t info;
int sig;
PROLOGUE
- sig = ___sigtimedwait(set, &info, timeout);
+ sig = __sigtimedwait(set, &info, timeout);
if (sig == SIGCANCEL &&
(SI_FROMKERNEL(&info) || info.si_code == SI_LWP)) {
do_sigcancel();
@@ -883,7 +887,23 @@ __sigtimedwait(const sigset_t *set, siginfo_t *infop,
int
_sigwait(sigset_t *set)
{
- return (__sigtimedwait(set, NULL, NULL));
+ return (_sigtimedwait(set, NULL, NULL));
+}
+
+#pragma weak sigwaitinfo = _sigwaitinfo
+int
+_sigwaitinfo(const sigset_t *set, siginfo_t *info)
+{
+ return (_sigtimedwait(set, info, NULL));
+}
+
+#pragma weak sigqueue = _sigqueue
+int
+_sigqueue(pid_t pid, int signo, const union sigval value)
+{
+ extern int __sigqueue(pid_t pid, int signo,
+ /* const union sigval */ void *value, int si_code, int block);
+ return (__sigqueue(pid, signo, value.sival_ptr, SI_QUEUE, 0));
}
int
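A small round trip through the sigqueue() and sigwaitinfo() wrappers added above, queueing SIGUSR1 to self with an attached value:

#include <signal.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	sigset_t set;
	siginfo_t info;
	union sigval val;

	(void) sigemptyset(&set);
	(void) sigaddset(&set, SIGUSR1);
	(void) sigprocmask(SIG_BLOCK, &set, NULL);

	val.sival_int = 42;
	(void) sigqueue(getpid(), SIGUSR1, val);

	if (sigwaitinfo(&set, &info) == SIGUSR1)
		(void) printf("got value %d\n", info.si_value.sival_int);
	return (0);
}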
diff --git a/usr/src/lib/libc/port/threads/sigaction.c b/usr/src/lib/libc/port/threads/sigaction.c
index 670598961f..101b730af3 100644
--- a/usr/src/lib/libc/port/threads/sigaction.c
+++ b/usr/src/lib/libc/port/threads/sigaction.c
@@ -28,6 +28,7 @@
#include "lint.h"
#include "thr_uberdata.h"
+#include "asyncio.h"
#include <signal.h>
#include <siginfo.h>
#include <ucontext.h>
@@ -154,6 +155,22 @@ call_user_handler(int sig, siginfo_t *sip, ucontext_t *ucp)
do_sigcancel();
goto out;
}
+ /* SIGCANCEL is ignored by default */
+ if (uact.sa_sigaction == SIG_DFL ||
+ uact.sa_sigaction == SIG_IGN)
+ goto out;
+ }
+
+ /*
+ * If this thread has been sent SIGAIOCANCEL (SIGLWP) and
+ * we are an aio worker thread, cancel the aio request.
+ */
+ if (sig == SIGAIOCANCEL) {
+ aio_worker_t *aiowp = _pthread_getspecific(_aio_key);
+
+ if (sip != NULL && sip->si_code == SI_LWP && aiowp != NULL)
+ _siglongjmp(aiowp->work_jmp_buf, 1);
+ /* SIGLWP is ignored by default */
if (uact.sa_sigaction == SIG_DFL ||
uact.sa_sigaction == SIG_IGN)
goto out;
@@ -289,10 +306,9 @@ sigacthandler(int sig, siginfo_t *sip, void *uvp)
thr_panic("sigacthandler(): __setcontext() returned");
}
-#pragma weak sigaction = _libc_sigaction
-#pragma weak _sigaction = _libc_sigaction
+#pragma weak sigaction = _sigaction
int
-_libc_sigaction(int sig, const struct sigaction *nact, struct sigaction *oact)
+_sigaction(int sig, const struct sigaction *nact, struct sigaction *oact)
{
ulwp_t *self = curthread;
uberdata_t *udp = self->ul_uberdata;
@@ -341,10 +357,11 @@ _libc_sigaction(int sig, const struct sigaction *nact, struct sigaction *oact)
if (self->ul_vfork) {
if (tact.sa_sigaction != SIG_IGN)
tact.sa_sigaction = SIG_DFL;
- } else if (sig == SIGCANCEL) {
+ } else if (sig == SIGCANCEL || sig == SIGAIOCANCEL) {
/*
- * Always catch SIGCANCEL.
- * We need it for pthread_cancel() to work.
+ * Always catch these signals.
+ * We need SIGCANCEL for pthread_cancel() to work.
+ * We need SIGAIOCANCEL for aio_cancel() to work.
*/
udp->siguaction[sig].sig_uaction = tact;
if (tact.sa_sigaction == SIG_DFL ||
@@ -372,6 +389,16 @@ _libc_sigaction(int sig, const struct sigaction *nact, struct sigaction *oact)
oact->sa_sigaction != SIG_IGN)
*oact = oaction;
+ /*
+ * We detect setting the disposition of SIGIO just to set the
+ * _sigio_enabled flag for the asynchronous i/o (aio) code.
+ */
+ if (sig == SIGIO && rv == 0 && tactp != NULL) {
+ _sigio_enabled =
+ (tactp->sa_handler != SIG_DFL &&
+ tactp->sa_handler != SIG_IGN);
+ }
+
if (!self->ul_vfork)
lmutex_unlock(&udp->siguaction[sig].sig_lock);
return (rv);
@@ -619,18 +646,22 @@ do_sigcancel()
}
/*
- * Set up the SIGCANCEL handler for threads cancellation
- * (needed only when we have more than one thread).
- * We need no locks here because we are called from
- * finish_init() while still single-threaded.
+ * Set up the SIGCANCEL handler for threads cancellation,
+ * needed only when we have more than one thread,
+ * or the SIGAIOCANCEL handler for aio cancellation,
+ * called when aio is initialized, in __uaio_init().
*/
void
-init_sigcancel()
+setup_cancelsig(int sig)
{
uberdata_t *udp = curthread->ul_uberdata;
+ mutex_t *mp = &udp->siguaction[sig].sig_lock;
struct sigaction act;
- act = udp->siguaction[SIGCANCEL].sig_uaction;
+ ASSERT(sig == SIGCANCEL || sig == SIGAIOCANCEL);
+ lmutex_lock(mp);
+ act = udp->siguaction[sig].sig_uaction;
+ lmutex_unlock(mp);
if (act.sa_sigaction == SIG_DFL ||
act.sa_sigaction == SIG_IGN)
act.sa_flags = SA_SIGINFO;
@@ -640,5 +671,5 @@ init_sigcancel()
}
act.sa_sigaction = udp->sigacthandler;
act.sa_mask = maskset;
- (void) __sigaction(SIGCANCEL, &act, NULL);
+ (void) __sigaction(sig, &act, NULL);
}
diff --git a/usr/src/lib/libc/port/threads/spawn.c b/usr/src/lib/libc/port/threads/spawn.c
index 18a6d68e0b..143db8cf49 100644
--- a/usr/src/lib/libc/port/threads/spawn.c
+++ b/usr/src/lib/libc/port/threads/spawn.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -19,8 +18,9 @@
*
* CDDL HEADER END
*/
+
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -35,6 +35,7 @@
#include <sys/ts.h>
#include <alloca.h>
#include <spawn.h>
+#include "rtsched.h"
#define ALL_POSIX_SPAWN_FLAGS \
(POSIX_SPAWN_RESETIDS | \
@@ -65,6 +66,8 @@ typedef struct file_attr {
int fa_newfiledes; /* new file descriptor for dup2() */
} file_attr_t;
+extern struct pcclass ts_class, rt_class;
+
extern pid_t _vfork(void);
#pragma unknown_control_flow(_vfork)
extern void *_private_memset(void *, int, size_t);
@@ -631,7 +634,10 @@ _posix_spawnattr_setflags(
* Populate ts_class and rt_class.
* We will need them in the child of vfork().
*/
- (void) _map_rtpri_to_gp(0);
+ if (rt_class.pcc_state == 0)
+ (void) get_info_by_policy(SCHED_FIFO);
+ if (ts_class.pcc_state == 0)
+ (void) get_info_by_policy(SCHED_OTHER);
}
sap->sa_psflags = flags;
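ts_class and rt_class are consulted when spawn attributes carry scheduler settings; a hedged sketch of such a caller (running SCHED_RR normally requires privileges):

#include <spawn.h>
#include <sched.h>
#include <stdio.h>

extern char **environ;

int
main(void)
{
	posix_spawnattr_t attr;
	struct sched_param param;
	char *argv[] = { "date", NULL };
	pid_t pid;

	(void) posix_spawnattr_init(&attr);
	(void) posix_spawnattr_setflags(&attr, POSIX_SPAWN_SETSCHEDULER);
	(void) posix_spawnattr_setschedpolicy(&attr, SCHED_RR);
	param.sched_priority = sched_get_priority_min(SCHED_RR);
	(void) posix_spawnattr_setschedparam(&attr, &param);
	if (posix_spawnp(&pid, "date", NULL, &attr, argv, environ) != 0)
		perror("posix_spawnp");
	(void) posix_spawnattr_destroy(&attr);
	return (0);
}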
diff --git a/usr/src/lib/libc/port/threads/synch.c b/usr/src/lib/libc/port/threads/synch.c
index 6856ebcc6b..9c6e918620 100644
--- a/usr/src/lib/libc/port/threads/synch.c
+++ b/usr/src/lib/libc/port/threads/synch.c
@@ -2184,6 +2184,77 @@ lmutex_unlock(mutex_t *mp)
exit_critical(self);
}
+/*
+ * For specialized code in libc, like the asynchronous i/o code,
+ * the following sig_*() locking primitives are used in order
+ * to make the code asynchronous signal safe. Signals are
+ * deferred while locks acquired by these functions are held.
+ */
+void
+sig_mutex_lock(mutex_t *mp)
+{
+ sigoff(curthread);
+ (void) _private_mutex_lock(mp);
+}
+
+void
+sig_mutex_unlock(mutex_t *mp)
+{
+ (void) _private_mutex_unlock(mp);
+ sigon(curthread);
+}
+
+int
+sig_mutex_trylock(mutex_t *mp)
+{
+ int error;
+
+ sigoff(curthread);
+ if ((error = _private_mutex_trylock(mp)) != 0)
+ sigon(curthread);
+ return (error);
+}
+
+/*
+ * sig_cond_wait() is a cancellation point.
+ */
+int
+sig_cond_wait(cond_t *cv, mutex_t *mp)
+{
+ int error;
+
+ ASSERT(curthread->ul_sigdefer != 0);
+ _private_testcancel();
+ error = _cond_wait(cv, mp);
+ if (error == EINTR && curthread->ul_cursig) {
+ sig_mutex_unlock(mp);
+ /* take the deferred signal here */
+ sig_mutex_lock(mp);
+ }
+ _private_testcancel();
+ return (error);
+}
+
+/*
+ * sig_cond_reltimedwait() is a cancellation point.
+ */
+int
+sig_cond_reltimedwait(cond_t *cv, mutex_t *mp, const timespec_t *ts)
+{
+ int error;
+
+ ASSERT(curthread->ul_sigdefer != 0);
+ _private_testcancel();
+ error = _cond_reltimedwait(cv, mp, ts);
+ if (error == EINTR && curthread->ul_cursig) {
+ sig_mutex_unlock(mp);
+ /* take the deferred signal here */
+ sig_mutex_lock(mp);
+ }
+ _private_testcancel();
+ return (error);
+}
+
static int
shared_mutex_held(mutex_t *mparg)
{
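An application-level analogue of the sig_*() primitives above: block asynchronous signals for the duration of a critical section with pthread_sigmask() so no handler can run while the lock is held (inside libc the cheaper sigoff()/sigon() deferral is used instead):

#include <pthread.h>
#include <signal.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

void
critical_section(void)
{
	sigset_t all, old;

	(void) sigfillset(&all);
	(void) pthread_sigmask(SIG_BLOCK, &all, &old);
	(void) pthread_mutex_lock(&lock);
	/* ... state a signal handler must never see mid-update ... */
	(void) pthread_mutex_unlock(&lock);
	(void) pthread_sigmask(SIG_SETMASK, &old, NULL);
}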
diff --git a/usr/src/lib/libc/port/threads/thr.c b/usr/src/lib/libc/port/threads/thr.c
index 37310cea56..2a9f9e89e1 100644
--- a/usr/src/lib/libc/port/threads/thr.c
+++ b/usr/src/lib/libc/port/threads/thr.c
@@ -1490,6 +1490,9 @@ libc_init(void)
if (self->ul_primarymap && __tnf_probe_notify != NULL)
__tnf_probe_notify();
/* PROBE_SUPPORT end */
+
+ init_sigev_thread();
+ init_aio();
}
#pragma fini(libc_fini)
@@ -1562,7 +1565,7 @@ finish_init()
/*
* Set up the SIGCANCEL handler for threads cancellation.
*/
- init_sigcancel();
+ setup_cancelsig(SIGCANCEL);
/*
* Arrange to do special things on exit --
@@ -1596,7 +1599,7 @@ mark_dead_and_buried(ulwp_t *ulwp)
* Reset our data structures to reflect one lwp.
*/
void
-_postfork1_child()
+postfork1_child()
{
ulwp_t *self = curthread;
uberdata_t *udp = self->ul_uberdata;
@@ -1668,6 +1671,15 @@ _postfork1_child()
udp->nzombies = 0;
}
trim_stack_cache(0);
+
+ /*
+ * Do post-fork1 processing for subsystems that need it.
+ */
+ postfork1_child_tpool();
+ postfork1_child_sigev_aio();
+ postfork1_child_sigev_mq();
+ postfork1_child_sigev_timer();
+ postfork1_child_aio();
}
#pragma weak thr_setprio = _thr_setprio
@@ -1761,7 +1773,7 @@ force_continue(ulwp_t *ulwp)
if (ulwp->ul_stopping) { /* he is stopping himself */
ts.tv_sec = 0; /* give him a chance to run */
ts.tv_nsec = 100000; /* 100 usecs or clock tick */
- (void) ___nanosleep(&ts, NULL);
+ (void) __nanosleep(&ts, NULL);
}
if (!ulwp->ul_stopping) /* he is running now */
break; /* so we are done */
@@ -2203,10 +2215,8 @@ _ti_bind_clear(int bindflag)
* Also, signals are deferred at thread startup until TLS constructors
* have all been called, at which time _thr_setup() calls sigon().
*
- * _sigoff() and _sigon() are external consolidation-private interfaces
- * to sigoff() and sigon(), respectively, in libc. _sigdeferred() is
- * a consolidation-private interface that returns the deferred signal
- * number, if any. These are used in libnsl, librt, and libaio.
+ * _sigoff() and _sigon() are external consolidation-private interfaces to
+ * sigoff() and sigon(), respectively, in libc. These are used in libnsl.
* Also, _sigoff() and _sigon() are called from dbx's run-time checking
* (librtc.so) to defer signals during its critical sections (not to be
* confused with libc critical sections [see exit_critical() above]).
@@ -2223,12 +2233,6 @@ _sigon(void)
sigon(curthread);
}
-int
-_sigdeferred(void)
-{
- return (curthread->ul_cursig);
-}
-
void
sigon(ulwp_t *self)
{
diff --git a/usr/src/lib/libc/port/tpool/thread_pool.c b/usr/src/lib/libc/port/tpool/thread_pool.c
new file mode 100644
index 0000000000..5042f60301
--- /dev/null
+++ b/usr/src/lib/libc/port/tpool/thread_pool.c
@@ -0,0 +1,560 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include "synonyms.h"
+#include "thr_uberdata.h"
+#include <stdlib.h>
+#include <signal.h>
+#include <errno.h>
+#include "thread_pool_impl.h"
+
+static mutex_t thread_pool_lock = DEFAULTMUTEX;
+static tpool_t *thread_pools = NULL;
+
+static void
+delete_pool(tpool_t *tpool)
+{
+ tpool_job_t *job;
+
+ ASSERT(tpool->tp_current == 0 && tpool->tp_active == NULL);
+
+ /*
+ * Unlink the pool from the global list of all pools.
+ */
+ lmutex_lock(&thread_pool_lock);
+ if (thread_pools == tpool)
+ thread_pools = tpool->tp_forw;
+ if (thread_pools == tpool)
+ thread_pools = NULL;
+ else {
+ tpool->tp_back->tp_forw = tpool->tp_forw;
+ tpool->tp_forw->tp_back = tpool->tp_back;
+ }
+ lmutex_unlock(&thread_pool_lock);
+
+ /*
+ * There should be no pending jobs, but just in case...
+ */
+ for (job = tpool->tp_head; job != NULL; job = tpool->tp_head) {
+ tpool->tp_head = job->tpj_next;
+ lfree(job, sizeof (*job));
+ }
+ (void) pthread_attr_destroy(&tpool->tp_attr);
+ lfree(tpool, sizeof (*tpool));
+}
+
+/*
+ * Worker thread is terminating.
+ */
+static void
+worker_cleanup(tpool_t *tpool)
+{
+ ASSERT(MUTEX_HELD(&tpool->tp_mutex));
+
+ if (--tpool->tp_current == 0 &&
+ (tpool->tp_flags & (TP_DESTROY | TP_ABANDON))) {
+ if (tpool->tp_flags & TP_ABANDON) {
+ sig_mutex_unlock(&tpool->tp_mutex);
+ delete_pool(tpool);
+ return;
+ }
+ if (tpool->tp_flags & TP_DESTROY)
+ (void) cond_broadcast(&tpool->tp_busycv);
+ }
+ sig_mutex_unlock(&tpool->tp_mutex);
+}
+
+static void
+notify_waiters(tpool_t *tpool)
+{
+ if (tpool->tp_head == NULL && tpool->tp_active == NULL) {
+ tpool->tp_flags &= ~TP_WAIT;
+ (void) cond_broadcast(&tpool->tp_waitcv);
+ }
+}
+
+/*
+ * Called by a worker thread on return from a tpool_dispatch()d job.
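+ * This runs as a cancellation cleanup handler around the job
+ * function. It reacquires tp_mutex and deliberately returns with it
+ * held, because the worker's main loop resumes holding the lock.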
+ */
+static void
+job_cleanup(tpool_t *tpool)
+{
+ pthread_t my_tid = pthread_self();
+ tpool_active_t *activep;
+ tpool_active_t **activepp;
+
+ sig_mutex_lock(&tpool->tp_mutex);
+ /* CSTYLED */
+ for (activepp = &tpool->tp_active;; activepp = &activep->tpa_next) {
+ activep = *activepp;
+ if (activep->tpa_tid == my_tid) {
+ *activepp = activep->tpa_next;
+ break;
+ }
+ }
+ if (tpool->tp_flags & TP_WAIT)
+ notify_waiters(tpool);
+}
+
+static void *
+tpool_worker(void *arg)
+{
+ tpool_t *tpool = (tpool_t *)arg;
+ int elapsed;
+ tpool_job_t *job;
+ void (*func)(void *);
+ tpool_active_t active;
+
+ sig_mutex_lock(&tpool->tp_mutex);
+ pthread_cleanup_push(worker_cleanup, tpool);
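+
+ /*
+ * Note: worker_cleanup() runs with tp_mutex held; the handler is
+ * registered while the mutex is held, every break from the loop
+ * below occurs with it held, and cancellation cleanup reacquires
+ * it before the handlers run.
+ */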
+
+ /*
+ * This is the worker's main loop.
+ * It is left only when a timeout or an error has occurred, or when
+ * the pool is being destroyed or abandoned.
+ */
+ active.tpa_tid = pthread_self();
+ for (;;) {
+ elapsed = 0;
+ tpool->tp_idle++;
+ if (tpool->tp_flags & TP_WAIT)
+ notify_waiters(tpool);
+ while ((tpool->tp_head == NULL ||
+ (tpool->tp_flags & TP_SUSPEND)) &&
+ !(tpool->tp_flags & (TP_DESTROY | TP_ABANDON))) {
+ if (tpool->tp_current <= tpool->tp_minimum ||
+ tpool->tp_linger == 0) {
+ (void) sig_cond_wait(&tpool->tp_workcv,
+ &tpool->tp_mutex);
+ } else {
+ timestruc_t timeout;
+
+ timeout.tv_sec = tpool->tp_linger;
+ timeout.tv_nsec = 0;
+ if (sig_cond_reltimedwait(&tpool->tp_workcv,
+ &tpool->tp_mutex, &timeout) != 0) {
+ elapsed = 1;
+ break;
+ }
+ }
+ }
+ tpool->tp_idle--;
+ if (tpool->tp_flags & TP_DESTROY)
+ break;
+ if (tpool->tp_flags & TP_ABANDON) {
+ /* can't abandon a suspended pool */
+ if (tpool->tp_flags & TP_SUSPEND) {
+ tpool->tp_flags &= ~TP_SUSPEND;
+ (void) cond_broadcast(&tpool->tp_workcv);
+ }
+ if (tpool->tp_head == NULL)
+ break;
+ }
+ if ((job = tpool->tp_head) != NULL &&
+ !(tpool->tp_flags & TP_SUSPEND)) {
+ elapsed = 0;
+ func = job->tpj_func;
+ arg = job->tpj_arg;
+ tpool->tp_head = job->tpj_next;
+ if (job == tpool->tp_tail)
+ tpool->tp_tail = NULL;
+ tpool->tp_njobs--;
+ active.tpa_next = tpool->tp_active;
+ tpool->tp_active = &active;
+ sig_mutex_unlock(&tpool->tp_mutex);
+ pthread_cleanup_push(job_cleanup, tpool);
+ lfree(job, sizeof (*job));
+ /*
+ * Call the specified function.
+ */
+ func(arg);
+ /*
+ * We don't know what this thread has been doing,
+ * so we reset its signal mask and cancellation
+ * state back to the initial values.
+ */
+ (void) pthread_sigmask(SIG_SETMASK, &maskset, NULL);
+ (void) pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED,
+ NULL);
+ (void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE,
+ NULL);
+ pthread_cleanup_pop(1);
+ }
+ if (elapsed && tpool->tp_current > tpool->tp_minimum) {
+ /*
+ * We timed out and there is no work to be done
+ * and the number of workers exceeds the minimum.
+ * Exit now to reduce the size of the pool.
+ */
+ break;
+ }
+ }
+ pthread_cleanup_pop(1);
+ return (arg);
+}
+
+/*
+ * Create a worker thread, with all signals blocked.
+ */
+static int
+create_worker(tpool_t *tpool)
+{
+ sigset_t oset;
+ int error;
+
+ (void) pthread_sigmask(SIG_SETMASK, &maskset, &oset);
+ error = pthread_create(NULL, &tpool->tp_attr, tpool_worker, tpool);
+ (void) pthread_sigmask(SIG_SETMASK, &oset, NULL);
+ return (error);
+}
+
+tpool_t *
+tpool_create(uint_t min_threads, uint_t max_threads, uint_t linger,
+ pthread_attr_t *attr)
+{
+ tpool_t *tpool;
+ void *stackaddr;
+ size_t stacksize;
+ size_t minstack;
+ int error;
+
+ if (min_threads > max_threads || max_threads < 1) {
+ errno = EINVAL;
+ return (NULL);
+ }
+ if (attr != NULL) {
+ if (pthread_attr_getstack(attr, &stackaddr, &stacksize) != 0) {
+ errno = EINVAL;
+ return (NULL);
+ }
+ /*
+ * Allow only one thread in the pool with a specified stack.
+ * Require threads to have at least the minimum stack size.
+ */
+ minstack = thr_min_stack();
+ if (stackaddr != NULL) {
+ if (stacksize < minstack || max_threads != 1) {
+ errno = EINVAL;
+ return (NULL);
+ }
+ } else if (stacksize != 0 && stacksize < minstack) {
+ errno = EINVAL;
+ return (NULL);
+ }
+ }
+
+ tpool = lmalloc(sizeof (*tpool));
+ if (tpool == NULL) {
+ errno = ENOMEM;
+ return (NULL);
+ }
+ (void) mutex_init(&tpool->tp_mutex, USYNC_THREAD, NULL);
+ (void) cond_init(&tpool->tp_busycv, USYNC_THREAD, NULL);
+ (void) cond_init(&tpool->tp_workcv, USYNC_THREAD, NULL);
+ (void) cond_init(&tpool->tp_waitcv, USYNC_THREAD, NULL);
+ tpool->tp_minimum = min_threads;
+ tpool->tp_maximum = max_threads;
+ tpool->tp_linger = linger;
+
+ /*
+ * We cannot just copy the attribute pointer.
+ * We need to initialize a new pthread_attr_t structure
+ * with the values from the user-supplied pthread_attr_t.
+ * If the attribute pointer is NULL, we need to initialize
+ * the new pthread_attr_t structure with default values.
+ */
+ error = _pthread_attr_clone(&tpool->tp_attr, attr);
+ if (error) {
+ lfree(tpool, sizeof (*tpool));
+ errno = error;
+ return (NULL);
+ }
+
+ /* make all pool threads be detached daemon threads */
+ (void) pthread_attr_setdetachstate(&tpool->tp_attr,
+ PTHREAD_CREATE_DETACHED);
+ (void) _pthread_attr_setdaemonstate_np(&tpool->tp_attr,
+ PTHREAD_CREATE_DAEMON_NP);
+
+ /* insert into the global list of all thread pools */
+ lmutex_lock(&thread_pool_lock);
+ if (thread_pools == NULL) {
+ tpool->tp_forw = tpool;
+ tpool->tp_back = tpool;
+ thread_pools = tpool;
+ } else {
+ thread_pools->tp_back->tp_forw = tpool;
+ tpool->tp_forw = thread_pools;
+ tpool->tp_back = thread_pools->tp_back;
+ thread_pools->tp_back = tpool;
+ }
+ lmutex_unlock(&thread_pool_lock);
+
+ return (tpool);
+}
+
+/*
+ * Dispatch a work request to the thread pool.
+ * If there are idle workers, awaken one.
+ * Else, if the maximum number of workers has
+ * not been reached, spawn a new worker thread.
+ * Else just return with the job added to the queue.
+ */
+int
+tpool_dispatch(tpool_t *tpool, void (*func)(void *), void *arg)
+{
+ tpool_job_t *job;
+
+ ASSERT(!(tpool->tp_flags & (TP_DESTROY | TP_ABANDON)));
+
+ if ((job = lmalloc(sizeof (*job))) == NULL)
+ return (-1);
+ job->tpj_next = NULL;
+ job->tpj_func = func;
+ job->tpj_arg = arg;
+
+ sig_mutex_lock(&tpool->tp_mutex);
+
+ if (tpool->tp_head == NULL)
+ tpool->tp_head = job;
+ else
+ tpool->tp_tail->tpj_next = job;
+ tpool->tp_tail = job;
+ tpool->tp_njobs++;
+
+ if (!(tpool->tp_flags & TP_SUSPEND)) {
+ if (tpool->tp_idle > 0)
+ (void) cond_signal(&tpool->tp_workcv);
+ else if (tpool->tp_current < tpool->tp_maximum &&
+ create_worker(tpool) == 0)
+ tpool->tp_current++;
+ }
+
+ sig_mutex_unlock(&tpool->tp_mutex);
+ return (0);
+}
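+
+/*
+ * Illustrative usage sketch (a hypothetical caller, not part of this
+ * file): create a pool of 1 to 8 workers that linger for 10 seconds,
+ * dispatch jobs, then wait for them and destroy the pool.
+ *
+ *	static void my_job(void *arg) { process(arg); }
+ *
+ *	tpool_t *tp = tpool_create(1, 8, 10, NULL);
+ *	if (tp != NULL) {
+ *		for (i = 0; i < njobs; i++)
+ *			(void) tpool_dispatch(tp, my_job, &jobs[i]);
+ *		tpool_wait(tp);
+ *		tpool_destroy(tp);
+ *	}
+ *
+ * my_job(), process(), njobs and jobs[] are assumptions made up for
+ * the example.
+ */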
+
+/*
+ * Assumes: by the time tpool_destroy() is called no one will use this
+ * thread pool in any way and no one will try to dispatch entries to it.
+ * Calling tpool_destroy() from a job in the pool will cause deadlock.
+ */
+void
+tpool_destroy(tpool_t *tpool)
+{
+ tpool_active_t *activep;
+
+ ASSERT(!tpool_member(tpool));
+ ASSERT(!(tpool->tp_flags & (TP_DESTROY | TP_ABANDON)));
+
+ sig_mutex_lock(&tpool->tp_mutex);
+ pthread_cleanup_push(sig_mutex_unlock, &tpool->tp_mutex);
+
+ /* mark the pool as being destroyed; wakeup idle workers */
+ tpool->tp_flags |= TP_DESTROY;
+ tpool->tp_flags &= ~TP_SUSPEND;
+ (void) cond_broadcast(&tpool->tp_workcv);
+
+ /* cancel all active workers */
+ for (activep = tpool->tp_active; activep; activep = activep->tpa_next)
+ (void) pthread_cancel(activep->tpa_tid);
+
+ /* wait for all active workers to finish */
+ while (tpool->tp_active != NULL) {
+ tpool->tp_flags |= TP_WAIT;
+ (void) sig_cond_wait(&tpool->tp_waitcv, &tpool->tp_mutex);
+ }
+
+ /* the last worker to terminate will wake us up */
+ while (tpool->tp_current != 0)
+ (void) sig_cond_wait(&tpool->tp_busycv, &tpool->tp_mutex);
+
+ pthread_cleanup_pop(1); /* sig_mutex_unlock(&tpool->tp_mutex); */
+ delete_pool(tpool);
+}
+
+/*
+ * Like tpool_destroy(), but don't cancel workers or wait for them to finish.
+ * The last worker to terminate will delete the pool.
+ */
+void
+tpool_abandon(tpool_t *tpool)
+{
+ ASSERT(!(tpool->tp_flags & (TP_DESTROY | TP_ABANDON)));
+
+ sig_mutex_lock(&tpool->tp_mutex);
+ if (tpool->tp_current == 0) {
+ /* no workers, just delete the pool */
+ sig_mutex_unlock(&tpool->tp_mutex);
+ delete_pool(tpool);
+ } else {
+ /* wake up all workers, last one will delete the pool */
+ tpool->tp_flags |= TP_ABANDON;
+ tpool->tp_flags &= ~TP_SUSPEND;
+ (void) cond_broadcast(&tpool->tp_workcv);
+ sig_mutex_unlock(&tpool->tp_mutex);
+ }
+}
+
+/*
+ * Wait for all jobs to complete.
+ * Calling tpool_wait() from a job in the pool will cause deadlock.
+ */
+void
+tpool_wait(tpool_t *tpool)
+{
+ ASSERT(!tpool_member(tpool));
+ ASSERT(!(tpool->tp_flags & (TP_DESTROY | TP_ABANDON)));
+
+ sig_mutex_lock(&tpool->tp_mutex);
+ pthread_cleanup_push(sig_mutex_unlock, &tpool->tp_mutex);
+ while (tpool->tp_head != NULL || tpool->tp_active != NULL) {
+ tpool->tp_flags |= TP_WAIT;
+ (void) sig_cond_wait(&tpool->tp_waitcv, &tpool->tp_mutex);
+ ASSERT(!(tpool->tp_flags & (TP_DESTROY | TP_ABANDON)));
+ }
+ pthread_cleanup_pop(1); /* sig_mutex_unlock(&tpool->tp_mutex); */
+}
+
+void
+tpool_suspend(tpool_t *tpool)
+{
+ ASSERT(!(tpool->tp_flags & (TP_DESTROY | TP_ABANDON)));
+
+ sig_mutex_lock(&tpool->tp_mutex);
+ tpool->tp_flags |= TP_SUSPEND;
+ sig_mutex_unlock(&tpool->tp_mutex);
+}
+
+int
+tpool_suspended(tpool_t *tpool)
+{
+ int suspended;
+
+ ASSERT(!(tpool->tp_flags & (TP_DESTROY | TP_ABANDON)));
+
+ sig_mutex_lock(&tpool->tp_mutex);
+ suspended = (tpool->tp_flags & TP_SUSPEND) != 0;
+ sig_mutex_unlock(&tpool->tp_mutex);
+
+ return (suspended);
+}
+
+void
+tpool_resume(tpool_t *tpool)
+{
+ int excess;
+
+ ASSERT(!(tpool->tp_flags & (TP_DESTROY | TP_ABANDON)));
+
+ sig_mutex_lock(&tpool->tp_mutex);
+ if (!(tpool->tp_flags & TP_SUSPEND)) {
+ sig_mutex_unlock(&tpool->tp_mutex);
+ return;
+ }
+ tpool->tp_flags &= ~TP_SUSPEND;
+ (void) cond_broadcast(&tpool->tp_workcv);
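+ /*
+ * Create additional workers to cover queued jobs that the
+ * awakened idle workers cannot absorb, up to tp_maximum.
+ */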
+ excess = tpool->tp_njobs - tpool->tp_idle;
+ while (excess-- > 0 && tpool->tp_current < tpool->tp_maximum) {
+ if (create_worker(tpool) != 0)
+ break; /* pthread_create() failed */
+ tpool->tp_current++;
+ }
+ sig_mutex_unlock(&tpool->tp_mutex);
+}
+
+int
+tpool_member(tpool_t *tpool)
+{
+ pthread_t my_tid = pthread_self();
+ tpool_active_t *activep;
+
+ ASSERT(!(tpool->tp_flags & (TP_DESTROY | TP_ABANDON)));
+
+ sig_mutex_lock(&tpool->tp_mutex);
+ for (activep = tpool->tp_active; activep; activep = activep->tpa_next) {
+ if (activep->tpa_tid == my_tid) {
+ sig_mutex_unlock(&tpool->tp_mutex);
+ return (1);
+ }
+ }
+ sig_mutex_unlock(&tpool->tp_mutex);
+ return (0);
+}
+
+void
+postfork1_child_tpool(void)
+{
+ pthread_t my_tid = pthread_self();
+ tpool_t *tpool;
+ tpool_job_t *job;
+
+ /*
+ * All of the thread pool workers are gone, except possibly
+ * for the current thread, if it is a thread pool worker thread.
+ * Retain the thread pools, but make them all empty. Whatever
+ * jobs were queued or running belong to the parent process.
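+ * This is called from postfork1_child() in thr.c, alongside the
+ * post-fork1 handlers of the other subsystems.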
+ */
+top:
+ if ((tpool = thread_pools) == NULL)
+ return;
+
+ do {
+ tpool_active_t *activep;
+
+ (void) mutex_init(&tpool->tp_mutex, USYNC_THREAD, NULL);
+ (void) cond_init(&tpool->tp_busycv, USYNC_THREAD, NULL);
+ (void) cond_init(&tpool->tp_workcv, USYNC_THREAD, NULL);
+ (void) cond_init(&tpool->tp_waitcv, USYNC_THREAD, NULL);
+ for (job = tpool->tp_head; job; job = tpool->tp_head) {
+ tpool->tp_head = job->tpj_next;
+ lfree(job, sizeof (*job));
+ }
+ tpool->tp_tail = NULL;
+ tpool->tp_njobs = 0;
+ for (activep = tpool->tp_active; activep;
+ activep = activep->tpa_next) {
+ if (activep->tpa_tid == my_tid) {
+ activep->tpa_next = NULL;
+ break;
+ }
+ }
+ tpool->tp_idle = 0;
+ tpool->tp_current = 0;
+ if ((tpool->tp_active = activep) != NULL)
+ tpool->tp_current = 1;
+ tpool->tp_flags &= ~TP_WAIT;
+ if (tpool->tp_flags & (TP_DESTROY | TP_ABANDON)) {
+ tpool->tp_flags &= ~TP_DESTROY;
+ tpool->tp_flags |= TP_ABANDON;
+ if (tpool->tp_current == 0) {
+ delete_pool(tpool);
+ goto top; /* start over */
+ }
+ }
+ } while ((tpool = tpool->tp_forw) != thread_pools);
+}
diff --git a/usr/src/lib/libc/port/tpool/thread_pool_impl.h b/usr/src/lib/libc/port/tpool/thread_pool_impl.h
new file mode 100644
index 0000000000..66611778a0
--- /dev/null
+++ b/usr/src/lib/libc/port/tpool/thread_pool_impl.h
@@ -0,0 +1,99 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _THREAD_POOL_IMPL_H
+#define _THREAD_POOL_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <thread_pool.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Thread pool implementation definitions.
+ * See <thread_pool.h> for interface declarations.
+ */
+
+/*
+ * FIFO queued job
+ */
+typedef struct tpool_job tpool_job_t;
+struct tpool_job {
+ tpool_job_t *tpj_next; /* list of jobs */
+ void (*tpj_func)(void *); /* function to call */
+ void *tpj_arg; /* its argument */
+};
+
+/*
+ * List of active threads, linked through their stacks.
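+ * (Each entry is the tpool_active_t declared as a local variable
+ * in tpool_worker(), so it lives on that worker's stack.)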
+ */
+typedef struct tpool_active tpool_active_t;
+struct tpool_active {
+ tpool_active_t *tpa_next; /* list of active threads */
+ pthread_t tpa_tid; /* active thread id */
+};
+
+/*
+ * The thread pool.
+ */
+struct tpool {
+ tpool_t *tp_forw; /* circular list of all thread pools */
+ tpool_t *tp_back;
+ mutex_t tp_mutex; /* protects the pool data */
+ cond_t tp_busycv; /* synchronization in tpool_destroy() */
+ cond_t tp_workcv; /* synchronization with workers */
+ cond_t tp_waitcv; /* synchronization in tpool_wait() */
+ tpool_active_t *tp_active; /* threads performing work */
+ tpool_job_t *tp_head; /* FIFO job queue */
+ tpool_job_t *tp_tail;
+ pthread_attr_t tp_attr; /* attributes of the workers */
+ int tp_flags; /* see below */
+ uint_t tp_linger; /* seconds before idle workers exit */
+ int tp_njobs; /* number of jobs in job queue */
+ int tp_minimum; /* minimum number of worker threads */
+ int tp_maximum; /* maximum number of worker threads */
+ int tp_current; /* current number of worker threads */
+ int tp_idle; /* number of idle workers */
+};
+
+/* tp_flags */
+#define TP_WAIT 0x01 /* waiting in tpool_wait() */
+#define TP_SUSPEND 0x02 /* pool is being suspended */
+#define TP_DESTROY 0x04 /* pool is being destroyed */
+#define TP_ABANDON 0x08 /* pool is abandoned (auto-destroy) */
+
+extern int _pthread_attr_clone(pthread_attr_t *, const pthread_attr_t *);
+
+extern const sigset_t maskset; /* set of all maskable signals */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _THREAD_POOL_IMPL_H */
diff --git a/usr/src/lib/libc/sparc/Makefile b/usr/src/lib/libc/sparc/Makefile
index 7ce71b3756..50fa5f8c45 100644
--- a/usr/src/lib/libc/sparc/Makefile
+++ b/usr/src/lib/libc/sparc/Makefile
@@ -777,6 +777,24 @@ PORTI18N_COND= \
wcstol_longlong.o \
wcstoul_longlong.o
+AIOOBJS= \
+ aio.o \
+ aio_alloc.o \
+ posix_aio.o
+
+RTOBJS= \
+ clock_timer.o \
+ fallocate.o \
+ mqueue.o \
+ pos4obj.o \
+ sched.o \
+ sem.o \
+ shm.o \
+ sigev_thread.o
+
+TPOOLOBJS= \
+ thread_pool.o
+
THREADSOBJS= \
alloc.o \
assfail.o \
@@ -899,6 +917,9 @@ MOSTOBJS= \
$(PORTSTDIO_W) \
$(PORTSYS) \
$(PORTSYS64) \
+ $(AIOOBJS) \
+ $(RTOBJS) \
+ $(TPOOLOBJS) \
$(THREADSOBJS) \
$(THREADSMACHOBJS) \
$(THREADSASMOBJS) \
@@ -1004,6 +1025,9 @@ SRCS= \
$(PORTREGEX:%.o=../port/regex/%.c) \
$(PORTSTDIO:%.o=../port/stdio/%.c) \
$(PORTSYS:%.o=../port/sys/%.c) \
+ $(AIOOBJS:%.o=../port/aio/%.c) \
+ $(RTOBJS:%.o=../port/rt/%.c) \
+ $(TPOOLOBJS:%.o=../port/tpool/%.c) \
$(THREADSOBJS:%.o=../port/threads/%.c) \
$(THREADSMACHOBJS:%.o=../$(MACH)/threads/%.c) \
$(UNWINDMACHOBJS:%.o=../port/unwind/%.c) \
@@ -1033,6 +1057,7 @@ $(MAPFILE):
# Files which need the threads .il inline template
TIL= \
+ aio.o \
alloc.o \
assfail.o \
atexit.o \
@@ -1042,7 +1067,9 @@ TIL= \
errno.o \
getctxt.o \
lwp.o \
+ ma.o \
machdep.o \
+ posix_aio.o \
pthr_attr.o \
pthr_barrier.o \
pthr_cond.o \
@@ -1055,6 +1082,7 @@ TIL= \
scalls.o \
sema.o \
sigaction.o \
+ sigev_thread.o \
spawn.o \
stack.o \
swapctxt.o \
@@ -1062,6 +1090,7 @@ TIL= \
tdb_agent.o \
thr.o \
thread_interface.o \
+ thread_pool.o \
tls.o \
tsd.o \
unwind.o
diff --git a/usr/src/lib/libc/sparcv9/Makefile b/usr/src/lib/libc/sparcv9/Makefile
index e5810b8bd2..3918386307 100644
--- a/usr/src/lib/libc/sparcv9/Makefile
+++ b/usr/src/lib/libc/sparcv9/Makefile
@@ -725,6 +725,24 @@ PORTI18N_COND= \
wcstol_longlong.o \
wcstoul_longlong.o
+AIOOBJS= \
+ aio.o \
+ aio_alloc.o \
+ posix_aio.o
+
+RTOBJS= \
+ clock_timer.o \
+ fallocate.o \
+ mqueue.o \
+ pos4obj.o \
+ sched.o \
+ sem.o \
+ shm.o \
+ sigev_thread.o
+
+TPOOLOBJS= \
+ thread_pool.o
+
THREADSOBJS= \
alloc.o \
assfail.o \
@@ -844,6 +862,9 @@ MOSTOBJS= \
$(PORTSTDIO_W) \
$(PORTSYS) \
$(PORTSYS64) \
+ $(AIOOBJS) \
+ $(RTOBJS) \
+ $(TPOOLOBJS) \
$(THREADSOBJS) \
$(THREADSMACHOBJS) \
$(THREADSASMOBJS) \
@@ -949,6 +970,9 @@ SRCS= \
$(PORTREGEX:%.o=../port/regex/%.c) \
$(PORTSTDIO:%.o=../port/stdio/%.c) \
$(PORTSYS:%.o=../port/sys/%.c) \
+ $(AIOOBJS:%.o=../port/aio/%.c) \
+ $(RTOBJS:%.o=../port/rt/%.c) \
+ $(TPOOLOBJS:%.o=../port/tpool/%.c) \
$(THREADSOBJS:%.o=../port/threads/%.c) \
$(THREADSMACHOBJS:%.o=../$(MACH)/threads/%.c) \
$(UNWINDMACHOBJS:%.o=../port/unwind/%.c) \
@@ -977,6 +1001,7 @@ $(MAPFILE):
# Files which need the threads .il inline template
TIL= \
+ aio.o \
alloc.o \
assfail.o \
atexit.o \
@@ -986,7 +1011,9 @@ TIL= \
errno.o \
getctxt.o \
lwp.o \
+ ma.o \
machdep.o \
+ posix_aio.o \
pthr_attr.o \
pthr_barrier.o \
pthr_cond.o \
@@ -999,6 +1026,7 @@ TIL= \
scalls.o \
sema.o \
sigaction.o \
+ sigev_thread.o \
spawn.o \
stack.o \
swapctxt.o \
@@ -1006,6 +1034,7 @@ TIL= \
tdb_agent.o \
thr.o \
thread_interface.o \
+ thread_pool.o \
tls.o \
tsd.o \
unwind.o
diff --git a/usr/src/lib/libc/spec/Makefile.targ b/usr/src/lib/libc/spec/Makefile.targ
index 7b8d73ce11..4243823247 100644
--- a/usr/src/lib/libc/spec/Makefile.targ
+++ b/usr/src/lib/libc/spec/Makefile.targ
@@ -2,9 +2,8 @@
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License"). You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
# CDDL HEADER END
#
#
-# Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
# ident "%Z%%M% %I% %E% SMI"
@@ -30,7 +29,8 @@
LIBRARY = libc.a
VERS = .1
-OBJECTS = atomic.o \
+OBJECTS = aio.o \
+ atomic.o \
data.o \
door.o \
fmtmsg.o \
@@ -43,6 +43,7 @@ OBJECTS = atomic.o \
private.o \
privatedata.o \
regex.o \
+ rt.o \
stdio.o \
sys.o \
threads.o \
diff --git a/usr/src/lib/libc/spec/aio.spec b/usr/src/lib/libc/spec/aio.spec
new file mode 100644
index 0000000000..6b2612210e
--- /dev/null
+++ b/usr/src/lib/libc/spec/aio.spec
@@ -0,0 +1,83 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# ident "%Z%%M% %I% %E% SMI"
+#
+
+function aiocancel
+include <sys/asynch.h>, <aio.h>
+declaration int aiocancel(aio_result_t *resultp)
+version SUNW_1.23
+errno EACCES EFAULT EINVAL
+exception $return == -1
+end
+
+function aioread
+include <sys/types.h>, <sys/asynch.h>, <aio.h>
+declaration int aioread(int fildes, char *bufp, int bufs, \
+ off_t offset, int whence, aio_result_t *resultp)
+version SUNW_1.23
+errno EAGAIN EBADF EFAULT EINVAL ENOMEM
+exception $return == -1
+end
+
+function aioread64
+declaration int aioread64(int fd, caddr_t buf, int bufsz, off64_t offset, \
+ int whence, aio_result_t *resultp)
+arch i386 sparc
+version SUNW_1.23
+end
+
+function aiowait
+include <sys/asynch.h>, <aio.h>, <sys/time.h>
+declaration aio_result_t *aiowait(struct timeval *timeout)
+version SUNW_1.23
+errno EFAULT EINTR EINVAL
+exception $return == (aio_result_t *)-1
+end
+
+function aiowrite
+include <sys/types.h>, <sys/asynch.h>, <aio.h>
+declaration int aiowrite(int fildes, char *bufp, int bufs, \
+ off_t offset, int whence, aio_result_t *resultp)
+version SUNW_1.23
+errno EAGAIN EBADF EFAULT EINVAL ENOMEM
+exception $return == -1
+end
+
+function aiowrite64
+include <sys/types.h>, <sys/asynch.h>, <aio.h>
+declaration int aiowrite64(int fildes, char *bufp, int bufs, \
+ off64_t offset, int whence, aio_result_t *resultp)
+arch sparc i386
+version SUNW_1.23
+errno EAGAIN EBADF EFAULT EINVAL ENOMEM
+exception $return == -1
+end
+
+function assfail
+declaration int assfail(char *a, char *f, int l)
+version SUNW_1.23
+end
+
diff --git a/usr/src/lib/libc/spec/gen.spec b/usr/src/lib/libc/spec/gen.spec
index 9c547e1a37..2b14689e7a 100644
--- a/usr/src/lib/libc/spec/gen.spec
+++ b/usr/src/lib/libc/spec/gen.spec
@@ -1,7 +1,4 @@
#
-# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
-# Use is subject to license terms.
-#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
@@ -21,6 +18,10 @@
#
# CDDL HEADER END
#
+#
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
# ident "%Z%%M% %I% %E% SMI"
#
@@ -3281,6 +3282,11 @@ errno EPERM
exception $return == -1
end
+function _sigstack
+weak sigstack
+version SUNWprivate_1.1
+end
+
function sleep
include <unistd.h>
declaration unsigned sleep(unsigned seconds)
@@ -4842,19 +4848,6 @@ weak port_alert
version SUNWprivate_1.1
end
-function port_dispatch
-include <port.h>
-declaration int port_dispatch(int port, int flags, int source, int events, \
- uintptr_t object, void *user)
-version SUNWprivate_1.1
-errno EBADF EBADFD EINTR
-end
-
-function _port_dispatch
-weak port_dispatch
-version SUNWprivate_1.1
-end
-
function ucred_size
include <ucred.h>
declaration size_t ucred_size(void)
diff --git a/usr/src/lib/libc/spec/private.spec b/usr/src/lib/libc/spec/private.spec
index 2e26e10c8b..9868be02cf 100644
--- a/usr/src/lib/libc/spec/private.spec
+++ b/usr/src/lib/libc/spec/private.spec
@@ -41,26 +41,6 @@ function __class_quadruple # used by Sun's old Fortran 77 runtime libraries
version SUNWprivate_1.1
end
-function __clock_getres
-#Declaration /* Unknown. */
-version SUNWprivate_1.1
-end
-
-function __clock_gettime
-#Declaration /* Unknown. */
-version SUNWprivate_1.1
-end
-
-function __clock_nanosleep
-#Declaration /* Unknown. */
-version SUNWprivate_1.1
-end
-
-function __clock_settime
-#Declaration /* Unknown. */
-version SUNWprivate_1.1
-end
-
function __collate_init
#Declaration /* Unknown. */
version SUNWprivate_1.1
@@ -82,11 +62,6 @@ function __eucpctowc_gen
version SUNWprivate_1.1
end
-function __fdsync
-#Declaration /* Unknown. */
-version SUNWprivate_1.1
-end
-
function __fgetwc_dense
#Declaration /* Unknown. */
version SUNWprivate_1.1
@@ -319,11 +294,6 @@ function __multi_innetgr
version SUNWprivate_1.1
end
-function __nanosleep
-#Declaration /* Unknown. */
-version SUNWprivate_1.1
-end
-
function __nl_langinfo_std
#Declaration /* Unknown. */
version SUNWprivate_1.1
@@ -364,21 +334,6 @@ function __regfree_std
version SUNWprivate_1.1
end
-function __signotify
-#Declaration /* Unknown. */
-version SUNWprivate_1.1
-end
-
-function __sigqueue
-#Declaration /* Unknown. */
-version SUNWprivate_1.1
-end
-
-function __sigtimedwait
-#Declaration /* Unknown. */
-version SUNWprivate_1.1
-end
-
function __strcoll_C
#Declaration /* Unknown. */
version SUNWprivate_1.1
@@ -436,31 +391,6 @@ function __time_init
version SUNWprivate_1.1
end
-function __timer_create
-#Declaration /* Unknown. */
-version SUNWprivate_1.1
-end
-
-function __timer_delete
-#Declaration /* Unknown. */
-version SUNWprivate_1.1
-end
-
-function __timer_getoverrun
-#Declaration /* Unknown. */
-version SUNWprivate_1.1
-end
-
-function __timer_gettime
-#Declaration /* Unknown. */
-version SUNWprivate_1.1
-end
-
-function __timer_settime
-#Declaration /* Unknown. */
-version SUNWprivate_1.1
-end
-
function __towctrans_bc
#Declaration /* Unknown. */
version SUNWprivate_1.1
@@ -1376,11 +1306,6 @@ weak jrand48
version SUNWprivate_1.1
end
-function _kaio
-#Declaration /* Unknown. */
-version SUNWprivate_1.1
-end
-
function _l64a # extends libc/spec/gen.spec l64a
weak l64a
#Declaration /* Unknown. */
@@ -1744,16 +1669,6 @@ weak pthread_atfork
version SUNWprivate_1.1
end
-function _pthread_attr_clone
-#Declaration /* Unknown. */
-version SUNWprivate_1.1
-end
-
-function _pthread_attr_equal
-#Declaration /* Unknown. */
-version SUNWprivate_1.1
-end
-
function _pthread_attr_destroy
#Declaration /* Unknown. */
version SUNWprivate_1.1
@@ -1764,11 +1679,6 @@ function _pthread_attr_getdetachstate
version SUNWprivate_1.1
end
-function _pthread_attr_getdaemonstate_np
-#Declaration /* Unknown. */
-version SUNWprivate_1.1
-end
-
function _pthread_attr_getinheritsched
#Declaration /* Unknown. */
version SUNWprivate_1.1
@@ -1814,11 +1724,6 @@ function _pthread_attr_setdetachstate
version SUNWprivate_1.1
end
-function _pthread_attr_setdaemonstate_np
-#Declaration /* Unknown. */
-version SUNWprivate_1.1
-end
-
function _pthread_attr_setinheritsched
#Declaration /* Unknown. */
version SUNWprivate_1.1
@@ -2992,10 +2897,6 @@ arch sparc sparcv9
version SUNWprivate_1.1
end
-function kaio
-version SUNWprivate_1.1
-end
-
function makeut
version SUNWprivate_1.1
end
diff --git a/usr/src/lib/libc/spec/rt.spec b/usr/src/lib/libc/spec/rt.spec
new file mode 100644
index 0000000000..52de0469bd
--- /dev/null
+++ b/usr/src/lib/libc/spec/rt.spec
@@ -0,0 +1,641 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# ident "%Z%%M% %I% %E% SMI"
+#
+
+function aio_cancel
+include <aio.h>
+declaration int aio_cancel(int fildes, struct aiocb *aiocbp)
+version SUNW_1.23
+errno EBADF ENOSYS
+end
+
+function aio_fsync
+include <aio.h>
+declaration int aio_fsync(int op, aiocb_t *aiocbp)
+version SUNW_1.23
+errno EAGAIN EBADF EINVAL ENOSYS
+end
+
+function aio_read
+include <aio.h>
+declaration int aio_read(struct aiocb *aiocbp)
+version SUNW_1.23
+errno EAGAIN ENOSYS EBADF EINVAL ECANCELED EFBIG
+end
+
+function aio_write
+include <aio.h>
+declaration int aio_write(struct aiocb *aiocbp)
+version SUNW_1.23
+errno EAGAIN ENOSYS EBADF EINVAL ECANCELED EFBIG
+end
+
+function aio_return
+include <aio.h>
+declaration ssize_t aio_return(struct aiocb *aiocbp)
+version SUNW_1.23
+errno EINVAL ENOSYS
+end
+
+function aio_error
+include <aio.h>
+declaration int aio_error(const struct aiocb *aiocbp)
+version SUNW_1.23
+errno EINVAL ENOSYS
+end
+
+function aio_suspend
+include <aio.h>
+declaration int aio_suspend(const struct aiocb *const list[], int nent, \
+ const struct timespec *timeout)
+version SUNW_1.23
+errno EAGAIN EINTR ENOSYS
+end
+
+function posix_fallocate
+include <fcntl.h>
+declaration int posix_fallocate(int fd, off_t offset, off_t len)
+version SUNW_1.23
+errno EBADF EFBIG EINTR EINVAL EIO ENODEV ENOSPC ESPIPE
+end
+
+function fdatasync
+include <unistd.h>
+declaration int fdatasync(int fildes)
+version SUNW_1.23
+errno EBADF EINVAL ENOSYS
+end
+
+function lio_listio
+include <aio.h>
+declaration int lio_listio(int mode, struct aiocb *const list[], int nent, \
+ struct sigevent *sig)
+version SUNW_1.23
+errno EAGAIN EINVAL EINTR EIO ENOSYS ECANCELED \
+ EINPROGRESS EOVERFLOW EFBIG
+end
+
+function aio_waitn
+include <aio.h>
+declaration int aio_waitn(struct aiocb *list[], uint_t nent, \
+ uint_t *nwait, const struct timespec *timeout)
+version SUNW_1.23
+errno EAGAIN EINTR ETIME ENOMEM EFAULT EINVAL
+end
+
+function aio_cancel64 extends libc/spec/rt.spec aio_cancel
+declaration int aio_cancel64(int fildes, struct aiocb64 *aiocbp)
+arch i386 sparc
+version SUNW_1.23
+end
+
+function aio_error64 extends libc/spec/rt.spec aio_error
+declaration int aio_error64(const struct aiocb64 *aiocbp)
+arch i386 sparc
+version SUNW_1.23
+end
+
+function aio_fsync64 extends libc/spec/rt.spec aio_fsync
+declaration int aio_fsync64(int op, struct aiocb64 *aiocbp)
+arch i386 sparc
+version SUNW_1.23
+end
+
+function aio_read64 extends libc/spec/rt.spec aio_read
+declaration int aio_read64(struct aiocb64 *aiocbp)
+arch i386 sparc
+version SUNW_1.23
+end
+
+function aio_return64 extends libc/spec/rt.spec aio_return
+declaration ssize_t aio_return64(struct aiocb64 *aiocbp)
+arch i386 sparc
+version SUNW_1.23
+end
+
+function aio_suspend64 extends libc/spec/rt.spec aio_suspend
+declaration int aio_suspend64(const struct aiocb64 *const list[], \
+ int nent, const struct timespec *timeout)
+arch i386 sparc
+version SUNW_1.23
+end
+
+function aio_write64 extends libc/spec/rt.spec aio_write
+declaration int aio_write64(struct aiocb64 *aiocbp)
+arch i386 sparc
+version SUNW_1.23
+end
+
+function lio_listio64 extends libc/spec/rt.spec lio_listio
+declaration int lio_listio64(int mode, struct aiocb64 *const list[], \
+ int nent, struct sigevent *sig)
+arch i386 sparc
+version SUNW_1.23
+end
+
+function aio_waitn64 extends libc/spec/rt.spec aio_waitn
+declaration int aio_waitn64(struct aiocb64 *list[], uint_t nent, \
+ uint_t *nwait, const struct timespec *timeout)
+arch i386 sparc
+version SUNW_1.23
+end
+
+function posix_fallocate64 extends libc/spec/rt.spec posix_fallocate
+declaration int posix_fallocate64(int fd, off64_t offset, off64_t len)
+arch i386 sparc
+version SUNW_1.23
+end
+
+function mq_close
+include <mqueue.h>
+declaration int mq_close(mqd_t mqdes)
+version SUNW_1.23
+errno EBADF ENOSYS
+exception $return == -1
+end
+
+function mq_notify
+include <mqueue.h>
+declaration int mq_notify(mqd_t mqdes, const struct sigevent *notification)
+version SUNW_1.23
+errno EBADF EBUSY ENOSYS
+exception $return == -1
+end
+
+function mq_open
+include <mqueue.h>
+declaration mqd_t mq_open(const char *name, int oflag, ...)
+version SUNW_1.23
+errno EACCES EEXIST EINTR EINVAL EMFILE ENAMETOOLONG ENFILE \
+ ENOENT ENOSPC ENOSYS
+exception $return == (mqd_t)(-1)
+end
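+
+# Illustrative only (not part of the spec syntax above): a typical
+# creating open, with a hypothetical queue name, is
+#	mqd_t mqd = mq_open("/myq", O_RDWR | O_CREAT, 0600,
+#	    (struct mq_attr *)NULL);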
+
+function mq_receive
+include <mqueue.h>
+declaration ssize_t mq_receive(mqd_t mqdes, char *msg_ptr, \
+ size_t msg_len, unsigned int *msg_prio)
+version SUNW_1.23
+errno EAGAIN EBADF EMSGSIZE EINTR
+exception $return == (ssize_t)(-1)
+end
+
+function mq_timedreceive
+include <mqueue.h>, <time.h>
+declaration ssize_t mq_timedreceive(mqd_t mqdes, char *msg_ptr, \
+ size_t msg_len, unsigned int *msg_prio, \
+ const struct timespec *abs_timeout)
+version SUNW_1.23
+errno EAGAIN EBADF EMSGSIZE EINTR ETIMEDOUT
+exception $return == (ssize_t)(-1)
+end
+
+function mq_reltimedreceive_np
+include <mqueue.h>, <time.h>
+declaration ssize_t mq_reltimedreceive_np(mqd_t mqdes, char *msg_ptr, \
+ size_t msg_len, unsigned int *msg_prio, \
+ const struct timespec *rel_timeout)
+version SUNW_1.23
+errno EAGAIN EBADF EMSGSIZE EINTR ETIMEDOUT
+exception $return == (ssize_t)(-1)
+end
+
+function mq_send
+include <mqueue.h>
+declaration int mq_send(mqd_t mqdes, const char *msg_ptr, \
+ size_t msg_len, unsigned int msg_prio)
+version SUNW_1.23
+errno EAGAIN EBADF EINTR EMSGSIZE
+exception $return == -1
+end
+
+function mq_timedsend
+include <mqueue.h>, <time.h>
+declaration int mq_timedsend(mqd_t mqdes, const char *msg_ptr, \
+ size_t msg_len, unsigned int msg_prio, \
+ const struct timespec *abs_timeout)
+version SUNW_1.23
+errno EAGAIN EBADF EINTR EMSGSIZE ETIMEDOUT
+exception $return == -1
+end
+
+function mq_reltimedsend_np
+include <mqueue.h>, <time.h>
+declaration int mq_reltimedsend_np(mqd_t mqdes, const char *msg_ptr, \
+ size_t msg_len, unsigned int msg_prio, \
+ const struct timespec *rel_timeout)
+version SUNW_1.23
+errno EAGAIN EBADF EINTR EMSGSIZE ETIMEDOUT
+exception $return == -1
+end
+
+function mq_setattr
+include <mqueue.h>
+declaration int mq_setattr(mqd_t mqdes, \
+ const struct mq_attr *_RESTRICT_KYWD mqstat, \
+ struct mq_attr *_RESTRICT_KYWD omqstat)
+version SUNW_1.23
+errno EBADF ENOSYS
+exception $return == -1
+end
+
+function mq_getattr
+include <mqueue.h>
+declaration int mq_getattr(mqd_t mqdes, struct mq_attr *mqstat)
+version SUNW_1.23
+errno EBADF ENOSYS
+exception $return == -1
+end
+
+function mq_unlink
+include <mqueue.h>
+declaration int mq_unlink(const char *name)
+version SUNW_1.23
+errno EACCES ENAMETOOLONG ENOENT ENOSYS
+exception $return == -1
+end
+
+function nanosleep
+include <time.h>
+declaration int nanosleep(const struct timespec *rqtp, \
+ struct timespec *rmtp)
+version SUNW_1.23
+errno EINTR EINVAL
+end
+
+function clock_nanosleep
+include <time.h>
+declaration int clock_nanosleep(clockid_t clock_id, int flags, \
+ const struct timespec *rqtp, struct timespec *rmtp)
+version SUNW_1.23
+errno EINTR EINVAL
+end
+
+function sched_get_priority_max
+include <sched.h>
+declaration int sched_get_priority_max(int policy)
+version SUNW_1.23
+errno EINVAL ENOSYS ESRCH
+end
+
+function sched_get_priority_min
+include <sched.h>
+declaration int sched_get_priority_min(int policy)
+version SUNW_1.23
+errno EINVAL ENOSYS ESRCH
+end
+
+function sched_rr_get_interval
+include <sched.h>
+declaration int sched_rr_get_interval(pid_t pid, struct timespec *interval)
+version SUNW_1.23
+errno EINVAL ENOSYS ESRCH
+end
+
+function sched_setparam
+include <sched.h>
+declaration int sched_setparam(pid_t pid, const struct sched_param *param)
+version SUNW_1.23
+errno EINVAL ENOSYS EPERM ESRCH
+end
+
+function sched_getparam
+include <sched.h>
+declaration int sched_getparam(pid_t pid, struct sched_param *param)
+version SUNW_1.23
+errno EINVAL ENOSYS EPERM ESRCH
+end
+
+function sched_setscheduler
+include <sched.h>
+declaration int sched_setscheduler(pid_t pid, int policy, \
+ const struct sched_param *param)
+version SUNW_1.23
+errno EINVAL ENOSYS EPERM ESRCH
+end
+
+function sched_getscheduler
+include <sched.h>
+declaration int sched_getscheduler(pid_t pid)
+version SUNW_1.23
+errno EINVAL ENOSYS EPERM ESRCH
+end
+
+function sched_yield
+include <sched.h>
+declaration int sched_yield(void)
+version SUNW_1.23
+errno ENOSYS
+end
+
+function sem_close
+include <semaphore.h>
+declaration int sem_close(sem_t *sem)
+version SUNW_1.23
+errno EINVAL ENOSYS
+end
+
+function sem_destroy
+include <semaphore.h>
+declaration int sem_destroy(sem_t *sem)
+version SUNW_1.23
+errno EINVAL ENOSYS EBUSY
+end
+
+function sem_getvalue
+include <semaphore.h>
+declaration int sem_getvalue(sem_t *sem, int *sval)
+version SUNW_1.23
+errno EINVAL ENOSYS
+end
+
+function sem_init
+include <semaphore.h>, <unistd.h>
+declaration int sem_init(sem_t *sem, int pshared, unsigned int value)
+version SUNW_1.23
+errno EINVAL ENOSPC ENOSYS EPERM
+end
+
+function sem_open
+include <semaphore.h>, <unistd.h>, <sys/stat.h>
+declaration sem_t *sem_open(const char *name, int oflag, ...)
+version SUNW_1.23
+errno EACCES EEXIST EINTR EINVAL EMFILE ENAMETOOLONG ENFILE \
+ ENOENT ENOSPC ENOSYS
+end
+
+function sem_post
+include <semaphore.h>
+declaration int sem_post(sem_t *sem)
+version SUNW_1.23
+errno EINVAL ENOSYS
+end
+
+function sem_unlink
+include <semaphore.h>
+declaration int sem_unlink(const char *name)
+version SUNW_1.23
+errno EACCES ENAMETOOLONG ENOENT ENOSYS
+end
+
+function sem_wait
+include <semaphore.h>
+declaration int sem_wait(sem_t *sem)
+version SUNW_1.23
+errno EAGAIN EINVAL EINTR ENOSYS EDEADLK
+end
+
+function sem_timedwait
+include <semaphore.h>, <time.h>
+declaration int sem_timedwait(sem_t *sem, const timespec_t *abstime)
+version SUNW_1.23
+errno EAGAIN EINVAL EINTR ETIMEDOUT EDEADLK
+end
+
+function sem_reltimedwait_np
+include <semaphore.h>, <time.h>
+declaration int sem_reltimedwait_np(sem_t *sem, const timespec_t *reltime)
+version SUNW_1.23
+errno EAGAIN EINVAL EINTR ETIMEDOUT EDEADLK
+end
+
+function sem_trywait
+include <semaphore.h>
+declaration int sem_trywait(sem_t *sem)
+version SUNW_1.23
+errno EAGAIN EINVAL EINTR ENOSYS EDEADLK
+end
+
+function shm_open
+include <sys/mman.h>, <sys/types.h>, <sys/stat.h>, <fcntl.h>
+declaration int shm_open(const char *name, int oflag, mode_t mode)
+version SUNW_1.23
+errno EACCES EEXIST EINTR EINVAL EMFILE ENAMETOOLONG ENFILE \
+ ENOENT ENOSPC ENOSYS
+end
+
+function shm_unlink
+declaration int shm_unlink(const char *name)
+version SUNW_1.23
+errno EACCES ENAMETOOLONG ENOENT ENOSYS
+end
+
+function sigqueue
+include <signal.h>
+declaration int sigqueue(pid_t pid, int signo, const union sigval value)
+version SUNW_1.23
+errno EAGAIN EINVAL ENOSYS EPERM ESRCH
+end
+
+function sigwaitinfo
+include <signal.h>
+declaration int sigwaitinfo(const sigset_t *_RESTRICT_KYWD set, \
+ siginfo_t *_RESTRICT_KYWD info)
+version SUNW_1.23
+errno EINTR ENOSYS EAGAIN EINVAL
+end
+
+function sigtimedwait
+include <signal.h>
+declaration int sigtimedwait(const sigset_t *_RESTRICT_KYWD set, \
+ siginfo_t *_RESTRICT_KYWD info, \
+ const struct timespec *_RESTRICT_KYWD timeout)
+version SUNW_1.23
+errno EINTR ENOSYS EAGAIN EINVAL
+end
+
+function timer_create
+include <signal.h>, <time.h>
+declaration int timer_create(clockid_t clock_id, struct sigevent *evp, \
+ timer_t *timerid)
+version SUNW_1.23
+errno EAGAIN EINVAL ENOSYS
+end
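+
+# Illustrative only: evp may request SIGEV_THREAD notification, e.g.
+#	evp->sigev_notify = SIGEV_THREAD;
+#	evp->sigev_notify_function = my_handler;	(my_handler is
+# hypothetical.) Such timers are serviced by the sigev_thread
+# machinery this change adds to libc (port/rt/sigev_thread.c).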
+
+function timer_delete
+include <time.h>
+declaration int timer_delete(timer_t timerid)
+version SUNW_1.23
+errno EINVAL ENOSYS
+end
+
+function timer_settime
+include <time.h>
+declaration int timer_settime(timer_t timerid, int flags, \
+ const struct itimerspec *value, \
+ struct itimerspec *ovalue)
+version SUNW_1.23
+errno EINVAL ENOSYS
+end
+
+function timer_gettime
+include <time.h>
+declaration int timer_gettime(timer_t timerid, struct itimerspec *value)
+version SUNW_1.23
+errno EINVAL ENOSYS
+end
+
+function timer_getoverrun
+include <time.h>
+declaration int timer_getoverrun(timer_t timerid)
+version SUNW_1.23
+errno EINVAL ENOSYS
+end
+
+function clock_settime
+include <time.h>
+declaration int clock_settime(clockid_t clock_id, const struct timespec *tp)
+version SUNW_1.23
+errno EINVAL ENOSYS EPERM
+end
+
+function clock_gettime
+include <time.h>
+declaration int clock_gettime(clockid_t clock_id, struct timespec *tp)
+version SUNW_1.23
+errno EINVAL ENOSYS EPERM
+end
+
+function clock_getres
+include <time.h>
+declaration int clock_getres(clockid_t clock_id, struct timespec *res)
+version SUNW_1.23
+errno EINVAL ENOSYS EPERM
+end
+
+function _clock_getres
+version SUNWprivate_1.1
+end
+
+function _clock_gettime
+version SUNWprivate_1.1
+end
+
+function _clock_settime
+version SUNWprivate_1.1
+end
+
+function _nanosleep
+version SUNWprivate_1.1
+end
+
+function _clock_nanosleep
+version SUNWprivate_1.1
+end
+
+function _timer_create
+version SUNWprivate_1.1
+end
+
+function _timer_delete
+version SUNWprivate_1.1
+end
+
+function _timer_getoverrun
+version SUNWprivate_1.1
+end
+
+function _timer_gettime
+version SUNWprivate_1.1
+end
+
+function _timer_settime
+version SUNWprivate_1.1
+end
+
+#
+# Weak Specs
+#
+function _sem_open
+weak sem_open
+version SUNWprivate_1.1
+end
+
+function _sem_close
+weak sem_close
+version SUNWprivate_1.1
+end
+
+function _sem_unlink
+weak sem_unlink
+version SUNWprivate_1.1
+end
+
+function _sem_init
+weak sem_init
+version SUNWprivate_1.1
+end
+
+function _sem_destroy
+weak sem_destroy
+version SUNWprivate_1.1
+end
+
+function _sem_wait
+weak sem_wait
+version SUNWprivate_1.1
+end
+
+function _sem_timedwait
+weak sem_timedwait
+version SUNWprivate_1.1
+end
+
+function _sem_reltimedwait_np
+weak sem_reltimedwait_np
+version SUNWprivate_1.1
+end
+
+function _sem_trywait
+weak sem_trywait
+version SUNWprivate_1.1
+end
+
+function _sem_post
+weak sem_post
+version SUNWprivate_1.1
+end
+
+function _sem_getvalue
+weak sem_getvalue
+version SUNWprivate_1.1
+end
+
+function _sigwaitinfo
+weak sigwaitinfo
+version SUNWprivate_1.1
+end
+
+function _sigtimedwait
+weak sigtimedwait
+version SUNWprivate_1.1
+end
+
+function _sigqueue
+weak sigqueue
+version SUNWprivate_1.1
+end
+
diff --git a/usr/src/lib/libc/spec/sys.spec b/usr/src/lib/libc/spec/sys.spec
index e780453a1d..89aa86beb9 100644
--- a/usr/src/lib/libc/spec/sys.spec
+++ b/usr/src/lib/libc/spec/sys.spec
@@ -1,3 +1,4 @@
+#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
@@ -17,6 +18,7 @@
#
# CDDL HEADER END
#
+#
# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
@@ -2377,11 +2379,6 @@ version sparc=SYSVABI_1.3 i386=SYSVABI_1.3 sparcv9=SUNW_0.7 \
binding nodirect
end
-function _libc_sigaction
-weak sigaction
-version SUNWprivate_1.1
-end
-
function sigaltstack
include <signal.h>
declaration int sigaltstack(const stack_t *_RESTRICT_KYWD ss, \
diff --git a/usr/src/lib/libc/spec/threads.spec b/usr/src/lib/libc/spec/threads.spec
index 1bd84cfbeb..21e22d308a 100644
--- a/usr/src/lib/libc/spec/threads.spec
+++ b/usr/src/lib/libc/spec/threads.spec
@@ -1,4 +1,6 @@
#
+# CDDL HEADER START
+#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
@@ -951,14 +953,6 @@ arch i386
version i386=SUNWprivate_1.1
end
-function _cancel_prologue
-version SUNWprivate_1.1
-end
-
-function _cancel_epilogue
-version SUNWprivate_1.1
-end
-
function _sigoff
version SUNWprivate_1.1
end
@@ -967,10 +961,6 @@ function _sigon
version SUNWprivate_1.1
end
-function _sigdeferred
-version SUNWprivate_1.1
-end
-
function _thr_detach
version SUNWprivate_1.1
end