Diffstat (limited to 'usr/src/lib/libc')
54 files changed, 10626 insertions, 418 deletions
diff --git a/usr/src/lib/libc/Makefile.targ b/usr/src/lib/libc/Makefile.targ index e3bb69581a..26e8812bd7 100644 --- a/usr/src/lib/libc/Makefile.targ +++ b/usr/src/lib/libc/Makefile.targ @@ -2,9 +2,8 @@ # CDDL HEADER START # # The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ # CDDL HEADER END # # -# Copyright 2005 Sun Microsystems, Inc. All rights reserved. +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # ident "%Z%%M% %I% %E% SMI" @@ -200,6 +199,21 @@ pics/%_c89.o: $(LIBCBASE)/../port/stdio/%.c $(COMPILE.c) -o $@ $< $(POST_PROCESS_O) +# aio rules +pics/%.o: $(LIBCBASE)/../port/aio/%.c + $(COMPILE.c) -o $@ $< + $(POST_PROCESS_O) + +# rt rules +pics/%.o: $(LIBCBASE)/../port/rt/%.c + $(COMPILE.c) -o $@ $< + $(POST_PROCESS_O) + +# tpool rules +pics/%.o: $(LIBCBASE)/../port/tpool/%.c + $(COMPILE.c) -o $@ $< + $(POST_PROCESS_O) + # threads rules pics/%.o: $(LIBCBASE)/../port/threads/%.c $(COMPILE.c) -o $@ $< diff --git a/usr/src/lib/libc/README b/usr/src/lib/libc/README index f3c5ab146d..289f766aef 100644 --- a/usr/src/lib/libc/README +++ b/usr/src/lib/libc/README @@ -2,9 +2,8 @@ # CDDL HEADER START # # The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ # CDDL HEADER END # # -# Copyright 2005 Sun Microsystems, Inc. All rights reserved. +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # ident "%Z%%M% %I% %E% SMI" @@ -64,9 +63,12 @@ fork-safe) and in which the calling thread has all signals deferred However, certain rules apply to the code within these critical regions: - - The code must be of guaranteed short duration; no - calls to interfaces that might block indefinitely are - allowed. This means no calls into stdio or syslog(). + - The code must be of guaranteed short duration; no calls + to interfaces that might block indefinitely are allowed. + This means no calls into stdio or syslog() and no calls + to cond_wait() unless there is a guarantee of an almost- + immediate call to cond_signal() or cond_broadcast() + from elsewhere. - The code cannot call any non-l* synchronization primitives (mutex_lock(), _private_mutex_lock(), @@ -197,3 +199,40 @@ conditions such as the setting of CFLAGS and CPPFLAGS for the libc_i18n stuff need to be compatible with the ones for the libc stuff. Whenever changes that affect the compilation conditions of libc occur, the changes should be propagated to libc_i18n. 
+ +----- + +The putback of the project: + 6416832 libaio and librt can and should be folded into libc +introduced several libc-private locking interfaces: + void sig_mutex_lock(mutex_t *); + void sig_mutex_unlock(mutex_t *); + int sig_mutex_trylock(mutex_t *); + int sig_cond_wait(cond_t *, mutex_t *); + int sig_cond_reltimedwait(cond_t *, mutex_t *, const timespec_t *); +which are declared in both "thr_uberdata.h" and "mtlib.h". + +They are used in specialized code in libc, like the asynchronous I/O code. +Unlike the lmutex_lock() and lmutex_unlock() interfaces described above, +these interfaces do not define critical regions, but signals are +deferred while locks acquired by these functions are held, making +their use async-signal-safe. Calls to malloc(), calloc(), realloc(), +and free() are permissible while holding such locks. + +These interfaces were brought over from code in the former libaio +and librt and are necessary because, where they are used, the code +must execute potentially long-term waits and must be cancelable. +sig_cond_wait() and sig_cond_reltimedwait() are cancellation points. + +These interfaces are available for other uses inside libc, as +the need arises. (There is no need if the code does not perform +long-term waits.) Just follow a few rules to be self-consistent: + - Don't mix calls to mutex_[un]lock(), lmutex_[un]lock() and + sig_mutex_[un]lock() on the same mutex. + - Don't call cond_wait() with a mutex acquired by sig_mutex_lock(); + call sig_cond_wait() or sig_cond_reltimedwait(). + - Use pthread_cleanup_push() and pthread_cleanup_pop() to make + your code cancellation-safe. + - The sig_*() interfaces are not in themselves fork-safe. + You have to employ other logic to make your code fork-safe. + See the tail of postfork1_child() for examples.
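To illustrate these rules, here is a minimal sketch of a cancelable,
signal-deferring wait (hypothetical code, not part of the putback;
the names wait_lock, wait_cv, wait_done and wait_for_done are
invented for the example):

	#include <thread.h>
	#include <pthread.h>
	#include "mtlib.h"	/* declares the sig_*() interfaces */

	static mutex_t wait_lock = DEFAULTMUTEX;
	static cond_t wait_cv = DEFAULTCV;
	static int wait_done;

	static void
	wait_cleanup(void *arg)
	{
		sig_mutex_unlock((mutex_t *)arg);
	}

	void
	wait_for_done(void)
	{
		/* sig_mutex_lock(), never mutex_lock(), on this mutex */
		sig_mutex_lock(&wait_lock);
		pthread_cleanup_push(wait_cleanup, &wait_lock);
		while (!wait_done)	/* sig_cond_wait() is a cancellation point */
			(void) sig_cond_wait(&wait_cv, &wait_lock);
		pthread_cleanup_pop(1);	/* pops the handler and unlocks wait_lock */
	}

The cleanup handler is what makes the wait cancellation-safe: if the
thread is canceled while blocked in sig_cond_wait(), wait_lock is
still released during unwinding.  As noted above, fork-safety requires
additional logic beyond this.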
diff --git a/usr/src/lib/libc/amd64/Makefile b/usr/src/lib/libc/amd64/Makefile index 0238a550ed..4db5f28fcb 100644 --- a/usr/src/lib/libc/amd64/Makefile +++ b/usr/src/lib/libc/amd64/Makefile @@ -711,6 +711,24 @@ PORTI18N_COND= \ wcstol_longlong.o \ wcstoul_longlong.o +AIOOBJS= \ + aio.o \ + aio_alloc.o \ + posix_aio.o \ + +RTOBJS= \ + clock_timer.o \ + fallocate.o \ + mqueue.o \ + pos4obj.o \ + sched.o \ + sem.o \ + shm.o \ + sigev_thread.o + +TPOOLOBJS= \ + thread_pool.o + THREADSOBJS= \ alloc.o \ assfail.o \ @@ -836,6 +854,9 @@ MOSTOBJS= \ $(PORTSTDIO_W) \ $(PORTSYS) \ $(PORTSYS64) \ + $(AIOOBJS) \ + $(RTOBJS) \ + $(TPOOLOBJS) \ $(THREADSOBJS) \ $(THREADSMACHOBJS) \ $(THREADSASMOBJS) \ @@ -939,6 +960,9 @@ SRCS= \ $(PORTREGEX:%.o=../port/regex/%.c) \ $(PORTSTDIO:%.o=../port/stdio/%.c) \ $(PORTSYS:%.o=../port/sys/%.c) \ + $(AIOOBJS:%.o=../port/aio/%.c) \ + $(RTOBJS:%.o=../port/rt/%.c) \ + $(TPOOLOBJS:%.o=../port/tpool/%.c) \ $(THREADSOBJS:%.o=../port/threads/%.c) \ $(THREADSMACHOBJS:%.o=threads/%.c) \ $(UNWINDMACHOBJS:%.o=unwind/%.c) \ @@ -966,6 +990,7 @@ $(MAPFILE): # Files which need the threads .il inline template TIL= \ + aio.o \ alloc.o \ assfail.o \ atexit.o \ @@ -974,7 +999,9 @@ TIL= \ door_calls.o \ errno.o \ lwp.o \ + ma.o \ machdep.o \ + posix_aio.o \ pthr_attr.o \ pthr_barrier.o \ pthr_cond.o \ @@ -987,12 +1014,14 @@ TIL= \ scalls.o \ sema.o \ sigaction.o \ + sigev_thread.o \ spawn.o \ stack.o \ synch.o \ tdb_agent.o \ thr.o \ thread_interface.o \ + thread_pool.o \ thrp_unwind.o \ tls.o \ tsd.o diff --git a/usr/src/lib/libc/amd64/gen/siglongjmp.c b/usr/src/lib/libc/amd64/gen/siglongjmp.c index fd9860aad8..4bc4c579a4 100644 --- a/usr/src/lib/libc/amd64/gen/siglongjmp.c +++ b/usr/src/lib/libc/amd64/gen/siglongjmp.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,13 +18,17 @@ * * CDDL HEADER END */ + /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" +#pragma weak siglongjmp = _siglongjmp + +#include "synonyms.h" #include <sys/types.h> #include <sys/ucontext.h> #include <setjmp.h> @@ -33,8 +36,6 @@ extern int _setcontext(const ucontext_t *); -#pragma weak siglongjmp = _siglongjmp - void _siglongjmp(sigjmp_buf env, int val) { diff --git a/usr/src/lib/libc/common/sys/__clock_timer.s b/usr/src/lib/libc/common/sys/__clock_timer.s index 4c4e917836..5188262570 100644 --- a/usr/src/lib/libc/common/sys/__clock_timer.s +++ b/usr/src/lib/libc/common/sys/__clock_timer.s @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. 
@@ -19,8 +18,9 @@ * * CDDL HEADER END */ + /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -110,11 +110,11 @@ /* * int - * ___nanosleep(const timespec_t *rqtp, timespec_t *rmtp) + * __nanosleep(const timespec_t *rqtp, timespec_t *rmtp) */ - ENTRY(___nanosleep) + ENTRY(__nanosleep) SYSTRAP_RVAL1(nanosleep) SYSLWPERR RET - SET_SIZE(___nanosleep) + SET_SIZE(__nanosleep) diff --git a/usr/src/lib/libc/common/sys/__signotify.s b/usr/src/lib/libc/common/sys/__signotify.s index f49d5eb297..057a00ad45 100644 --- a/usr/src/lib/libc/common/sys/__signotify.s +++ b/usr/src/lib/libc/common/sys/__signotify.s @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,8 +18,9 @@ * * CDDL HEADER END */ + /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -28,9 +28,8 @@ .file "%M%" -/* unpublished system call for librt -- __signotify */ -/* int _signotify (int cmd, siginfo_t *siginfo, */ -/* signotify_id_t *sn_id); */ +/* unpublished system call for POSIX message queues -- __signotify */ +/* int __signotify (int cmd, siginfo_t *siginfo, signotify_id_t *sn_id); */ #include "SYS.h" diff --git a/usr/src/lib/libc/common/sys/__sigrt.s b/usr/src/lib/libc/common/sys/__sigrt.s index df1154abd0..0ce63adb4e 100644 --- a/usr/src/lib/libc/common/sys/__sigrt.s +++ b/usr/src/lib/libc/common/sys/__sigrt.s @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,8 +18,9 @@ * * CDDL HEADER END */ + /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -32,7 +32,7 @@ /* * int - * __sigqueue(pid_t pid, int signo, void *value, int si_code) + * __sigqueue(pid_t pid, int signo, void *value, int si_code, int block) */ SYSCALL2_RVAL1(__sigqueue,sigqueue) RETC @@ -40,9 +40,9 @@ /* * int - * ___sigtimedwait(const sigset_t *set, siginfo_t *info, + * __sigtimedwait(const sigset_t *set, siginfo_t *info, * const timespec_t *timeout) */ - SYSCALL2_RVAL1(___sigtimedwait,sigtimedwait) + SYSCALL2_RVAL1(__sigtimedwait,sigtimedwait) RET - SET_SIZE(___sigtimedwait) + SET_SIZE(__sigtimedwait) diff --git a/usr/src/lib/libc/common/sys/kaio.s b/usr/src/lib/libc/common/sys/kaio.s index cb75d3e2d5..1cd3810403 100644 --- a/usr/src/lib/libc/common/sys/kaio.s +++ b/usr/src/lib/libc/common/sys/kaio.s @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,12 +18,12 @@ * * CDDL HEADER END */ + /* Copyright (c) 1988 AT&T */ /* All Rights Reserved */ - /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -33,14 +32,12 @@ .file "%M%" /* C library -- kaio */ -/* intptr_t kaio (); */ - -#include <sys/asm_linkage.h> - - ANSI_PRAGMA_WEAK(kaio,function) +/* intptr_t _kaio (); */ #include "SYS.h" - SYSCALL_RVAL1(kaio) + ENTRY(_kaio) + SYSTRAP_RVAL1(kaio) + SYSCERROR RET - SET_SIZE(kaio) + SET_SIZE(_kaio) diff --git a/usr/src/lib/libc/i386/Makefile.com b/usr/src/lib/libc/i386/Makefile.com index 4c40fc780e..8613ab3972 100644 --- a/usr/src/lib/libc/i386/Makefile.com +++ b/usr/src/lib/libc/i386/Makefile.com @@ -751,6 +751,24 @@ PORTI18N_COND= \ wcstol_longlong.o \ wcstoul_longlong.o +AIOOBJS= \ + aio.o \ + aio_alloc.o \ + posix_aio.o \ + +RTOBJS= \ + clock_timer.o \ + fallocate.o \ + mqueue.o \ + pos4obj.o \ + sched.o \ + sem.o \ + shm.o \ + sigev_thread.o + +TPOOLOBJS= \ + thread_pool.o + THREADSOBJS= \ alloc.o \ assfail.o \ @@ -871,6 +889,9 @@ MOSTOBJS= \ $(PORTSTDIO_W) \ $(PORTSYS) \ $(PORTSYS64) \ + $(AIOOBJS) \ + $(RTOBJS) \ + $(TPOOLOBJS) \ $(THREADSOBJS) \ $(THREADSMACHOBJS) \ $(THREADSASMOBJS) \ @@ -988,6 +1009,9 @@ SRCS= \ $(PORTREGEX:%.o=../port/regex/%.c) \ $(PORTSTDIO:%.o=../port/stdio/%.c) \ $(PORTSYS:%.o=../port/sys/%.c) \ + $(AIOOBJS:%.o=../port/aio/%.c) \ + $(RTOBJS:%.o=../port/rt/%.c) \ + $(TPOOLOBJS:%.o=../port/tpool/%.c) \ $(THREADSOBJS:%.o=../port/threads/%.c) \ $(THREADSMACHOBJS:%.o=../$(MACH)/threads/%.c) \ $(UNWINDMACHOBJS:%.o=../port/unwind/%.c) \ @@ -1016,6 +1040,7 @@ $(MAPFILE): # Files which need the threads .il inline template TIL= \ + aio.o \ alloc.o \ assfail.o \ atexit.o \ @@ -1024,7 +1049,9 @@ TIL= \ door_calls.o \ errno.o \ lwp.o \ + ma.o \ machdep.o \ + posix_aio.o \ pthr_attr.o \ pthr_barrier.o \ pthr_cond.o \ @@ -1037,12 +1064,14 @@ TIL= \ scalls.o \ sema.o \ sigaction.o \ + sigev_thread.o \ spawn.o \ stack.o \ synch.o \ tdb_agent.o \ thr.o \ thread_interface.o \ + thread_pool.o \ tls.o \ tsd.o \ unwind.o diff --git a/usr/src/lib/libc/i386/gen/siglongjmp.c b/usr/src/lib/libc/i386/gen/siglongjmp.c index ff40ea8f98..1b3296d93d 100644 --- a/usr/src/lib/libc/i386/gen/siglongjmp.c 
+++ b/usr/src/lib/libc/i386/gen/siglongjmp.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,8 +18,9 @@ * * CDDL HEADER END */ + /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -30,6 +30,9 @@ #pragma ident "%Z%%M% %I% %E% SMI" +#pragma weak siglongjmp = _siglongjmp + +#include "synonyms.h" #include <sys/types.h> #include <sys/ucontext.h> #include <setjmp.h> @@ -37,8 +40,6 @@ extern int _setcontext(const ucontext_t *); -#pragma weak siglongjmp = _siglongjmp - void _siglongjmp(sigjmp_buf env, int val) { diff --git a/usr/src/lib/libc/inc/asyncio.h b/usr/src/lib/libc/inc/asyncio.h new file mode 100644 index 0000000000..02d33cd700 --- /dev/null +++ b/usr/src/lib/libc/inc/asyncio.h @@ -0,0 +1,346 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _ASYNCIO_H +#define _ASYNCIO_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <errno.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <thread.h> +#include <pthread.h> +#include <setjmp.h> +#include <signal.h> +#include <siginfo.h> +#include <aio.h> +#include <limits.h> +#include <ucontext.h> +#include <sys/asynch.h> +#include <sys/mman.h> + +#if !defined(_LP64) +#define AIOSTKSIZE (64 * 1024) +#else +#define AIOSTKSIZE (128 * 1024) +#endif + +#define SIGAIOCANCEL SIGLWP /* special aio cancelation signal */ + +#define AIO_WAITN_MAXIOCBS 32768 /* max. iocbs per system call */ + +/* + * Declare structure types. The structures themselves are defined below. 
+ */ +typedef struct aio_args aio_args_t; +typedef struct aio_lio aio_lio_t; +typedef struct notif_param notif_param_t; +typedef struct aio_req aio_req_t; +typedef struct aio_worker aio_worker_t; +typedef struct aio_hash aio_hash_t; + +struct aio_args { + int fd; + caddr_t buf; + size_t bufsz; + offset_t offset; +}; + +/* + * list head for UFS list I/O + */ +struct aio_lio { + mutex_t lio_mutex; /* list mutex */ + cond_t lio_cond_cv; /* list notification for I/O done */ + aio_lio_t *lio_next; /* pointer to next on freelist */ + char lio_mode; /* LIO_WAIT/LIO_NOWAIT */ + char lio_canned; /* lio was canceled */ + char lio_largefile; /* largefile operation */ + char lio_waiting; /* waiting in __lio_listio() */ + int lio_nent; /* Number of list I/O's */ + int lio_refcnt; /* outstanding I/O's */ + int lio_event; /* Event number for notification */ + int lio_port; /* Port number for notification */ + int lio_signo; /* Signal number for notification */ + union sigval lio_sigval; /* Signal parameter */ + uintptr_t lio_object; /* for SIGEV_THREAD or SIGEV_PORT */ + struct sigevent *lio_sigevent; /* Notification function and attr. */ +}; + +/* + * Notification parameters + */ +struct notif_param { + int np_signo; /* SIGEV_SIGNAL */ + int np_port; /* SIGEV_THREAD or SIGEV_PORT */ + void *np_user; + int np_event; + uintptr_t np_object; + int np_lio_signo; /* listio: SIGEV_SIGNAL */ + int np_lio_port; /* listio: SIGEV_THREAD or SIGEV_PORT */ + void *np_lio_user; + int np_lio_event; + uintptr_t np_lio_object; +}; + +struct aio_req { + /* + * fields protected by _aio_mutex lock. + */ + aio_req_t *req_link; /* hash/freelist chain link */ + /* + * when req is on the doneq, then req_next is protected by + * the _aio_mutex lock. when the req is on a work q, then + * req_next is protected by a worker's work_qlock1 lock. + */ + aio_req_t *req_next; /* request/done queue link */ + aio_req_t *req_prev; /* double linked list */ + /* + * fields protected by a worker's work_qlock1 lock. + */ + char req_state; /* AIO_REQ_QUEUED, ... */ + /* + * fields require no locking. + */ + char req_type; /* AIO_POSIX_REQ or not */ + char req_largefile; /* largefile operation */ + char req_op; /* AIOREAD, etc. 
*/ + aio_worker_t *req_worker; /* associate request with worker */ + aio_result_t *req_resultp; /* address of result buffer */ + aio_args_t req_args; /* arglist */ + aio_lio_t *req_head; /* list head for LIO */ + struct sigevent req_sigevent; + void *req_aiocbp; /* ptr to aiocb or aiocb64 */ + notif_param_t req_notify; /* notification parameters */ +}; + +/* special lio type that destroys itself when lio refcnt becomes zero */ +#define LIO_FSYNC LIO_WAIT+1 +#define LIO_DESTROY LIO_FSYNC+1 + +/* lio flags */ +#define LIO_FSYNC_CANCELED 0x1 + +/* values for aio_state */ + +#define AIO_REQ_QUEUED 1 +#define AIO_REQ_INPROGRESS 2 +#define AIO_REQ_CANCELED 3 +#define AIO_REQ_DONE 4 +#define AIO_REQ_FREE 5 +#define AIO_REQ_DONEQ 6 + +/* use KAIO in _aio_rw() */ +#define AIO_NO_KAIO 0x0 +#define AIO_KAIO 0x1 +#define AIO_NO_DUPS 0x2 + +#define AIO_POSIX_REQ 0x1 + +#define CHECK 1 +#define NOCHECK 2 +#define CHECKED 3 +#define USERAIO 4 +#define USERAIO_DONE 5 + +/* values for _aio_flags */ + +/* if set, _aiodone() notifies aio_waitn about done requests */ +#define AIO_WAIT_INPROGRESS 0x1 +/* if set, _aiodone() wakes up functions waiting for completed I/Os */ +#define AIO_IO_WAITING 0x2 +#define AIO_LIB_WAITN 0x4 /* aio_waitn in progress */ +#define AIO_LIB_WAITN_PENDING 0x8 /* aio_waitn requests pending */ + +/* + * Before a kaio() system call, the fd will be checked + * to ensure that kernel async. I/O is supported for this file. + * The only way to find out is if a kaio() call returns ENOTSUP, + * so the default will always be to try the kaio() call. Only in + * the specific instance of a kaio() call returning ENOTSUP + * will we stop submitting kaio() calls for that fd. + * If the fd is outside the array bounds, we will allow the kaio() + * call. + * + * The only way that an fd entry can go from ENOTSUP to supported + * is if that fd is freed up by a close(), and close will clear + * the entry for that fd. + * + * Each fd gets a bit in the array _kaio_supported[]. + * + * uint32_t _kaio_supported[MAX_KAIO_FDARRAY_SIZE]; + * + * Array is MAX_KAIO_FDARRAY_SIZE 32-bit elements, for 8KB. + * If more than (MAX_KAIO_FDARRAY_SIZE * KAIO_FDARRAY_ELEM_SIZE) + * files are open, this can be expanded.
+ */ + +#define MAX_KAIO_FDARRAY_SIZE 2048 +#define KAIO_FDARRAY_ELEM_SIZE WORD_BIT /* uint32_t */ + +#define MAX_KAIO_FDS (MAX_KAIO_FDARRAY_SIZE * KAIO_FDARRAY_ELEM_SIZE) + +#define VALID_FD(fdes) ((fdes) >= 0 && (fdes) < MAX_KAIO_FDS) + +#define KAIO_SUPPORTED(fdes) \ + (!VALID_FD(fdes) || \ + ((_kaio_supported[(fdes) / KAIO_FDARRAY_ELEM_SIZE] & \ + (uint32_t)(1 << ((fdes) % KAIO_FDARRAY_ELEM_SIZE))) == 0)) + +#define SET_KAIO_NOT_SUPPORTED(fdes) \ + if (VALID_FD(fdes)) \ + _kaio_supported[(fdes) / KAIO_FDARRAY_ELEM_SIZE] |= \ + (uint32_t)(1 << ((fdes) % KAIO_FDARRAY_ELEM_SIZE)) + +#define CLEAR_KAIO_SUPPORTED(fdes) \ + if (VALID_FD(fdes)) \ + _kaio_supported[(fdes) / KAIO_FDARRAY_ELEM_SIZE] &= \ + ~(uint32_t)(1 << ((fdes) % KAIO_FDARRAY_ELEM_SIZE)) + +struct aio_worker { + aio_worker_t *work_forw; /* forward link in list of workers */ + aio_worker_t *work_backw; /* backwards link in list of workers */ + mutex_t work_qlock1; /* lock for work queue 1 */ + cond_t work_idle_cv; /* place to sleep when idle */ + aio_req_t *work_head1; /* head of work request queue 1 */ + aio_req_t *work_tail1; /* tail of work request queue 1 */ + aio_req_t *work_next1; /* work queue one's next pointer */ + aio_req_t *work_prev1; /* last request done from queue 1 */ + aio_req_t *work_req; /* active work request */ + thread_t work_tid; /* worker's thread-id */ + int work_count1; /* length of work queue one */ + int work_done1; /* number of requests done */ + int work_minload1; /* min length of queue */ + int work_idleflg; /* when set, worker is idle */ + sigjmp_buf work_jmp_buf; /* cancellation point */ +}; + +struct aio_hash { /* resultp hash table */ + mutex_t hash_lock; + aio_req_t *hash_ptr; +#if !defined(_LP64) + void *hash_pad; /* ensure sizeof (aio_hash_t) == 32 */ +#endif +}; + +extern aio_hash_t *_aio_hash; + +#define HASHSZ 2048 /* power of 2 */ +#define AIOHASH(resultp) ((((uintptr_t)(resultp) >> 17) ^ \ + ((uintptr_t)(resultp) >> 2)) & (HASHSZ - 1)) +#define POSIX_AIO(x) ((x)->req_type == AIO_POSIX_REQ) + +extern int __uaio_init(void); +extern void _kaio_init(void); +extern intptr_t _kaio(int, ...); +extern int _aiorw(int, caddr_t, int, offset_t, int, aio_result_t *, int); +extern int _aio_rw(aiocb_t *, aio_lio_t *, aio_worker_t **, int, int); +#if !defined(_LP64) +extern int _aio_rw64(aiocb64_t *, aio_lio_t *, aio_worker_t **, int, int); +#endif +extern int _aio_create_worker(aio_req_t *, int); +extern int _aio_cancel_req(aio_worker_t *, aio_req_t *, int *, int *); +extern int aiocancel_all(int); +extern void aio_panic(const char *); +extern aio_req_t *_aio_hash_find(aio_result_t *); +extern aio_req_t *_aio_hash_del(aio_result_t *); +extern void _aio_req_mark_done(aio_req_t *); +extern void _aio_waitn_wakeup(void); +extern aio_worker_t *_aio_worker_alloc(void); +extern void _aio_worker_free(void *); +extern aio_req_t *_aio_req_alloc(void); +extern void _aio_req_free(aio_req_t *); +extern aio_lio_t *_aio_lio_alloc(void); +extern void _aio_lio_free(aio_lio_t *); +extern int _aio_idle(aio_worker_t *); +extern void *_aio_do_request(void *); +extern void *_aio_do_notify(void *); +extern void _lio_remove(aio_req_t *); +extern aio_req_t *_aio_req_remove(aio_req_t *); +extern int _aio_get_timedelta(timespec_t *, timespec_t *); +extern aio_result_t *_aio_req_done(void); +extern void _aio_set_result(aio_req_t *, ssize_t, int); +extern int _aio_sigev_thread_init(struct sigevent *); +extern int _aio_sigev_thread(aiocb_t *); +#if !defined(_LP64) +extern int _aio_sigev_thread64(aiocb64_t *); +#endif + +extern 
aio_worker_t *_kaiowp; /* points to kaio cleanup thread */ +extern aio_worker_t *__workers_rw; /* list of all rw workers */ +extern aio_worker_t *__nextworker_rw; /* worker chosen for next rw request */ +extern int __rw_workerscnt; /* number of rw workers */ +extern aio_worker_t *__workers_no; /* list of all notification workers */ +extern aio_worker_t *__nextworker_no; /* worker chosen, next notification */ +extern int __no_workerscnt; /* number of notification workers */ +extern mutex_t __aio_initlock; /* makes aio initialization atomic */ +extern cond_t __aio_initcv; +extern int __aio_initbusy; +extern mutex_t __aio_mutex; /* global aio lock */ +extern cond_t _aio_iowait_cv; /* wait for userland I/Os */ +extern cond_t _aio_waitn_cv; /* wait for end of aio_waitn */ +extern int _max_workers; /* max number of workers permitted */ +extern int _min_workers; /* min number of workers */ +extern sigset_t _worker_set; /* worker's signal mask */ +extern int _aio_worker_cnt; /* number of AIO workers */ +extern int _sigio_enabled; /* when set, send SIGIO signal */ +extern pid_t __pid; /* process's PID */ +extern int __uaio_ok; /* indicates if aio is initialized */ +extern int _kaio_ok; /* indicates if kaio is initialized */ +extern pthread_key_t _aio_key; /* for thread-specific data */ +extern aio_req_t *_aio_done_tail; /* list of done requests */ +extern aio_req_t *_aio_done_head; +extern aio_req_t *_aio_doneq; +extern int _aio_freelist_cnt; +extern int _aio_allocated_cnt; +extern int _aio_donecnt; +extern int _aio_doneq_cnt; +extern int _aio_waitncnt; /* # of requests for aio_waitn */ +extern int _aio_outstand_cnt; /* # of outstanding requests */ +extern int _kaio_outstand_cnt; /* # of outstanding kaio requests */ +extern int _aio_req_done_cnt; /* req. done but not in "done queue" */ +extern int _aio_kernel_suspend; /* active kernel kaio calls */ +extern int _aio_suscv_cnt; /* aio_suspend calls waiting on cv's */ +extern int _aiowait_flag; /* when set, aiowait() is inprogress */ +extern int _aio_flags; /* see defines, above */ +extern uint32_t *_kaio_supported; + +extern const sigset_t maskset; /* all maskable signals */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ASYNCIO_H */ diff --git a/usr/src/lib/libc/inc/mtlib.h b/usr/src/lib/libc/inc/mtlib.h index 89c2376949..d864e8e75a 100644 --- a/usr/src/lib/libc/inc/mtlib.h +++ b/usr/src/lib/libc/inc/mtlib.h @@ -57,6 +57,15 @@ extern int __rw_unlock(rwlock_t *); extern void lrw_rdlock(rwlock_t *); extern void lrw_wrlock(rwlock_t *); extern void lrw_unlock(rwlock_t *); +extern void sig_mutex_lock(mutex_t *); +extern void sig_mutex_unlock(mutex_t *); +extern int sig_mutex_trylock(mutex_t *); +extern int sig_cond_wait(cond_t *, mutex_t *); +extern int sig_cond_reltimedwait(cond_t *, mutex_t *, const timespec_t *); + +/* the private libc thread-safe allocator */ +extern void *lmalloc(size_t); +extern void lfree(void *, size_t); /* the rest are public functions */ extern int _mutex_init(mutex_t *, int, void *); @@ -91,6 +100,8 @@ extern thread_t _thr_self(void); extern void _thr_exit(void *); extern size_t _thr_min_stack(void); extern int _thr_kill(thread_t, int); +extern int _thr_create(void *, size_t, void *(*)(void *), void *, long, + thread_t *); extern int _thr_keycreate(thread_key_t *, void (*)(void *)); extern int _thr_setspecific(thread_key_t, void *); extern int _thr_getspecific(thread_key_t, void **); diff --git a/usr/src/lib/libc/inc/rtsched.h b/usr/src/lib/libc/inc/rtsched.h new file mode 100644 index 0000000000..90ae11c3b2 --- /dev/null +++ 
b/usr/src/lib/libc/inc/rtsched.h @@ -0,0 +1,44 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _RTSCHED_H +#define _RTSCHED_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/priocntl.h> + +/* + * This definition is private to libc but is used in more than one subsystem. + */ +struct pcclass { + short pcc_state; + pri_t pcc_primin; + pri_t pcc_primax; + pcinfo_t pcc_info; +}; + +#endif /* _RTSCHED_H */ diff --git a/usr/src/lib/libc/inc/synonyms.h b/usr/src/lib/libc/inc/synonyms.h index 179f25f627..4de926dc9f 100644 --- a/usr/src/lib/libc/inc/synonyms.h +++ b/usr/src/lib/libc/inc/synonyms.h @@ -223,6 +223,10 @@ extern "C" { #define chown _chown #define chroot _chroot #define _cladm __cladm +#define clock_getres _clock_getres +#define clock_gettime _clock_gettime +#define clock_nanosleep _clock_nanosleep +#define clock_settime _clock_settime #define close _close #define closedir _closedir #define closefrom _closefrom @@ -264,8 +268,8 @@ extern "C" { #define decimal_to_single _decimal_to_single #define dgettext _dgettext #define dirname _dirname -#define dladdr _dladdr #define dladdr1 _dladdr1 +#define dladdr _dladdr #define dlamd64getunwind _dlamd64getunwind #define dlclose _dlclose #define dldump _dldump @@ -495,7 +499,6 @@ extern "C" { #define iswupper _iswupper #define iswxdigit _iswxdigit #define jrand48 _jrand48 -#define kaio _kaio #define kill _kill #define l64a _l64a #define ladd _ladd @@ -590,12 +593,19 @@ extern "C" { #define munlockall _munlockall #define munlock _munlock #define munmap _munmap -#define mutex_destroy _mutex_destroy -#define mutex_held _mutex_held -#define mutex_init _mutex_init -#define mutex_lock _mutex_lock -#define mutex_trylock _mutex_trylock -#define mutex_unlock _mutex_unlock +#define _mutex_destroy __mutex_destroy +#define mutex_destroy __mutex_destroy +#define _mutex_held __mutex_held +#define mutex_held __mutex_held +#define _mutex_init __mutex_init +#define mutex_init __mutex_init +#define _mutex_lock __mutex_lock +#define mutex_lock __mutex_lock +#define _mutex_trylock __mutex_trylock +#define mutex_trylock __mutex_trylock +#define _mutex_unlock __mutex_unlock +#define mutex_unlock __mutex_unlock +#define nanosleep _nanosleep #define nfs_getfh _nfs_getfh #define nfssvc _nfssvc #define nftw64 _nftw64 @@ -627,7 +637,6 @@ extern "C" { #define port_alert _port_alert #define port_associate _port_associate #define port_create _port_create -#define port_dispatch _port_dispatch #define port_dissociate _port_dissociate #define port_getn _port_getn #define port_get _port_get @@ -865,12 +874,23 @@ extern "C" { #define sema_timedwait _sema_timedwait #define 
sema_trywait _sema_trywait #define sema_wait _sema_wait +#define sem_close _sem_close #define semctl64 _semctl64 #define semctl _semctl +#define sem_destroy _sem_destroy #define semget _semget +#define sem_getvalue _sem_getvalue #define semids _semids +#define sem_init _sem_init +#define sem_open _sem_open #define semop _semop +#define sem_post _sem_post +#define sem_reltimedwait_np _sem_reltimedwait_np #define semtimedop _semtimedop +#define sem_timedwait _sem_timedwait +#define sem_trywait _sem_trywait +#define sem_unlink _sem_unlink +#define sem_wait _sem_wait #define setcontext _setcontext #define setegid _setegid #define setenv _setenv @@ -927,12 +947,16 @@ extern "C" { #define sigpause _sigpause #define sigpending _sigpending #define sigprocmask _sigprocmask +#define sigqueue _sigqueue #define sigrelse _sigrelse #define sigsendset _sigsendset #define sigsend _sigsend #define sigsetjmp _sigsetjmp #define sigset _sigset +#define sigstack _sigstack #define sigsuspend _sigsuspend +#define sigtimedwait _sigtimedwait +#define sigwaitinfo _sigwaitinfo #define sigwait _sigwait #define single_to_decimal _single_to_decimal #define s_ioctl _s_ioctl @@ -1018,6 +1042,11 @@ extern "C" { #define thr_suspend _thr_suspend #define thr_wait_mutator _thr_wait_mutator #define thr_yield _thr_yield +#define timer_create _timer_create +#define timer_delete _timer_delete +#define timer_getoverrun _timer_getoverrun +#define timer_gettime _timer_gettime +#define timer_settime _timer_settime #define times _times #define time _time #define tmpnam_r _tmpnam_r diff --git a/usr/src/lib/libc/inc/thr_debug.h b/usr/src/lib/libc/inc/thr_debug.h new file mode 100644 index 0000000000..5e8de4ef0a --- /dev/null +++ b/usr/src/lib/libc/inc/thr_debug.h @@ -0,0 +1,44 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _THR_DEBUG_H +#define _THR_DEBUG_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#if defined(THREAD_DEBUG) + +extern void __assfail(const char *, const char *, int); +#pragma rarely_called(__assfail) +#define ASSERT(EX) (void)((EX) || (__assfail(#EX, __FILE__, __LINE__), 0)) + +#else + +#define ASSERT(EX) ((void)0) + +#endif + +#endif /* _THR_DEBUG_H */ diff --git a/usr/src/lib/libc/inc/thr_uberdata.h b/usr/src/lib/libc/inc/thr_uberdata.h index c7b6001926..2671ac0a69 100644 --- a/usr/src/lib/libc/inc/thr_uberdata.h +++ b/usr/src/lib/libc/inc/thr_uberdata.h @@ -53,12 +53,10 @@ #include <schedctl.h> #include <sys/priocntl.h> #include <thread_db.h> +#include <setjmp.h> #include "libc_int.h" #include "tdb_agent.h" - -/* belongs in <pthread.h> */ -#define PTHREAD_CREATE_DAEMON_NP 0x100 /* = THR_DAEMON */ -#define PTHREAD_CREATE_NONDAEMON_NP 0 +#include "thr_debug.h" /* * This is an implementation-specific include file for threading support. @@ -208,14 +206,6 @@ typedef union { #define PRIO_INHERIT 2 #define PRIO_DISINHERIT 3 -struct pcclass { - short pcc_state; - pri_t pcc_primin; - pri_t pcc_primax; - pcinfo_t pcc_info; -}; -extern struct pcclass ts_class, rt_class; - #define MUTEX_TRY 0 #define MUTEX_LOCK 1 @@ -608,7 +598,7 @@ typedef struct ulwp { #define MASKSET0 (FILLSET0 & ~CANTMASK0) #define MASKSET1 (FILLSET1 & ~CANTMASK1) -extern const sigset_t maskset; /* set of all maskable signals */ +extern const sigset_t maskset; /* set of all maskable signals */ extern int thread_adaptive_spin; extern uint_t thread_max_spinners; @@ -1048,7 +1038,7 @@ extern greg_t stkptr(void); /* * Implementation functions. Not visible outside of the library itself. */ -extern int ___nanosleep(const timespec_t *, timespec_t *); +extern int __nanosleep(const timespec_t *, timespec_t *); extern void getgregs(ulwp_t *, gregset_t); extern void setgregs(ulwp_t *, gregset_t); extern void thr_panic(const char *); @@ -1092,18 +1082,6 @@ extern void _flush_windows(void); #endif extern void set_curthread(void *); -#if defined(THREAD_DEBUG) - -extern void __assfail(const char *, const char *, int); -#pragma rarely_called(__assfail) -#define ASSERT(EX) (void)((EX) || (__assfail(#EX, __FILE__, __LINE__), 0)) - -#else /* THREAD_DEBUG */ - -#define ASSERT(EX) ((void)0) - -#endif /* THREAD_DEBUG */ - /* enter a critical section */ #define enter_critical(self) (self->ul_critical++) @@ -1174,21 +1152,35 @@ extern void *_thr_setup(ulwp_t *); extern void _fpinherit(ulwp_t *); extern void _lwp_start(void); extern void _lwp_terminate(void); -extern void lmutex_unlock(mutex_t *); extern void lmutex_lock(mutex_t *); +extern void lmutex_unlock(mutex_t *); +extern void sig_mutex_lock(mutex_t *); +extern void sig_mutex_unlock(mutex_t *); +extern int sig_mutex_trylock(mutex_t *); +extern int sig_cond_wait(cond_t *, mutex_t *); +extern int sig_cond_reltimedwait(cond_t *, mutex_t *, const timespec_t *); extern void _prefork_handler(void); extern void _postfork_parent_handler(void); extern void _postfork_child_handler(void); -extern void _postfork1_child(void); +extern void postfork1_child(void); +extern void postfork1_child_aio(void); +extern void postfork1_child_sigev_aio(void); +extern void postfork1_child_sigev_mq(void); +extern void postfork1_child_sigev_timer(void); +extern void postfork1_child_tpool(void); extern int fork_lock_enter(const char *); extern void fork_lock_exit(void); extern void suspend_fork(void); extern void continue_fork(int); extern void do_sigcancel(void); -extern void init_sigcancel(void); +extern void 
setup_cancelsig(int); +extern void init_sigev_thread(void); +extern void init_aio(void); extern void _cancelon(void); extern void _canceloff(void); extern void _canceloff_nocancel(void); +extern void _cancel_prologue(void); +extern void _cancel_epilogue(void); extern void no_preempt(ulwp_t *); extern void preempt(ulwp_t *); extern void _thrp_unwind(void *); @@ -1249,8 +1241,18 @@ extern int __lwp_sigmask(int, const sigset_t *, sigset_t *); extern void __sighndlr(int, siginfo_t *, ucontext_t *, void (*)()); extern caddr_t __sighndlrend; #pragma unknown_control_flow(__sighndlr) +extern void _siglongjmp(sigjmp_buf, int); +extern int _pthread_setspecific(pthread_key_t, const void *); +extern void *_pthread_getspecific(pthread_key_t); extern void _pthread_exit(void *); +extern void _private_testcancel(void); + +/* belongs in <pthread.h> */ +#define PTHREAD_CREATE_DAEMON_NP 0x100 /* = THR_DAEMON */ +#define PTHREAD_CREATE_NONDAEMON_NP 0 +extern int _pthread_attr_setdaemonstate_np(pthread_attr_t *, int); +extern int _pthread_attr_getdaemonstate_np(const pthread_attr_t *, int *); /* these are private to the library */ extern int _private_mutex_init(mutex_t *, int, void *); @@ -1293,8 +1295,10 @@ extern int rw_read_is_held(rwlock_t *); extern int rw_write_is_held(rwlock_t *); extern int _thr_continue(thread_t); -extern int _thrp_create(void *, size_t, void *(*func)(void *), void *, - long, thread_t *, pri_t, int, size_t); +extern int _thr_create(void *, size_t, void *(*)(void *), void *, long, + thread_t *); +extern int _thrp_create(void *, size_t, void *(*)(void *), void *, long, + thread_t *, pri_t, int, size_t); extern int _thr_getprio(thread_t, int *); extern int _thr_getspecific(thread_key_t, void **); extern int _thr_join(thread_t, thread_t *, void **); @@ -1320,7 +1324,8 @@ extern int _thread_setschedparam_main(pthread_t, int, const struct sched_param *, int); extern int _validate_rt_prio(int, int); extern int _thrp_setlwpprio(lwpid_t, int, int); -extern pri_t _map_rtpri_to_gp(pri_t); +extern pri_t map_rtpri_to_gp(pri_t); +extern int get_info_by_policy(int); /* * System call wrappers (direct interfaces to the kernel) diff --git a/usr/src/lib/libc/inc/thread_pool.h b/usr/src/lib/libc/inc/thread_pool.h new file mode 100644 index 0000000000..200323703c --- /dev/null +++ b/usr/src/lib/libc/inc/thread_pool.h @@ -0,0 +1,74 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _THREAD_POOL_H_ +#define _THREAD_POOL_H_ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <thread.h> +#include <pthread.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct tpool tpool_t; /* opaque thread pool descriptor */ + +#if defined(__STDC__) + +extern tpool_t *tpool_create(uint_t min_threads, uint_t max_threads, + uint_t linger, pthread_attr_t *attr); +extern int tpool_dispatch(tpool_t *tpool, + void (*func)(void *), void *arg); +extern void tpool_destroy(tpool_t *tpool); +extern void tpool_abandon(tpool_t *tpool); +extern void tpool_wait(tpool_t *tpool); +extern void tpool_suspend(tpool_t *tpool); +extern int tpool_suspended(tpool_t *tpool); +extern void tpool_resume(tpool_t *tpool); +extern int tpool_member(tpool_t *tpool); + +#else /* Non ANSI */ + +extern tpool_t *tpool_create(); +extern int tpool_dispatch(); +extern void tpool_destroy(); +extern void tpool_abandon(); +extern void tpool_wait(); +extern void tpool_suspend(); +extern int tpool_suspended(); +extern void tpool_resume(); +extern int tpool_member(); + +#endif /* __STDC__ */ + +#ifdef __cplusplus +} +#endif + +#endif /* _THREAD_POOL_H_ */ diff --git a/usr/src/lib/libc/port/aio/aio.c b/usr/src/lib/libc/port/aio/aio.c new file mode 100644 index 0000000000..28d425d702 --- /dev/null +++ b/usr/src/lib/libc/port/aio/aio.c @@ -0,0 +1,2202 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include "synonyms.h" +#include "thr_uberdata.h" +#include "asyncio.h" +#include <atomic.h> +#include <sys/param.h> +#include <sys/file.h> +#include <sys/port.h> + +static int _aio_hash_insert(aio_result_t *, aio_req_t *); +static aio_req_t *_aio_req_get(aio_worker_t *); +static void _aio_req_add(aio_req_t *, aio_worker_t **, int); +static void _aio_req_del(aio_worker_t *, aio_req_t *, int); +static void _aio_work_done(aio_worker_t *); +static void _aio_enq_doneq(aio_req_t *); + +extern void _aio_lio_free(aio_lio_t *); + +extern int __fdsync(int, int); +extern int _port_dispatch(int, int, int, int, uintptr_t, void *); + +static int _aio_fsync_del(aio_worker_t *, aio_req_t *); +static void _aiodone(aio_req_t *, ssize_t, int); +static void _aio_cancel_work(aio_worker_t *, int, int *, int *); +static void _aio_finish_request(aio_worker_t *, ssize_t, int); + +/* + * switch for kernel async I/O + */ +int _kaio_ok = 0; /* 0 = disabled, 1 = on, -1 = error */ + +/* + * Key for thread-specific data + */ +pthread_key_t _aio_key; + +/* + * Array for determining whether or not a file supports kaio. + * Initialized in _kaio_init(). 
+ */ +uint32_t *_kaio_supported = NULL; + +/* + * workers for read/write requests + * (__aio_mutex lock protects circular linked list of workers) + */ +aio_worker_t *__workers_rw; /* circular list of AIO workers */ +aio_worker_t *__nextworker_rw; /* next worker in list of workers */ +int __rw_workerscnt; /* number of read/write workers */ + +/* + * worker for notification requests. + */ +aio_worker_t *__workers_no; /* circular list of AIO workers */ +aio_worker_t *__nextworker_no; /* next worker in list of workers */ +int __no_workerscnt; /* number of notification workers */ + +aio_req_t *_aio_done_tail; /* list of done requests */ +aio_req_t *_aio_done_head; + +mutex_t __aio_initlock = DEFAULTMUTEX; /* makes aio initialization atomic */ +cond_t __aio_initcv = DEFAULTCV; +int __aio_initbusy = 0; + +mutex_t __aio_mutex = DEFAULTMUTEX; /* protects counts, and linked lists */ +cond_t _aio_iowait_cv = DEFAULTCV; /* wait for userland I/Os */ + +pid_t __pid = (pid_t)-1; /* initialize as invalid pid */ +int _sigio_enabled = 0; /* when set, send SIGIO signal */ + +aio_hash_t *_aio_hash; + +aio_req_t *_aio_doneq; /* double linked done queue list */ + +int _aio_donecnt = 0; +int _aio_waitncnt = 0; /* # of requests for aio_waitn */ +int _aio_doneq_cnt = 0; +int _aio_outstand_cnt = 0; /* # of outstanding requests */ +int _kaio_outstand_cnt = 0; /* # of outstanding kaio requests */ +int _aio_req_done_cnt = 0; /* req. done but not in "done queue" */ +int _aio_kernel_suspend = 0; /* active kernel kaio calls */ +int _aio_suscv_cnt = 0; /* aio_suspend calls waiting on cv's */ + +int _max_workers = 256; /* max number of workers permitted */ +int _min_workers = 4; /* min number of workers */ +int _minworkload = 2; /* min number of requests in q */ +int _aio_worker_cnt = 0; /* number of workers to do requests */ +int __uaio_ok = 0; /* AIO has been enabled */ +sigset_t _worker_set; /* worker's signal mask */ + +int _aiowait_flag = 0; /* when set, aiowait() is inprogress */ +int _aio_flags = 0; /* see asyncio.h defines */ + +aio_worker_t *_kaiowp = NULL; /* points to kaio cleanup thread */ + +int hz; /* clock ticks per second */ + +static int +_kaio_supported_init(void) +{ + void *ptr; + size_t size; + + if (_kaio_supported != NULL) /* already initialized */ + return (0); + + size = MAX_KAIO_FDARRAY_SIZE * sizeof (uint32_t); + ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON, -1, (off_t)0); + if (ptr == MAP_FAILED) + return (-1); + _kaio_supported = ptr; + return (0); +} + +/* + * The aio subsystem is initialized when an AIO request is made. + * Constants are initialized, such as the max number of workers that + * the subsystem can create, and the minimum number of workers + * permitted before imposing some restrictions. Also, some + * workers are created. + */ +int +__uaio_init(void) +{ + int ret = -1; + int i; + + lmutex_lock(&__aio_initlock); + while (__aio_initbusy) + (void) _cond_wait(&__aio_initcv, &__aio_initlock); + if (__uaio_ok) { /* already initialized */ + lmutex_unlock(&__aio_initlock); + return (0); + } + __aio_initbusy = 1; + lmutex_unlock(&__aio_initlock); + + hz = (int)sysconf(_SC_CLK_TCK); + __pid = getpid(); + + setup_cancelsig(SIGAIOCANCEL); + + if (_kaio_supported_init() != 0) + goto out; + + /* + * Allocate and initialize the hash table.
+ */ + /* LINTED pointer cast */ + _aio_hash = (aio_hash_t *)mmap(NULL, + HASHSZ * sizeof (aio_hash_t), PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON, -1, (off_t)0); + if ((void *)_aio_hash == MAP_FAILED) { + _aio_hash = NULL; + goto out; + } + for (i = 0; i < HASHSZ; i++) + (void) mutex_init(&_aio_hash[i].hash_lock, USYNC_THREAD, NULL); + + /* + * Initialize worker's signal mask to only catch SIGAIOCANCEL. + */ + (void) sigfillset(&_worker_set); + (void) sigdelset(&_worker_set, SIGAIOCANCEL); + + /* + * Create the minimum number of read/write workers. + */ + for (i = 0; i < _min_workers; i++) + (void) _aio_create_worker(NULL, AIOREAD); + + /* + * Create one worker to send asynchronous notifications. + */ + (void) _aio_create_worker(NULL, AIONOTIFY); + + ret = 0; +out: + lmutex_lock(&__aio_initlock); + if (ret == 0) + __uaio_ok = 1; + __aio_initbusy = 0; + (void) cond_broadcast(&__aio_initcv); + lmutex_unlock(&__aio_initlock); + return (ret); +} + +/* + * Called from close() before actually performing the real _close(). + */ +void +_aio_close(int fd) +{ + if (fd < 0) /* avoid cancelling everything */ + return; + /* + * Cancel all outstanding aio requests for this file descriptor. + */ + if (__uaio_ok) + (void) aiocancel_all(fd); + /* + * If we have allocated the bit array, clear the bit for this file. + * The next open may re-use this file descriptor and the new file + * may have different kaio() behaviour. + */ + if (_kaio_supported != NULL) + CLEAR_KAIO_SUPPORTED(fd); +} + +/* + * special kaio cleanup thread sits in a loop in the + * kernel waiting for pending kaio requests to complete. + */ +void * +_kaio_cleanup_thread(void *arg) +{ + if (pthread_setspecific(_aio_key, arg) != 0) + aio_panic("_kaio_cleanup_thread, pthread_setspecific()"); + (void) _kaio(AIOSTART); + return (arg); +} + +/* + * initialize kaio. 
+ */ +void +_kaio_init() +{ + int error; + sigset_t oset; + + lmutex_lock(&__aio_initlock); + while (__aio_initbusy) + (void) _cond_wait(&__aio_initcv, &__aio_initlock); + if (_kaio_ok) { /* already initialized */ + lmutex_unlock(&__aio_initlock); + return; + } + __aio_initbusy = 1; + lmutex_unlock(&__aio_initlock); + + if (_kaio_supported_init() != 0) + error = ENOMEM; + else if ((_kaiowp = _aio_worker_alloc()) == NULL) + error = ENOMEM; + else if ((error = (int)_kaio(AIOINIT)) == 0) { + (void) pthread_sigmask(SIG_SETMASK, &maskset, &oset); + error = thr_create(NULL, AIOSTKSIZE, _kaio_cleanup_thread, + _kaiowp, THR_DAEMON, &_kaiowp->work_tid); + (void) pthread_sigmask(SIG_SETMASK, &oset, NULL); + } + if (error && _kaiowp != NULL) { + _aio_worker_free(_kaiowp); + _kaiowp = NULL; + } + + lmutex_lock(&__aio_initlock); + if (error) + _kaio_ok = -1; + else + _kaio_ok = 1; + __aio_initbusy = 0; + (void) cond_broadcast(&__aio_initcv); + lmutex_unlock(&__aio_initlock); +} + +int +aioread(int fd, caddr_t buf, int bufsz, off_t offset, int whence, + aio_result_t *resultp) +{ + return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOREAD)); +} + +int +aiowrite(int fd, caddr_t buf, int bufsz, off_t offset, int whence, + aio_result_t *resultp) +{ + return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOWRITE)); +} + +#if !defined(_LP64) +int +aioread64(int fd, caddr_t buf, int bufsz, off64_t offset, int whence, + aio_result_t *resultp) +{ + return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOAREAD64)); +} + +int +aiowrite64(int fd, caddr_t buf, int bufsz, off64_t offset, int whence, + aio_result_t *resultp) +{ + return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOAWRITE64)); +} +#endif /* !defined(_LP64) */ + +int +_aiorw(int fd, caddr_t buf, int bufsz, offset_t offset, int whence, + aio_result_t *resultp, int mode) +{ + aio_req_t *reqp; + aio_args_t *ap; + offset_t loffset; + struct stat stat; + int error = 0; + int kerr; + int umode; + + switch (whence) { + + case SEEK_SET: + loffset = offset; + break; + case SEEK_CUR: + if ((loffset = llseek(fd, 0, SEEK_CUR)) == -1) + error = -1; + else + loffset += offset; + break; + case SEEK_END: + if (fstat(fd, &stat) == -1) + error = -1; + else + loffset = offset + stat.st_size; + break; + default: + errno = EINVAL; + error = -1; + } + + if (error) + return (error); + + /* initialize kaio */ + if (!_kaio_ok) + _kaio_init(); + + /* + * _aio_do_request() needs the original request code (mode) to be able + * to choose the appropriate 32/64 bit function. All other functions + * only require the difference between READ and WRITE (umode). + */ + if (mode == AIOAREAD64 || mode == AIOAWRITE64) + umode = mode - AIOAREAD64; + else + umode = mode; + + /* + * Try kernel aio first. + * If errno is ENOTSUP/EBADFD, fall back to the thread implementation. + */ + if (_kaio_ok > 0 && KAIO_SUPPORTED(fd)) { + resultp->aio_errno = 0; + sig_mutex_lock(&__aio_mutex); + _kaio_outstand_cnt++; + kerr = (int)_kaio(((resultp->aio_return == AIO_INPROGRESS) ?
+ (umode | AIO_POLL_BIT) : umode), + fd, buf, bufsz, loffset, resultp); + if (kerr == 0) { + sig_mutex_unlock(&__aio_mutex); + return (0); + } + _kaio_outstand_cnt--; + sig_mutex_unlock(&__aio_mutex); + if (errno != ENOTSUP && errno != EBADFD) + return (-1); + if (errno == EBADFD) + SET_KAIO_NOT_SUPPORTED(fd); + } + + if (!__uaio_ok && __uaio_init() == -1) + return (-1); + + if ((reqp = _aio_req_alloc()) == NULL) { + errno = EAGAIN; + return (-1); + } + + /* + * _aio_do_request() checks reqp->req_op to differentiate + * between 32 and 64 bit access. + */ + reqp->req_op = mode; + reqp->req_resultp = resultp; + ap = &reqp->req_args; + ap->fd = fd; + ap->buf = buf; + ap->bufsz = bufsz; + ap->offset = loffset; + + if (_aio_hash_insert(resultp, reqp) != 0) { + _aio_req_free(reqp); + errno = EINVAL; + return (-1); + } + /* + * _aio_req_add() only needs the difference between READ and + * WRITE to choose the right worker queue. + */ + _aio_req_add(reqp, &__nextworker_rw, umode); + return (0); +} + +int +aiocancel(aio_result_t *resultp) +{ + aio_req_t *reqp; + aio_worker_t *aiowp; + int ret; + int done = 0; + int canceled = 0; + + if (!__uaio_ok) { + errno = EINVAL; + return (-1); + } + + sig_mutex_lock(&__aio_mutex); + reqp = _aio_hash_find(resultp); + if (reqp == NULL) { + if (_aio_outstand_cnt == _aio_req_done_cnt) + errno = EINVAL; + else + errno = EACCES; + ret = -1; + } else { + aiowp = reqp->req_worker; + sig_mutex_lock(&aiowp->work_qlock1); + (void) _aio_cancel_req(aiowp, reqp, &canceled, &done); + sig_mutex_unlock(&aiowp->work_qlock1); + + if (canceled) { + ret = 0; + } else { + if (_aio_outstand_cnt == 0 || + _aio_outstand_cnt == _aio_req_done_cnt) + errno = EINVAL; + else + errno = EACCES; + ret = -1; + } + } + sig_mutex_unlock(&__aio_mutex); + return (ret); +} + +/* + * This must be asynch safe + */ +aio_result_t * +aiowait(struct timeval *uwait) +{ + aio_result_t *uresultp; + aio_result_t *kresultp; + aio_result_t *resultp; + int dontblock; + int timedwait = 0; + int kaio_errno = 0; + struct timeval twait; + struct timeval *wait = NULL; + hrtime_t hrtend; + hrtime_t hres; + + if (uwait) { + /* + * Check for a valid specified wait time. + * If it is invalid, fail the call right away. 
+ */ + if (uwait->tv_sec < 0 || uwait->tv_usec < 0 || + uwait->tv_usec >= MICROSEC) { + errno = EINVAL; + return ((aio_result_t *)-1); + } + + if (uwait->tv_sec > 0 || uwait->tv_usec > 0) { + hrtend = gethrtime() + + (hrtime_t)uwait->tv_sec * NANOSEC + + (hrtime_t)uwait->tv_usec * (NANOSEC / MICROSEC); + twait = *uwait; + wait = &twait; + timedwait++; + } else { + /* polling */ + sig_mutex_lock(&__aio_mutex); + if (_kaio_outstand_cnt == 0) { + kresultp = (aio_result_t *)-1; + } else { + kresultp = (aio_result_t *)_kaio(AIOWAIT, + (struct timeval *)-1, 1); + if (kresultp != (aio_result_t *)-1 && + kresultp != NULL && + kresultp != (aio_result_t *)1) { + _kaio_outstand_cnt--; + sig_mutex_unlock(&__aio_mutex); + return (kresultp); + } + } + uresultp = _aio_req_done(); + sig_mutex_unlock(&__aio_mutex); + if (uresultp != NULL && + uresultp != (aio_result_t *)-1) { + return (uresultp); + } + if (uresultp == (aio_result_t *)-1 && + kresultp == (aio_result_t *)-1) { + errno = EINVAL; + return ((aio_result_t *)-1); + } else { + return (NULL); + } + } + } + + for (;;) { + sig_mutex_lock(&__aio_mutex); + uresultp = _aio_req_done(); + if (uresultp != NULL && uresultp != (aio_result_t *)-1) { + sig_mutex_unlock(&__aio_mutex); + resultp = uresultp; + break; + } + _aiowait_flag++; + dontblock = (uresultp == (aio_result_t *)-1); + if (dontblock && _kaio_outstand_cnt == 0) { + kresultp = (aio_result_t *)-1; + kaio_errno = EINVAL; + } else { + sig_mutex_unlock(&__aio_mutex); + kresultp = (aio_result_t *)_kaio(AIOWAIT, + wait, dontblock); + sig_mutex_lock(&__aio_mutex); + kaio_errno = errno; + } + _aiowait_flag--; + sig_mutex_unlock(&__aio_mutex); + if (kresultp == (aio_result_t *)1) { + /* aiowait() awakened by an aionotify() */ + continue; + } else if (kresultp != NULL && + kresultp != (aio_result_t *)-1) { + resultp = kresultp; + sig_mutex_lock(&__aio_mutex); + _kaio_outstand_cnt--; + sig_mutex_unlock(&__aio_mutex); + break; + } else if (kresultp == (aio_result_t *)-1 && + kaio_errno == EINVAL && + uresultp == (aio_result_t *)-1) { + errno = kaio_errno; + resultp = (aio_result_t *)-1; + break; + } else if (kresultp == (aio_result_t *)-1 && + kaio_errno == EINTR) { + errno = kaio_errno; + resultp = (aio_result_t *)-1; + break; + } else if (timedwait) { + hres = hrtend - gethrtime(); + if (hres <= 0) { + /* time is up; return */ + resultp = NULL; + break; + } else { + /* + * Some time left. Round up the remaining time + * in nanoseconds to microsec. Retry the call. + */ + hres += (NANOSEC / MICROSEC) - 1; + wait->tv_sec = hres / NANOSEC; + wait->tv_usec = + (hres % NANOSEC) / (NANOSEC / MICROSEC); + } + } else { + ASSERT(kresultp == NULL && uresultp == NULL); + resultp = NULL; + continue; + } + } + return (resultp); +} + +/* + * _aio_get_timedelta calculates the remaining time and stores the result + * into timespec_t *wait. 
+ */
+
+int
+_aio_get_timedelta(timespec_t *end, timespec_t *wait)
+{
+	int ret = 0;
+	struct timeval cur;
+	timespec_t curtime;
+
+	(void) gettimeofday(&cur, NULL);
+	curtime.tv_sec = cur.tv_sec;
+	curtime.tv_nsec = cur.tv_usec * 1000;	/* convert us to ns */
+
+	if (end->tv_sec >= curtime.tv_sec) {
+		wait->tv_sec = end->tv_sec - curtime.tv_sec;
+		if (end->tv_nsec >= curtime.tv_nsec) {
+			wait->tv_nsec = end->tv_nsec - curtime.tv_nsec;
+			if (wait->tv_sec == 0 && wait->tv_nsec == 0)
+				ret = -1;	/* timer expired */
+		} else {
+			if (end->tv_sec > curtime.tv_sec) {
+				wait->tv_sec -= 1;
+				wait->tv_nsec = NANOSEC -
+				    (curtime.tv_nsec - end->tv_nsec);
+			} else {
+				ret = -1;	/* timer expired */
+			}
+		}
+	} else {
+		ret = -1;
+	}
+	return (ret);
+}
+
+/*
+ * If closing by file descriptor: we simply cancel all of the outstanding
+ * aio requests and return. Each of those requests will have noticed the
+ * cancellation either before, during, or after initiating its I/O.
+ */
+int
+aiocancel_all(int fd)
+{
+	aio_req_t *reqp;
+	aio_req_t **reqpp;
+	aio_worker_t *first;
+	aio_worker_t *next;
+	int canceled = 0;
+	int done = 0;
+	int cancelall = 0;
+
+	sig_mutex_lock(&__aio_mutex);
+
+	if (_aio_outstand_cnt == 0) {
+		sig_mutex_unlock(&__aio_mutex);
+		return (AIO_ALLDONE);
+	}
+
+	/*
+	 * Cancel requests from the read/write workers' queues.
+	 */
+	first = __nextworker_rw;
+	next = first;
+	do {
+		_aio_cancel_work(next, fd, &canceled, &done);
+	} while ((next = next->work_forw) != first);
+
+	/*
+	 * Finally, check whether there are requests on the done queue
+	 * that should be canceled.
+	 */
+	if (fd < 0)
+		cancelall = 1;
+	reqpp = &_aio_done_tail;
+	while ((reqp = *reqpp) != NULL) {
+		if (cancelall || reqp->req_args.fd == fd) {
+			*reqpp = reqp->req_next;
+			_aio_donecnt--;
+			(void) _aio_hash_del(reqp->req_resultp);
+			_aio_req_free(reqp);
+		} else
+			reqpp = &reqp->req_next;
+	}
+	if (cancelall) {
+		ASSERT(_aio_donecnt == 0);
+		_aio_done_head = NULL;
+	}
+	sig_mutex_unlock(&__aio_mutex);
+
+	if (canceled && done == 0)
+		return (AIO_CANCELED);
+	else if (done && canceled == 0)
+		return (AIO_ALLDONE);
+	else if ((canceled + done == 0) && KAIO_SUPPORTED(fd))
+		return ((int)_kaio(AIOCANCEL, fd, NULL));
+	return (AIO_NOTCANCELED);
+}
+
+/*
+ * Cancel requests from a given work queue. If the file descriptor
+ * parameter, fd, is non-negative, then only cancel those requests
+ * in this queue that are to this file descriptor. If the fd
+ * parameter is -1, then cancel all requests.
+ */
+static void
+_aio_cancel_work(aio_worker_t *aiowp, int fd, int *canceled, int *done)
+{
+	aio_req_t *reqp;
+
+	sig_mutex_lock(&aiowp->work_qlock1);
+	/*
+	 * Cancel queued requests first.
+	 */
+	reqp = aiowp->work_tail1;
+	while (reqp != NULL) {
+		if (fd < 0 || reqp->req_args.fd == fd) {
+			if (_aio_cancel_req(aiowp, reqp, canceled, done)) {
+				/*
+				 * The caller's locks were dropped;
+				 * reqp is now invalid, so start traversing
+				 * the list from the beginning again.
+				 */
+				reqp = aiowp->work_tail1;
+				continue;
+			}
+		}
+		reqp = reqp->req_next;
+	}
+	/*
+	 * Since the queued requests have been canceled, there can
+	 * only be one in-progress request that should be canceled.
+	 */
+	if ((reqp = aiowp->work_req) != NULL &&
+	    (fd < 0 || reqp->req_args.fd == fd))
+		(void) _aio_cancel_req(aiowp, reqp, canceled, done);
+	sig_mutex_unlock(&aiowp->work_qlock1);
+}
+
+/*
+ * Cancel a request. Return 1 if the caller's locks were temporarily
+ * dropped, otherwise return 0.
+ */ +int +_aio_cancel_req(aio_worker_t *aiowp, aio_req_t *reqp, int *canceled, int *done) +{ + int ostate = reqp->req_state; + + ASSERT(MUTEX_HELD(&__aio_mutex)); + ASSERT(MUTEX_HELD(&aiowp->work_qlock1)); + if (ostate == AIO_REQ_CANCELED) + return (0); + if (ostate == AIO_REQ_DONE || ostate == AIO_REQ_DONEQ) { + (*done)++; + return (0); + } + if (reqp->req_op == AIOFSYNC && reqp != aiowp->work_req) { + ASSERT(POSIX_AIO(reqp)); + /* Cancel the queued aio_fsync() request */ + if (!reqp->req_head->lio_canned) { + reqp->req_head->lio_canned = 1; + _aio_outstand_cnt--; + (*canceled)++; + } + return (0); + } + reqp->req_state = AIO_REQ_CANCELED; + _aio_req_del(aiowp, reqp, ostate); + (void) _aio_hash_del(reqp->req_resultp); + (*canceled)++; + if (reqp == aiowp->work_req) { + ASSERT(ostate == AIO_REQ_INPROGRESS); + /* + * Set the result values now, before _aiodone() is called. + * We do this because the application can expect aio_return + * and aio_errno to be set to -1 and ECANCELED, respectively, + * immediately after a successful return from aiocancel() + * or aio_cancel(). + */ + _aio_set_result(reqp, -1, ECANCELED); + (void) thr_kill(aiowp->work_tid, SIGAIOCANCEL); + return (0); + } + if (!POSIX_AIO(reqp)) { + _aio_outstand_cnt--; + _aio_set_result(reqp, -1, ECANCELED); + return (0); + } + sig_mutex_unlock(&aiowp->work_qlock1); + sig_mutex_unlock(&__aio_mutex); + _aiodone(reqp, -1, ECANCELED); + sig_mutex_lock(&__aio_mutex); + sig_mutex_lock(&aiowp->work_qlock1); + return (1); +} + +int +_aio_create_worker(aio_req_t *reqp, int mode) +{ + aio_worker_t *aiowp, **workers, **nextworker; + int *aio_workerscnt; + void *(*func)(void *); + sigset_t oset; + int error; + + /* + * Put the new worker thread in the right queue. + */ + switch (mode) { + case AIOREAD: + case AIOWRITE: + case AIOAREAD: + case AIOAWRITE: +#if !defined(_LP64) + case AIOAREAD64: + case AIOAWRITE64: +#endif + workers = &__workers_rw; + nextworker = &__nextworker_rw; + aio_workerscnt = &__rw_workerscnt; + func = _aio_do_request; + break; + case AIONOTIFY: + workers = &__workers_no; + nextworker = &__nextworker_no; + func = _aio_do_notify; + aio_workerscnt = &__no_workerscnt; + break; + default: + aio_panic("_aio_create_worker: invalid mode"); + break; + } + + if ((aiowp = _aio_worker_alloc()) == NULL) + return (-1); + + if (reqp) { + reqp->req_state = AIO_REQ_QUEUED; + reqp->req_worker = aiowp; + aiowp->work_head1 = reqp; + aiowp->work_tail1 = reqp; + aiowp->work_next1 = reqp; + aiowp->work_count1 = 1; + aiowp->work_minload1 = 1; + } + + (void) pthread_sigmask(SIG_SETMASK, &maskset, &oset); + error = thr_create(NULL, AIOSTKSIZE, func, aiowp, + THR_DAEMON | THR_SUSPENDED, &aiowp->work_tid); + (void) pthread_sigmask(SIG_SETMASK, &oset, NULL); + if (error) { + if (reqp) { + reqp->req_state = 0; + reqp->req_worker = NULL; + } + _aio_worker_free(aiowp); + return (-1); + } + + lmutex_lock(&__aio_mutex); + (*aio_workerscnt)++; + if (*workers == NULL) { + aiowp->work_forw = aiowp; + aiowp->work_backw = aiowp; + *nextworker = aiowp; + *workers = aiowp; + } else { + aiowp->work_backw = (*workers)->work_backw; + aiowp->work_forw = (*workers); + (*workers)->work_backw->work_forw = aiowp; + (*workers)->work_backw = aiowp; + } + _aio_worker_cnt++; + lmutex_unlock(&__aio_mutex); + + (void) thr_continue(aiowp->work_tid); + + return (0); +} + +/* + * This is the worker's main routine. + * The task of this function is to execute all queued requests; + * once the last pending request is executed this function will block + * in _aio_idle(). 
A new incoming request must wake up this thread to
+ * restart the work.
+ * Every worker has its own work queue. The queue lock is required
+ * to synchronize the addition of new requests for this worker or
+ * the cancellation of pending/running requests.
+ *
+ * Cancellation scenarios:
+ * The cancellation of a request is done asynchronously using
+ * _aio_cancel_req() from another thread context.
+ * A queued request can be cancelled in different manners:
+ * a) request is queued but not "in progress" or "done" (AIO_REQ_QUEUED):
+ *	- lock the queue -> remove the request -> unlock the queue
+ *	- this function/thread does not detect this cancellation process
+ * b) request is in progress (AIO_REQ_INPROGRESS):
+ *	- this function first allows the cancellation of the running
+ *	  request with the flag "work_cancel_flg=1";
+ *	  see _aio_req_get() -> _aio_cancel_on().
+ *	  During this phase it is legal to interrupt the worker
+ *	  thread running the request (this thread) using the SIGAIOCANCEL
+ *	  signal.
+ *	  Once this thread returns from the kernel (because the request
+ *	  is done), it must disable any further cancellation and proceed
+ *	  to finish the request. To disable the cancellation, this
+ *	  thread calls _aio_cancel_off() to set "work_cancel_flg=0".
+ * c) request is already done (AIO_REQ_DONE || AIO_REQ_DONEQ):
+ *	  same procedure as in a)
+ *
+ * Regarding b):
+ *	This thread uses sigsetjmp() to mark the position in the code
+ *	where it wishes to resume working in the case that a SIGAIOCANCEL
+ *	signal is detected.
+ *	Normally this thread gets the cancellation signal during the
+ *	kernel phase (reading or writing). In that case the signal handler
+ *	aiosigcancelhndlr() runs in the worker thread's context and
+ *	in turn uses siglongjmp() to break the standard code flow and
+ *	jump to the "sigsetjmp" position, provided that "work_cancel_flg"
+ *	is set to "1".
+ *	Because "work_cancel_flg" is manipulated only by this worker
+ *	thread, and the thread can run on only one CPU at a time, it is
+ *	not necessary to protect that flag with the queue lock.
+ *	On returning from the kernel (read or write system call), we must
+ *	first disable the use of the SIGAIOCANCEL signal, and with it the
+ *	use of siglongjmp(), to prevent a possible deadlock:
+ *	- This worker thread can return from the kernel and block
+ *	  on "work_qlock1";
+ *	- a second thread then cancels the apparently "in progress"
+ *	  request and sends the SIGAIOCANCEL signal to the worker thread;
+ *	- the worker thread is granted "work_qlock1" and returns
+ *	  from the kernel;
+ *	- the kernel detects the pending signal and activates the signal
+ *	  handler instead;
+ *	- if "work_cancel_flg" were still set, the signal handler would
+ *	  call siglongjmp() to cancel the "in progress" request and
+ *	  would try to acquire the same work_qlock1 in _aio_req_get()
+ *	  a second time => deadlock.
+ *	To avoid that situation we disable the cancellation of the request
+ *	in progress BEFORE we try to acquire work_qlock1.
+ *	In that case the signal handler does not call siglongjmp() and the
+ *	worker thread continues along the standard code flow. This thread
+ *	must then check the AIO_REQ_CANCELED flag to emulate the
+ *	siglongjmp() that would otherwise have been required, freeing
+ *	work_qlock1 and avoiding a deadlock.
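+ *
+ * A condensed sketch of the resulting control flow (illustrative
+ * only; see the body of _aio_do_request() below for the real code):
+ *
+ *	(void) sigsetjmp(aiowp->work_jmp_buf, 0);  <- resume here on cancel
+ *	sigoff(self);                              <- SIGAIOCANCEL blocked
+ *	for (;;) {
+ *		reqp = _aio_req_get(aiowp);        <- sets AIO_REQ_INPROGRESS
+ *		sigon(self);                       <- cancellation window opens
+ *		retval = pread()/pwrite() ...      <- may be siglongjmp'd out
+ *		sigoff(self);                      <- close the window BEFORE
+ *		_aio_finish_request(...);             taking work_qlock1
+ *	}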
+ */ +void * +_aio_do_request(void *arglist) +{ + aio_worker_t *aiowp = (aio_worker_t *)arglist; + ulwp_t *self = curthread; + struct aio_args *arg; + aio_req_t *reqp; /* current AIO request */ + ssize_t retval; + int error; + + if (pthread_setspecific(_aio_key, aiowp) != 0) + aio_panic("_aio_do_request, pthread_setspecific()"); + (void) pthread_sigmask(SIG_SETMASK, &_worker_set, NULL); + ASSERT(aiowp->work_req == NULL); + + /* + * We resume here when an operation is cancelled. + * On first entry, aiowp->work_req == NULL, so all + * we do is block SIGAIOCANCEL. + */ + (void) sigsetjmp(aiowp->work_jmp_buf, 0); + ASSERT(self->ul_sigdefer == 0); + + sigoff(self); /* block SIGAIOCANCEL */ + if (aiowp->work_req != NULL) + _aio_finish_request(aiowp, -1, ECANCELED); + + for (;;) { + /* + * Put completed requests on aio_done_list. This has + * to be done as part of the main loop to ensure that + * we don't artificially starve any aiowait'ers. + */ + if (aiowp->work_done1) + _aio_work_done(aiowp); + +top: + /* consume any deferred SIGAIOCANCEL signal here */ + sigon(self); + sigoff(self); + + while ((reqp = _aio_req_get(aiowp)) == NULL) { + if (_aio_idle(aiowp) != 0) + goto top; + } + arg = &reqp->req_args; + ASSERT(reqp->req_state == AIO_REQ_INPROGRESS || + reqp->req_state == AIO_REQ_CANCELED); + error = 0; + + switch (reqp->req_op) { + case AIOREAD: + case AIOAREAD: + sigon(self); /* unblock SIGAIOCANCEL */ + retval = pread(arg->fd, arg->buf, + arg->bufsz, arg->offset); + if (retval == -1) { + if (errno == ESPIPE) { + retval = read(arg->fd, + arg->buf, arg->bufsz); + if (retval == -1) + error = errno; + } else { + error = errno; + } + } + sigoff(self); /* block SIGAIOCANCEL */ + break; + case AIOWRITE: + case AIOAWRITE: + sigon(self); /* unblock SIGAIOCANCEL */ + retval = pwrite(arg->fd, arg->buf, + arg->bufsz, arg->offset); + if (retval == -1) { + if (errno == ESPIPE) { + retval = write(arg->fd, + arg->buf, arg->bufsz); + if (retval == -1) + error = errno; + } else { + error = errno; + } + } + sigoff(self); /* block SIGAIOCANCEL */ + break; +#if !defined(_LP64) + case AIOAREAD64: + sigon(self); /* unblock SIGAIOCANCEL */ + retval = pread64(arg->fd, arg->buf, + arg->bufsz, arg->offset); + if (retval == -1) { + if (errno == ESPIPE) { + retval = read(arg->fd, + arg->buf, arg->bufsz); + if (retval == -1) + error = errno; + } else { + error = errno; + } + } + sigoff(self); /* block SIGAIOCANCEL */ + break; + case AIOAWRITE64: + sigon(self); /* unblock SIGAIOCANCEL */ + retval = pwrite64(arg->fd, arg->buf, + arg->bufsz, arg->offset); + if (retval == -1) { + if (errno == ESPIPE) { + retval = write(arg->fd, + arg->buf, arg->bufsz); + if (retval == -1) + error = errno; + } else { + error = errno; + } + } + sigoff(self); /* block SIGAIOCANCEL */ + break; +#endif /* !defined(_LP64) */ + case AIOFSYNC: + if (_aio_fsync_del(aiowp, reqp)) + goto top; + ASSERT(reqp->req_head == NULL); + /* + * All writes for this fsync request are now + * acknowledged. Now make these writes visible + * and put the final request into the hash table. 
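+		 * (As the code below shows, the aio_fsync() operation is
+		 * carried in the arg->offset field, which an fsync request
+		 * does not otherwise use: O_SYNC selects __fdsync(fd, FSYNC);
+		 * any other value, presumably O_DSYNC, selects
+		 * __fdsync(fd, FDSYNC).)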
+ */ + if (reqp->req_state == AIO_REQ_CANCELED) { + /* EMPTY */; + } else if (arg->offset == O_SYNC) { + if ((retval = __fdsync(arg->fd, FSYNC)) == -1) + error = errno; + } else { + if ((retval = __fdsync(arg->fd, FDSYNC)) == -1) + error = errno; + } + if (_aio_hash_insert(reqp->req_resultp, reqp) != 0) + aio_panic("_aio_do_request(): AIOFSYNC: " + "request already in hash table"); + break; + default: + aio_panic("_aio_do_request, bad op"); + } + + _aio_finish_request(aiowp, retval, error); + } + /* NOTREACHED */ + return (NULL); +} + +/* + * Perform the tail processing for _aio_do_request(). + * The in-progress request may or may not have been cancelled. + */ +static void +_aio_finish_request(aio_worker_t *aiowp, ssize_t retval, int error) +{ + aio_req_t *reqp; + + sig_mutex_lock(&aiowp->work_qlock1); + if ((reqp = aiowp->work_req) == NULL) + sig_mutex_unlock(&aiowp->work_qlock1); + else { + aiowp->work_req = NULL; + if (reqp->req_state == AIO_REQ_CANCELED) { + retval = -1; + error = ECANCELED; + } + if (!POSIX_AIO(reqp)) { + sig_mutex_unlock(&aiowp->work_qlock1); + sig_mutex_lock(&__aio_mutex); + if (reqp->req_state == AIO_REQ_INPROGRESS) + reqp->req_state = AIO_REQ_DONE; + _aio_req_done_cnt++; + _aio_set_result(reqp, retval, error); + if (error == ECANCELED) + _aio_outstand_cnt--; + sig_mutex_unlock(&__aio_mutex); + } else { + if (reqp->req_state == AIO_REQ_INPROGRESS) + reqp->req_state = AIO_REQ_DONE; + sig_mutex_unlock(&aiowp->work_qlock1); + _aiodone(reqp, retval, error); + } + } +} + +void +_aio_req_mark_done(aio_req_t *reqp) +{ +#if !defined(_LP64) + if (reqp->req_largefile) + ((aiocb64_t *)reqp->req_aiocbp)->aio_state = USERAIO_DONE; + else +#endif + ((aiocb_t *)reqp->req_aiocbp)->aio_state = USERAIO_DONE; +} + +/* + * Sleep for 'ticks' clock ticks to give somebody else a chance to run, + * hopefully to consume one of our queued signals. + */ +static void +_aio_delay(int ticks) +{ + (void) usleep(ticks * (MICROSEC / hz)); +} + +/* + * Actually send the notifications. + * We could block indefinitely here if the application + * is not listening for the signal or port notifications. + */ +static void +send_notification(notif_param_t *npp) +{ + extern int __sigqueue(pid_t pid, int signo, + /* const union sigval */ void *value, int si_code, int block); + + if (npp->np_signo) + (void) __sigqueue(__pid, npp->np_signo, npp->np_user, + SI_ASYNCIO, 1); + else if (npp->np_port >= 0) + (void) _port_dispatch(npp->np_port, 0, PORT_SOURCE_AIO, + npp->np_event, npp->np_object, npp->np_user); + + if (npp->np_lio_signo) + (void) __sigqueue(__pid, npp->np_lio_signo, npp->np_lio_user, + SI_ASYNCIO, 1); + else if (npp->np_lio_port >= 0) + (void) _port_dispatch(npp->np_lio_port, 0, PORT_SOURCE_AIO, + npp->np_lio_event, npp->np_lio_object, npp->np_lio_user); +} + +/* + * Asynchronous notification worker. + */ +void * +_aio_do_notify(void *arg) +{ + aio_worker_t *aiowp = (aio_worker_t *)arg; + aio_req_t *reqp; + + /* + * This isn't really necessary. All signals are blocked. + */ + if (pthread_setspecific(_aio_key, aiowp) != 0) + aio_panic("_aio_do_notify, pthread_setspecific()"); + + /* + * Notifications are never cancelled. + * All signals remain blocked, forever. 
+ */ + for (;;) { + while ((reqp = _aio_req_get(aiowp)) == NULL) { + if (_aio_idle(aiowp) != 0) + aio_panic("_aio_do_notify: _aio_idle() failed"); + } + send_notification(&reqp->req_notify); + _aio_req_free(reqp); + } + + /* NOTREACHED */ + return (NULL); +} + +/* + * Do the completion semantics for a request that was either canceled + * by _aio_cancel_req() or was completed by _aio_do_request(). + */ +static void +_aiodone(aio_req_t *reqp, ssize_t retval, int error) +{ + aio_result_t *resultp = reqp->req_resultp; + int notify = 0; + aio_lio_t *head; + int sigev_none; + int sigev_signal; + int sigev_thread; + int sigev_port; + notif_param_t np; + + /* + * We call _aiodone() only for Posix I/O. + */ + ASSERT(POSIX_AIO(reqp)); + + sigev_none = 0; + sigev_signal = 0; + sigev_thread = 0; + sigev_port = 0; + np.np_signo = 0; + np.np_port = -1; + np.np_lio_signo = 0; + np.np_lio_port = -1; + + switch (reqp->req_sigevent.sigev_notify) { + case SIGEV_NONE: + sigev_none = 1; + break; + case SIGEV_SIGNAL: + sigev_signal = 1; + break; + case SIGEV_THREAD: + sigev_thread = 1; + break; + case SIGEV_PORT: + sigev_port = 1; + break; + default: + aio_panic("_aiodone: improper sigev_notify"); + break; + } + + /* + * Figure out the notification parameters while holding __aio_mutex. + * Actually perform the notifications after dropping __aio_mutex. + * This allows us to sleep for a long time (if the notifications + * incur delays) without impeding other async I/O operations. + */ + + sig_mutex_lock(&__aio_mutex); + + if (sigev_signal) { + if ((np.np_signo = reqp->req_sigevent.sigev_signo) != 0) + notify = 1; + np.np_user = reqp->req_sigevent.sigev_value.sival_ptr; + } else if (sigev_thread | sigev_port) { + if ((np.np_port = reqp->req_sigevent.sigev_signo) >= 0) + notify = 1; + np.np_event = reqp->req_op; + if (np.np_event == AIOFSYNC && reqp->req_largefile) + np.np_event = AIOFSYNC64; + np.np_object = (uintptr_t)reqp->req_aiocbp; + np.np_user = reqp->req_sigevent.sigev_value.sival_ptr; + } + + if (resultp->aio_errno == EINPROGRESS) + _aio_set_result(reqp, retval, error); + + _aio_outstand_cnt--; + + head = reqp->req_head; + reqp->req_head = NULL; + + if (sigev_none) { + _aio_enq_doneq(reqp); + reqp = NULL; + } else { + (void) _aio_hash_del(resultp); + _aio_req_mark_done(reqp); + } + + _aio_waitn_wakeup(); + + /* + * __aio_waitn() sets AIO_WAIT_INPROGRESS and + * __aio_suspend() increments "_aio_kernel_suspend" + * when they are waiting in the kernel for completed I/Os. + * + * _kaio(AIONOTIFY) awakes the corresponding function + * in the kernel; then the corresponding __aio_waitn() or + * __aio_suspend() function could reap the recently + * completed I/Os (_aiodone()). + */ + if ((_aio_flags & AIO_WAIT_INPROGRESS) || _aio_kernel_suspend > 0) + (void) _kaio(AIONOTIFY); + + sig_mutex_unlock(&__aio_mutex); + + if (head != NULL) { + /* + * If all the lio requests have completed, + * prepare to notify the waiting thread. 
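+		 * (head->lio_nent and head->lio_refcnt both start at nent,
+		 * set in lio_listio(); each completing request decrements
+		 * them, and the request that finds lio_refcnt == 1 performs
+		 * the list-wide notification below.)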
+		 */
+		sig_mutex_lock(&head->lio_mutex);
+		ASSERT(head->lio_refcnt == head->lio_nent);
+		if (head->lio_refcnt == 1) {
+			int waiting = 0;
+			if (head->lio_mode == LIO_WAIT) {
+				if ((waiting = head->lio_waiting) != 0)
+					(void) cond_signal(&head->lio_cond_cv);
+			} else if (head->lio_port < 0) { /* none or signal */
+				if ((np.np_lio_signo = head->lio_signo) != 0)
+					notify = 1;
+				np.np_lio_user = head->lio_sigval.sival_ptr;
+			} else {	/* thread or port */
+				notify = 1;
+				np.np_lio_port = head->lio_port;
+				np.np_lio_event = head->lio_event;
+				np.np_lio_object =
+				    (uintptr_t)head->lio_sigevent;
+				np.np_lio_user = head->lio_sigval.sival_ptr;
+			}
+			head->lio_nent = head->lio_refcnt = 0;
+			sig_mutex_unlock(&head->lio_mutex);
+			if (waiting == 0)
+				_aio_lio_free(head);
+		} else {
+			head->lio_nent--;
+			head->lio_refcnt--;
+			sig_mutex_unlock(&head->lio_mutex);
+		}
+	}
+
+	/*
+	 * The request is completed; now perform the notifications.
+	 */
+	if (notify) {
+		if (reqp != NULL) {
+			/*
+			 * We usually put the request on the notification
+			 * queue because we don't want to block and delay
+			 * other operations behind us in the work queue.
+			 * Also, we must never block on a cancel notification
+			 * because in that case we are being called from an
+			 * application thread, and that could lead to deadlock
+			 * if no other thread is receiving notifications.
+			 */
+			reqp->req_notify = np;
+			reqp->req_op = AIONOTIFY;
+			_aio_req_add(reqp, &__workers_no, AIONOTIFY);
+			reqp = NULL;
+		} else {
+			/*
+			 * We already put the request on the done queue,
+			 * so we can't queue it to the notification queue.
+			 * Just do the notification directly.
+			 */
+			send_notification(&np);
+		}
+	}
+
+	if (reqp != NULL)
+		_aio_req_free(reqp);
+}
+
+/*
+ * Delete fsync requests from list head until there is
+ * only one left. Return 0 when there is only one,
+ * otherwise return a non-zero value.
+ */
+static int
+_aio_fsync_del(aio_worker_t *aiowp, aio_req_t *reqp)
+{
+	aio_lio_t *head = reqp->req_head;
+	int rval = 0;
+
+	ASSERT(reqp == aiowp->work_req);
+	sig_mutex_lock(&aiowp->work_qlock1);
+	sig_mutex_lock(&head->lio_mutex);
+	if (head->lio_refcnt > 1) {
+		head->lio_refcnt--;
+		head->lio_nent--;
+		aiowp->work_req = NULL;
+		sig_mutex_unlock(&head->lio_mutex);
+		sig_mutex_unlock(&aiowp->work_qlock1);
+		sig_mutex_lock(&__aio_mutex);
+		_aio_outstand_cnt--;
+		_aio_waitn_wakeup();
+		sig_mutex_unlock(&__aio_mutex);
+		_aio_req_free(reqp);
+		return (1);
+	}
+	ASSERT(head->lio_nent == 1 && head->lio_refcnt == 1);
+	reqp->req_head = NULL;
+	if (head->lio_canned)
+		reqp->req_state = AIO_REQ_CANCELED;
+	if (head->lio_mode == LIO_DESTROY) {
+		aiowp->work_req = NULL;
+		rval = 1;
+	}
+	sig_mutex_unlock(&head->lio_mutex);
+	sig_mutex_unlock(&aiowp->work_qlock1);
+	head->lio_refcnt--;
+	head->lio_nent--;
+	_aio_lio_free(head);
+	if (rval != 0)
+		_aio_req_free(reqp);
+	return (rval);
+}
+
+/*
+ * A worker is set idle when its work queue is empty.
+ * The worker checks again that it has no more work
+ * and then goes to sleep waiting for more work.
+ */
+int
+_aio_idle(aio_worker_t *aiowp)
+{
+	int error = 0;
+
+	sig_mutex_lock(&aiowp->work_qlock1);
+	if (aiowp->work_count1 == 0) {
+		ASSERT(aiowp->work_minload1 == 0);
+		aiowp->work_idleflg = 1;
+		/*
+		 * A cancellation handler is not needed here.
+		 * aio worker threads are never cancelled via pthread_cancel().
+		 */
+		error = sig_cond_wait(&aiowp->work_idle_cv,
+		    &aiowp->work_qlock1);
+		/*
+		 * The idle flag is normally cleared before the worker is
+		 * awakened by _aio_req_add(). On error (EINTR), we clear
+		 * it ourselves.
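+		 *
+		 * The wakeup side of this handshake is in _aio_req_add():
+		 *
+		 *	if (aiowp->work_count1++ == 0 && aiowp->work_idleflg) {
+		 *		aiowp->work_idleflg = 0;
+		 *		(void) cond_signal(&aiowp->work_idle_cv);
+		 *	}
+		 *
+		 * Both sides run under work_qlock1, so a wakeup cannot be
+		 * missed between the check of work_count1 above and the wait.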
+ */ + if (error) + aiowp->work_idleflg = 0; + } + sig_mutex_unlock(&aiowp->work_qlock1); + return (error); +} + +/* + * A worker's completed AIO requests are placed onto a global + * done queue. The application is only sent a SIGIO signal if + * the process has a handler enabled and it is not waiting via + * aiowait(). + */ +static void +_aio_work_done(aio_worker_t *aiowp) +{ + aio_req_t *reqp; + + sig_mutex_lock(&aiowp->work_qlock1); + reqp = aiowp->work_prev1; + reqp->req_next = NULL; + aiowp->work_done1 = 0; + aiowp->work_tail1 = aiowp->work_next1; + if (aiowp->work_tail1 == NULL) + aiowp->work_head1 = NULL; + aiowp->work_prev1 = NULL; + sig_mutex_unlock(&aiowp->work_qlock1); + sig_mutex_lock(&__aio_mutex); + _aio_donecnt++; + _aio_outstand_cnt--; + _aio_req_done_cnt--; + ASSERT(_aio_donecnt > 0 && + _aio_outstand_cnt >= 0 && + _aio_req_done_cnt >= 0); + ASSERT(reqp != NULL); + + if (_aio_done_tail == NULL) { + _aio_done_head = _aio_done_tail = reqp; + } else { + _aio_done_head->req_next = reqp; + _aio_done_head = reqp; + } + + if (_aiowait_flag) { + sig_mutex_unlock(&__aio_mutex); + (void) _kaio(AIONOTIFY); + } else { + sig_mutex_unlock(&__aio_mutex); + if (_sigio_enabled) + (void) kill(__pid, SIGIO); + } +} + +/* + * The done queue consists of AIO requests that are in either the + * AIO_REQ_DONE or AIO_REQ_CANCELED state. Requests that were cancelled + * are discarded. If the done queue is empty then NULL is returned. + * Otherwise the address of a done aio_result_t is returned. + */ +aio_result_t * +_aio_req_done(void) +{ + aio_req_t *reqp; + aio_result_t *resultp; + + ASSERT(MUTEX_HELD(&__aio_mutex)); + + if ((reqp = _aio_done_tail) != NULL) { + if ((_aio_done_tail = reqp->req_next) == NULL) + _aio_done_head = NULL; + ASSERT(_aio_donecnt > 0); + _aio_donecnt--; + (void) _aio_hash_del(reqp->req_resultp); + resultp = reqp->req_resultp; + ASSERT(reqp->req_state == AIO_REQ_DONE); + _aio_req_free(reqp); + return (resultp); + } + /* is queue empty? */ + if (reqp == NULL && _aio_outstand_cnt == 0) { + return ((aio_result_t *)-1); + } + return (NULL); +} + +/* + * Set the return and errno values for the application's use. + * + * For the Posix interfaces, we must set the return value first followed + * by the errno value because the Posix interfaces allow for a change + * in the errno value from EINPROGRESS to something else to signal + * the completion of the asynchronous request. + * + * The opposite is true for the Solaris interfaces. These allow for + * a change in the return value from AIO_INPROGRESS to something else + * to signal the completion of the asynchronous request. + */ +void +_aio_set_result(aio_req_t *reqp, ssize_t retval, int error) +{ + aio_result_t *resultp = reqp->req_resultp; + + if (POSIX_AIO(reqp)) { + resultp->aio_return = retval; + membar_producer(); + resultp->aio_errno = error; + } else { + resultp->aio_errno = error; + membar_producer(); + resultp->aio_return = retval; + } +} + +/* + * Add an AIO request onto the next work queue. + * A circular list of workers is used to choose the next worker. + */ +void +_aio_req_add(aio_req_t *reqp, aio_worker_t **nextworker, int mode) +{ + ulwp_t *self = curthread; + aio_worker_t *aiowp; + aio_worker_t *first; + int load_bal_flg = 1; + int found; + + ASSERT(reqp->req_state != AIO_REQ_DONEQ); + reqp->req_next = NULL; + /* + * Try to acquire the next worker's work queue. 
If it is locked,
+	 * then search the list of workers until an unlocked queue is found,
+	 * or until the list has been completely traversed, at which point
+	 * another worker will be created.
+	 */
+	sigoff(self);		/* defer SIGIO */
+	sig_mutex_lock(&__aio_mutex);
+	first = aiowp = *nextworker;
+	if (mode != AIONOTIFY)
+		_aio_outstand_cnt++;
+	sig_mutex_unlock(&__aio_mutex);
+
+	switch (mode) {
+	case AIOREAD:
+	case AIOWRITE:
+	case AIOAREAD:
+	case AIOAWRITE:
+#if !defined(_LP64)
+	case AIOAREAD64:
+	case AIOAWRITE64:
+#endif
+		/* try to find an idle worker */
+		found = 0;
+		do {
+			if (sig_mutex_trylock(&aiowp->work_qlock1) == 0) {
+				if (aiowp->work_idleflg) {
+					found = 1;
+					break;
+				}
+				sig_mutex_unlock(&aiowp->work_qlock1);
+			}
+		} while ((aiowp = aiowp->work_forw) != first);
+
+		if (found) {
+			aiowp->work_minload1++;
+			break;
+		}
+
+		/* try to acquire some worker's queue lock */
+		do {
+			if (sig_mutex_trylock(&aiowp->work_qlock1) == 0) {
+				found = 1;
+				break;
+			}
+		} while ((aiowp = aiowp->work_forw) != first);
+
+		/*
+		 * Create more workers when the workers appear overloaded.
+		 * Either all the workers are busy draining their queues
+		 * or no worker's queue lock could be acquired.
+		 */
+		if (!found) {
+			if (_aio_worker_cnt < _max_workers) {
+				if (_aio_create_worker(reqp, mode))
+					aio_panic("_aio_req_add: add worker");
+				sigon(self);	/* reenable SIGIO */
+				return;
+			}
+
+			/*
+			 * No worker is available and we have already created
+			 * _max_workers; keep going through the list slowly
+			 * until we get a lock.
+			 */
+			while (sig_mutex_trylock(&aiowp->work_qlock1) != 0) {
+				/*
+				 * Give someone else a chance.
+				 */
+				_aio_delay(1);
+				aiowp = aiowp->work_forw;
+			}
+		}
+
+		ASSERT(MUTEX_HELD(&aiowp->work_qlock1));
+		if (_aio_worker_cnt < _max_workers &&
+		    aiowp->work_minload1 >= _minworkload) {
+			sig_mutex_unlock(&aiowp->work_qlock1);
+			sig_mutex_lock(&__aio_mutex);
+			*nextworker = aiowp->work_forw;
+			sig_mutex_unlock(&__aio_mutex);
+			if (_aio_create_worker(reqp, mode))
+				aio_panic("_aio_req_add: add worker");
+			sigon(self);	/* reenable SIGIO */
+			return;
+		}
+		aiowp->work_minload1++;
+		break;
+	case AIOFSYNC:
+	case AIONOTIFY:
+		load_bal_flg = 0;
+		sig_mutex_lock(&aiowp->work_qlock1);
+		break;
+	default:
+		aio_panic("_aio_req_add: invalid mode");
+		break;
+	}
+	/*
+	 * Put the request onto the worker's work queue.
+	 */
+	if (aiowp->work_tail1 == NULL) {
+		ASSERT(aiowp->work_count1 == 0);
+		aiowp->work_tail1 = reqp;
+		aiowp->work_next1 = reqp;
+	} else {
+		aiowp->work_head1->req_next = reqp;
+		if (aiowp->work_next1 == NULL)
+			aiowp->work_next1 = reqp;
+	}
+	reqp->req_state = AIO_REQ_QUEUED;
+	reqp->req_worker = aiowp;
+	aiowp->work_head1 = reqp;
+	/*
+	 * Awaken the worker if it is not currently active.
+	 */
+	if (aiowp->work_count1++ == 0 && aiowp->work_idleflg) {
+		aiowp->work_idleflg = 0;
+		(void) cond_signal(&aiowp->work_idle_cv);
+	}
+	sig_mutex_unlock(&aiowp->work_qlock1);
+
+	if (load_bal_flg) {
+		sig_mutex_lock(&__aio_mutex);
+		*nextworker = aiowp->work_forw;
+		sig_mutex_unlock(&__aio_mutex);
+	}
+	sigon(self);		/* reenable SIGIO */
+}
+
+/*
+ * Get an AIO request for a specified worker.
+ * If the work queue is empty, return NULL.
+ */
+aio_req_t *
+_aio_req_get(aio_worker_t *aiowp)
+{
+	aio_req_t *reqp;
+
+	sig_mutex_lock(&aiowp->work_qlock1);
+	if ((reqp = aiowp->work_next1) != NULL) {
+		/*
+		 * Remove a POSIX request from the queue; the
+		 * request queue is a singly linked list
+		 * with a previous pointer. The request is
+		 * removed by updating the previous pointer.
+ * + * Non-posix requests are left on the queue + * to eventually be placed on the done queue. + */ + + if (POSIX_AIO(reqp)) { + if (aiowp->work_prev1 == NULL) { + aiowp->work_tail1 = reqp->req_next; + if (aiowp->work_tail1 == NULL) + aiowp->work_head1 = NULL; + } else { + aiowp->work_prev1->req_next = reqp->req_next; + if (aiowp->work_head1 == reqp) + aiowp->work_head1 = reqp->req_next; + } + + } else { + aiowp->work_prev1 = reqp; + ASSERT(aiowp->work_done1 >= 0); + aiowp->work_done1++; + } + ASSERT(reqp != reqp->req_next); + aiowp->work_next1 = reqp->req_next; + ASSERT(aiowp->work_count1 >= 1); + aiowp->work_count1--; + switch (reqp->req_op) { + case AIOREAD: + case AIOWRITE: + case AIOAREAD: + case AIOAWRITE: +#if !defined(_LP64) + case AIOAREAD64: + case AIOAWRITE64: +#endif + ASSERT(aiowp->work_minload1 > 0); + aiowp->work_minload1--; + break; + } + reqp->req_state = AIO_REQ_INPROGRESS; + } + aiowp->work_req = reqp; + ASSERT(reqp != NULL || aiowp->work_count1 == 0); + sig_mutex_unlock(&aiowp->work_qlock1); + return (reqp); +} + +static void +_aio_req_del(aio_worker_t *aiowp, aio_req_t *reqp, int ostate) +{ + aio_req_t **last; + aio_req_t *lastrp; + aio_req_t *next; + + ASSERT(aiowp != NULL); + ASSERT(MUTEX_HELD(&aiowp->work_qlock1)); + if (POSIX_AIO(reqp)) { + if (ostate != AIO_REQ_QUEUED) + return; + } + last = &aiowp->work_tail1; + lastrp = aiowp->work_tail1; + ASSERT(ostate == AIO_REQ_QUEUED || ostate == AIO_REQ_INPROGRESS); + while ((next = *last) != NULL) { + if (next == reqp) { + *last = next->req_next; + if (aiowp->work_next1 == next) + aiowp->work_next1 = next->req_next; + + if ((next->req_next != NULL) || + (aiowp->work_done1 == 0)) { + if (aiowp->work_head1 == next) + aiowp->work_head1 = next->req_next; + if (aiowp->work_prev1 == next) + aiowp->work_prev1 = next->req_next; + } else { + if (aiowp->work_head1 == next) + aiowp->work_head1 = lastrp; + if (aiowp->work_prev1 == next) + aiowp->work_prev1 = lastrp; + } + + if (ostate == AIO_REQ_QUEUED) { + ASSERT(aiowp->work_count1 >= 1); + aiowp->work_count1--; + ASSERT(aiowp->work_minload1 >= 1); + aiowp->work_minload1--; + } else { + ASSERT(ostate == AIO_REQ_INPROGRESS && + !POSIX_AIO(reqp)); + aiowp->work_done1--; + } + return; + } + last = &next->req_next; + lastrp = next; + } + /* NOTREACHED */ +} + +static void +_aio_enq_doneq(aio_req_t *reqp) +{ + if (_aio_doneq == NULL) { + _aio_doneq = reqp; + reqp->req_next = reqp->req_prev = reqp; + } else { + reqp->req_next = _aio_doneq; + reqp->req_prev = _aio_doneq->req_prev; + _aio_doneq->req_prev->req_next = reqp; + _aio_doneq->req_prev = reqp; + } + reqp->req_state = AIO_REQ_DONEQ; + _aio_doneq_cnt++; +} + +/* + * caller owns the _aio_mutex + */ +aio_req_t * +_aio_req_remove(aio_req_t *reqp) +{ + if (reqp && reqp->req_state != AIO_REQ_DONEQ) + return (NULL); + + if (reqp) { + /* request in done queue */ + if (_aio_doneq == reqp) + _aio_doneq = reqp->req_next; + if (_aio_doneq == reqp) { + /* only one request on queue */ + _aio_doneq = NULL; + } else { + aio_req_t *tmp = reqp->req_next; + reqp->req_prev->req_next = tmp; + tmp->req_prev = reqp->req_prev; + } + } else if ((reqp = _aio_doneq) != NULL) { + if (reqp == reqp->req_next) { + /* only one request on queue */ + _aio_doneq = NULL; + } else { + reqp->req_prev->req_next = _aio_doneq = reqp->req_next; + _aio_doneq->req_prev = reqp->req_prev; + } + } + if (reqp) { + _aio_doneq_cnt--; + reqp->req_next = reqp->req_prev = reqp; + reqp->req_state = AIO_REQ_DONE; + } + return (reqp); +} + +/* + * An AIO request is identified by an 
aio_result_t pointer. The library
+ * maps this aio_result_t pointer to its internal representation using a
+ * hash table. This function adds an aio_result_t pointer to the hash table.
+ */
+static int
+_aio_hash_insert(aio_result_t *resultp, aio_req_t *reqp)
+{
+	aio_hash_t *hashp;
+	aio_req_t **prev;
+	aio_req_t *next;
+
+	hashp = _aio_hash + AIOHASH(resultp);
+	lmutex_lock(&hashp->hash_lock);
+	prev = &hashp->hash_ptr;
+	while ((next = *prev) != NULL) {
+		if (resultp == next->req_resultp) {
+			lmutex_unlock(&hashp->hash_lock);
+			return (-1);
+		}
+		prev = &next->req_link;
+	}
+	*prev = reqp;
+	ASSERT(reqp->req_link == NULL);
+	lmutex_unlock(&hashp->hash_lock);
+	return (0);
+}
+
+/*
+ * Remove an entry from the hash table.
+ */
+aio_req_t *
+_aio_hash_del(aio_result_t *resultp)
+{
+	aio_hash_t *hashp;
+	aio_req_t **prev;
+	aio_req_t *next = NULL;
+
+	if (_aio_hash != NULL) {
+		hashp = _aio_hash + AIOHASH(resultp);
+		lmutex_lock(&hashp->hash_lock);
+		prev = &hashp->hash_ptr;
+		while ((next = *prev) != NULL) {
+			if (resultp == next->req_resultp) {
+				*prev = next->req_link;
+				next->req_link = NULL;
+				break;
+			}
+			prev = &next->req_link;
+		}
+		lmutex_unlock(&hashp->hash_lock);
+	}
+	return (next);
+}
+
+/*
+ * Find an entry in the hash table.
+ */
+aio_req_t *
+_aio_hash_find(aio_result_t *resultp)
+{
+	aio_hash_t *hashp;
+	aio_req_t **prev;
+	aio_req_t *next = NULL;
+
+	if (_aio_hash != NULL) {
+		hashp = _aio_hash + AIOHASH(resultp);
+		lmutex_lock(&hashp->hash_lock);
+		prev = &hashp->hash_ptr;
+		while ((next = *prev) != NULL) {
+			if (resultp == next->req_resultp)
+				break;
+			prev = &next->req_link;
+		}
+		lmutex_unlock(&hashp->hash_lock);
+	}
+	return (next);
+}
+
+/*
+ * AIO interface for POSIX
+ */
+int
+_aio_rw(aiocb_t *aiocbp, aio_lio_t *lio_head, aio_worker_t **nextworker,
+    int mode, int flg)
+{
+	aio_req_t *reqp;
+	aio_args_t *ap;
+	int kerr;
+
+	if (aiocbp == NULL) {
+		errno = EINVAL;
+		return (-1);
+	}
+
+	/* initialize kaio */
+	if (!_kaio_ok)
+		_kaio_init();
+
+	aiocbp->aio_state = NOCHECK;
+
+	/*
+	 * If we have been called because a list I/O
+	 * kaio() failed, we don't want to repeat the
+	 * system call.
+	 */
+
+	if (flg & AIO_KAIO) {
+		/*
+		 * Try kernel aio first.
+		 * If errno is ENOTSUP/EBADFD,
+		 * fall back to the thread implementation.
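+		 * For example, a request against a device that supports
+		 * kernel aio returns here with kerr == 0; a request against
+		 * a file system that does not (e.g. UFS, see lio_listio()
+		 * below) fails with ENOTSUP and is resubmitted to a
+		 * user-level worker thread; EBADFD additionally marks the
+		 * descriptor via SET_KAIO_NOT_SUPPORTED() so that later
+		 * requests skip the kaio() attempt entirely.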
+		 */
+		if (_kaio_ok > 0 && KAIO_SUPPORTED(aiocbp->aio_fildes)) {
+			aiocbp->aio_resultp.aio_errno = EINPROGRESS;
+			aiocbp->aio_state = CHECK;
+			kerr = (int)_kaio(mode, aiocbp);
+			if (kerr == 0)
+				return (0);
+			if (errno != ENOTSUP && errno != EBADFD) {
+				aiocbp->aio_resultp.aio_errno = errno;
+				aiocbp->aio_resultp.aio_return = -1;
+				aiocbp->aio_state = NOCHECK;
+				return (-1);
+			}
+			if (errno == EBADFD)
+				SET_KAIO_NOT_SUPPORTED(aiocbp->aio_fildes);
+		}
+	}
+
+	aiocbp->aio_resultp.aio_errno = EINPROGRESS;
+	aiocbp->aio_state = USERAIO;
+
+	if (!__uaio_ok && __uaio_init() == -1)
+		return (-1);
+
+	if ((reqp = _aio_req_alloc()) == NULL) {
+		errno = EAGAIN;
+		return (-1);
+	}
+
+	/*
+	 * If this is an LIO request, add the list head to the aio request.
+	 */
+	reqp->req_head = lio_head;
+	reqp->req_type = AIO_POSIX_REQ;
+	reqp->req_op = mode;
+	reqp->req_largefile = 0;
+
+	if (aiocbp->aio_sigevent.sigev_notify == SIGEV_NONE) {
+		reqp->req_sigevent.sigev_notify = SIGEV_NONE;
+	} else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
+		reqp->req_sigevent.sigev_notify = SIGEV_SIGNAL;
+		reqp->req_sigevent.sigev_signo =
+		    aiocbp->aio_sigevent.sigev_signo;
+		reqp->req_sigevent.sigev_value.sival_ptr =
+		    aiocbp->aio_sigevent.sigev_value.sival_ptr;
+	} else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_PORT) {
+		port_notify_t *pn = aiocbp->aio_sigevent.sigev_value.sival_ptr;
+		reqp->req_sigevent.sigev_notify = SIGEV_PORT;
+		/*
+		 * Reuse the sigevent structure to contain the port number
+		 * and the user value. Same for SIGEV_THREAD, below.
+		 */
+		reqp->req_sigevent.sigev_signo =
+		    pn->portnfy_port;
+		reqp->req_sigevent.sigev_value.sival_ptr =
+		    pn->portnfy_user;
+	} else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_THREAD) {
+		reqp->req_sigevent.sigev_notify = SIGEV_THREAD;
+		/*
+		 * The sigevent structure contains the port number
+		 * and the user value. Same for SIGEV_PORT, above.
+		 */
+		reqp->req_sigevent.sigev_signo =
+		    aiocbp->aio_sigevent.sigev_signo;
+		reqp->req_sigevent.sigev_value.sival_ptr =
+		    aiocbp->aio_sigevent.sigev_value.sival_ptr;
+	}
+
+	reqp->req_resultp = &aiocbp->aio_resultp;
+	reqp->req_aiocbp = aiocbp;
+	ap = &reqp->req_args;
+	ap->fd = aiocbp->aio_fildes;
+	ap->buf = (caddr_t)aiocbp->aio_buf;
+	ap->bufsz = aiocbp->aio_nbytes;
+	ap->offset = aiocbp->aio_offset;
+
+	if ((flg & AIO_NO_DUPS) &&
+	    _aio_hash_insert(&aiocbp->aio_resultp, reqp) != 0) {
+		aio_panic("_aio_rw(): request already in hash table");
+		_aio_req_free(reqp);
+		errno = EINVAL;
+		return (-1);
+	}
+	_aio_req_add(reqp, nextworker, mode);
+	return (0);
+}
+
+#if !defined(_LP64)
+/*
+ * 64-bit AIO interface for POSIX
+ */
+int
+_aio_rw64(aiocb64_t *aiocbp, aio_lio_t *lio_head, aio_worker_t **nextworker,
+    int mode, int flg)
+{
+	aio_req_t *reqp;
+	aio_args_t *ap;
+	int kerr;
+
+	if (aiocbp == NULL) {
+		errno = EINVAL;
+		return (-1);
+	}
+
+	/* initialize kaio */
+	if (!_kaio_ok)
+		_kaio_init();
+
+	aiocbp->aio_state = NOCHECK;
+
+	/*
+	 * If we have been called because a list I/O
+	 * kaio() failed, we don't want to repeat the
+	 * system call.
+	 */
+
+	if (flg & AIO_KAIO) {
+		/*
+		 * Try kernel aio first.
+		 * If errno is ENOTSUP/EBADFD,
+		 * fall back to the thread implementation.
+ */ + if (_kaio_ok > 0 && KAIO_SUPPORTED(aiocbp->aio_fildes)) { + aiocbp->aio_resultp.aio_errno = EINPROGRESS; + aiocbp->aio_state = CHECK; + kerr = (int)_kaio(mode, aiocbp); + if (kerr == 0) + return (0); + if (errno != ENOTSUP && errno != EBADFD) { + aiocbp->aio_resultp.aio_errno = errno; + aiocbp->aio_resultp.aio_return = -1; + aiocbp->aio_state = NOCHECK; + return (-1); + } + if (errno == EBADFD) + SET_KAIO_NOT_SUPPORTED(aiocbp->aio_fildes); + } + } + + aiocbp->aio_resultp.aio_errno = EINPROGRESS; + aiocbp->aio_state = USERAIO; + + if (!__uaio_ok && __uaio_init() == -1) + return (-1); + + if ((reqp = _aio_req_alloc()) == NULL) { + errno = EAGAIN; + return (-1); + } + + /* + * If an LIO request, add the list head to the aio request + */ + reqp->req_head = lio_head; + reqp->req_type = AIO_POSIX_REQ; + reqp->req_op = mode; + reqp->req_largefile = 1; + + if (aiocbp->aio_sigevent.sigev_notify == SIGEV_NONE) { + reqp->req_sigevent.sigev_notify = SIGEV_NONE; + } else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_SIGNAL) { + reqp->req_sigevent.sigev_notify = SIGEV_SIGNAL; + reqp->req_sigevent.sigev_signo = + aiocbp->aio_sigevent.sigev_signo; + reqp->req_sigevent.sigev_value.sival_ptr = + aiocbp->aio_sigevent.sigev_value.sival_ptr; + } else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_PORT) { + port_notify_t *pn = aiocbp->aio_sigevent.sigev_value.sival_ptr; + reqp->req_sigevent.sigev_notify = SIGEV_PORT; + reqp->req_sigevent.sigev_signo = + pn->portnfy_port; + reqp->req_sigevent.sigev_value.sival_ptr = + pn->portnfy_user; + } else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_THREAD) { + reqp->req_sigevent.sigev_notify = SIGEV_THREAD; + reqp->req_sigevent.sigev_signo = + aiocbp->aio_sigevent.sigev_signo; + reqp->req_sigevent.sigev_value.sival_ptr = + aiocbp->aio_sigevent.sigev_value.sival_ptr; + } + + reqp->req_resultp = &aiocbp->aio_resultp; + reqp->req_aiocbp = aiocbp; + ap = &reqp->req_args; + ap->fd = aiocbp->aio_fildes; + ap->buf = (caddr_t)aiocbp->aio_buf; + ap->bufsz = aiocbp->aio_nbytes; + ap->offset = aiocbp->aio_offset; + + if ((flg & AIO_NO_DUPS) && + _aio_hash_insert(&aiocbp->aio_resultp, reqp) != 0) { + aio_panic("_aio_rw64(): request already in hash table"); + _aio_req_free(reqp); + errno = EINVAL; + return (-1); + } + _aio_req_add(reqp, nextworker, mode); + return (0); +} +#endif /* !defined(_LP64) */ diff --git a/usr/src/lib/libc/port/aio/aio_alloc.c b/usr/src/lib/libc/port/aio/aio_alloc.c new file mode 100644 index 0000000000..db919872e4 --- /dev/null +++ b/usr/src/lib/libc/port/aio/aio_alloc.c @@ -0,0 +1,435 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include "synonyms.h" +#include "thr_uberdata.h" +#include "asyncio.h" + +/* + * The aio subsystem memory allocation strategy: + * + * For each of the structure types we wish to allocate/free + * (aio_worker_t, aio_req_t, aio_lio_t), we use mmap() to allocate + * chunks of memory which are then subdivided into individual + * elements which are put into a free list from which allocations + * are made and to which frees are returned. + * + * Chunks start small (8 Kbytes) and get larger (size doubling) + * as more chunks are needed. This keeps memory usage small for + * light use and fragmentation small for heavy use. + * + * Chunks are never unmapped except as an aftermath of fork() + * in the child process, when they are all unmapped (because + * all of the worker threads disappear in the child). + */ + +#define INITIAL_CHUNKSIZE (8 * 1024) + +/* + * The header structure for each chunk. + * A pointer and a size_t ensures proper alignment for whatever follows. + */ +typedef struct chunk { + struct chunk *chunk_next; /* linked list */ + size_t chunk_size; /* size of this chunk */ +} chunk_t; + +chunk_t *chunk_list = NULL; /* list of all chunks */ +mutex_t chunk_lock = DEFAULTMUTEX; + +chunk_t * +chunk_alloc(size_t size) +{ + chunk_t *chp = NULL; + void *ptr; + + ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON, -1, (off_t)0); + if (ptr != MAP_FAILED) { + lmutex_lock(&chunk_lock); + chp = ptr; + chp->chunk_next = chunk_list; + chunk_list = chp; + chp->chunk_size = size; + lmutex_unlock(&chunk_lock); + } + + return (chp); +} + +aio_worker_t *worker_freelist = NULL; /* free list of worker structures */ +aio_worker_t *worker_freelast = NULL; +size_t worker_chunksize = 0; +mutex_t worker_lock = DEFAULTMUTEX; + +/* + * Allocate a worker control block. + */ +aio_worker_t * +_aio_worker_alloc(void) +{ + aio_worker_t *aiowp; + chunk_t *chp; + size_t chunksize; + int nelem; + int i; + + lmutex_lock(&worker_lock); + if ((aiowp = worker_freelist) == NULL) { + if ((chunksize = 2 * worker_chunksize) == 0) + chunksize = INITIAL_CHUNKSIZE; + if ((chp = chunk_alloc(chunksize)) == NULL) { + lmutex_unlock(&worker_lock); + return (NULL); + } + worker_chunksize = chunksize; + worker_freelist = (aio_worker_t *)(uintptr_t)(chp + 1); + nelem = (chunksize - sizeof (chunk_t)) / sizeof (aio_worker_t); + for (i = 0, aiowp = worker_freelist; i < nelem; i++, aiowp++) + aiowp->work_forw = aiowp + 1; + worker_freelast = aiowp - 1; + worker_freelast->work_forw = NULL; + aiowp = worker_freelist; + } + if ((worker_freelist = aiowp->work_forw) == NULL) + worker_freelast = NULL; + lmutex_unlock(&worker_lock); + + aiowp->work_forw = NULL; + (void) mutex_init(&aiowp->work_qlock1, USYNC_THREAD, NULL); + (void) cond_init(&aiowp->work_idle_cv, USYNC_THREAD, NULL); + + return (aiowp); +} + +/* + * Free a worker control block. + * Declared with void *arg so it can be a pthread_key_create() destructor. 
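+ * (init_aio() at the end of this file registers it exactly that way:
+ * pthread_key_create(&_aio_key, _aio_worker_free).)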
+ */ +void +_aio_worker_free(void *arg) +{ + aio_worker_t *aiowp = arg; + + (void) mutex_destroy(&aiowp->work_qlock1); + (void) cond_destroy(&aiowp->work_idle_cv); + (void) memset(aiowp, 0, sizeof (*aiowp)); + + lmutex_lock(&worker_lock); + if (worker_freelast == NULL) { + worker_freelist = worker_freelast = aiowp; + } else { + worker_freelast->work_forw = aiowp; + worker_freelast = aiowp; + } + lmutex_unlock(&worker_lock); +} + +aio_req_t *_aio_freelist = NULL; /* free list of request structures */ +aio_req_t *_aio_freelast = NULL; +size_t request_chunksize = 0; +int _aio_freelist_cnt = 0; +int _aio_allocated_cnt = 0; +mutex_t __aio_cache_lock = DEFAULTMUTEX; + +/* + * Allocate an aio request structure. + */ +aio_req_t * +_aio_req_alloc(void) +{ + aio_req_t *reqp; + chunk_t *chp; + size_t chunksize; + int nelem; + int i; + + lmutex_lock(&__aio_cache_lock); + if ((reqp = _aio_freelist) == NULL) { + if ((chunksize = 2 * request_chunksize) == 0) + chunksize = INITIAL_CHUNKSIZE; + if ((chp = chunk_alloc(chunksize)) == NULL) { + lmutex_unlock(&__aio_cache_lock); + return (NULL); + } + request_chunksize = chunksize; + _aio_freelist = (aio_req_t *)(uintptr_t)(chp + 1); + nelem = (chunksize - sizeof (chunk_t)) / sizeof (aio_req_t); + for (i = 0, reqp = _aio_freelist; i < nelem; i++, reqp++) { + reqp->req_state = AIO_REQ_FREE; + reqp->req_link = reqp + 1; + } + _aio_freelast = reqp - 1; + _aio_freelast->req_link = NULL; + _aio_freelist_cnt = nelem; + reqp = _aio_freelist; + } + if ((_aio_freelist = reqp->req_link) == NULL) + _aio_freelast = NULL; + _aio_freelist_cnt--; + _aio_allocated_cnt++; + lmutex_unlock(&__aio_cache_lock); + + ASSERT(reqp->req_state == AIO_REQ_FREE); + reqp->req_state = 0; + reqp->req_link = NULL; + reqp->req_sigevent.sigev_notify = SIGEV_NONE; + + return (reqp); +} + +/* + * Free an aio request structure. + */ +void +_aio_req_free(aio_req_t *reqp) +{ + ASSERT(reqp->req_state != AIO_REQ_FREE && + reqp->req_state != AIO_REQ_DONEQ); + (void) memset(reqp, 0, sizeof (*reqp)); + reqp->req_state = AIO_REQ_FREE; + + lmutex_lock(&__aio_cache_lock); + if (_aio_freelast == NULL) { + _aio_freelist = _aio_freelast = reqp; + } else { + _aio_freelast->req_link = reqp; + _aio_freelast = reqp; + } + _aio_freelist_cnt++; + _aio_allocated_cnt--; + lmutex_unlock(&__aio_cache_lock); +} + +aio_lio_t *_lio_head_freelist = NULL; /* free list of lio head structures */ +aio_lio_t *_lio_head_freelast = NULL; +size_t lio_head_chunksize = 0; +int _lio_alloc = 0; +int _lio_free = 0; +mutex_t __lio_mutex = DEFAULTMUTEX; + +/* + * Allocate a listio head structure. 
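+ * The allocation pattern is the same as for the worker and request
+ * structures above: the first chunk is INITIAL_CHUNKSIZE (8 Kbytes)
+ * and yields (chunksize - sizeof (chunk_t)) / sizeof (aio_lio_t)
+ * list heads; each subsequent chunk doubles in size.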
+ */ +aio_lio_t * +_aio_lio_alloc(void) +{ + aio_lio_t *head; + chunk_t *chp; + size_t chunksize; + int nelem; + int i; + + lmutex_lock(&__lio_mutex); + if ((head = _lio_head_freelist) == NULL) { + if ((chunksize = 2 * lio_head_chunksize) == 0) + chunksize = INITIAL_CHUNKSIZE; + if ((chp = chunk_alloc(chunksize)) == NULL) { + lmutex_unlock(&__lio_mutex); + return (NULL); + } + lio_head_chunksize = chunksize; + _lio_head_freelist = (aio_lio_t *)(uintptr_t)(chp + 1); + nelem = (chunksize - sizeof (chunk_t)) / sizeof (aio_lio_t); + for (i = 0, head = _lio_head_freelist; i < nelem; i++, head++) + head->lio_next = head + 1; + _lio_head_freelast = head - 1; + _lio_head_freelast->lio_next = NULL; + _lio_alloc += nelem; + _lio_free = nelem; + head = _lio_head_freelist; + } + if ((_lio_head_freelist = head->lio_next) == NULL) + _lio_head_freelast = NULL; + _lio_free--; + lmutex_unlock(&__lio_mutex); + + ASSERT(head->lio_nent == 0 && head->lio_refcnt == 0); + head->lio_next = NULL; + head->lio_port = -1; + (void) mutex_init(&head->lio_mutex, USYNC_THREAD, NULL); + (void) cond_init(&head->lio_cond_cv, USYNC_THREAD, NULL); + + return (head); +} + +/* + * Free a listio head structure. + */ +void +_aio_lio_free(aio_lio_t *head) +{ + ASSERT(head->lio_nent == 0 && head->lio_refcnt == 0); + (void) mutex_destroy(&head->lio_mutex); + (void) cond_destroy(&head->lio_cond_cv); + (void) memset(head, 0, sizeof (*head)); + + lmutex_lock(&__lio_mutex); + if (_lio_head_freelast == NULL) { + _lio_head_freelist = _lio_head_freelast = head; + } else { + _lio_head_freelast->lio_next = head; + _lio_head_freelast = head; + } + _lio_free++; + lmutex_unlock(&__lio_mutex); +} + +void +postfork1_child_aio(void) +{ + chunk_t *chp; + + /* + * All of the workers are gone; free their structures. 
+ */ + if (_kaio_supported != NULL) { + (void) munmap((void *)_kaio_supported, + MAX_KAIO_FDARRAY_SIZE * sizeof (uint32_t)); + _kaio_supported = NULL; + } + if (_aio_hash != NULL) { + (void) munmap((void *)_aio_hash, HASHSZ * sizeof (aio_hash_t)); + _aio_hash = NULL; + } + for (chp = chunk_list; chp != NULL; chp = chunk_list) { + chunk_list = chp->chunk_next; + (void) munmap((void *)chp, chp->chunk_size); + } + + /* + * Reinitialize global variables + */ + + worker_freelist = NULL; + worker_freelast = NULL; + worker_chunksize = 0; + (void) mutex_init(&worker_lock, USYNC_THREAD, NULL); + + _aio_freelist = NULL; + _aio_freelast = NULL; + request_chunksize = 0; + _aio_freelist_cnt = 0; + _aio_allocated_cnt = 0; + (void) mutex_init(&__aio_cache_lock, USYNC_THREAD, NULL); + + _lio_head_freelist = NULL; + _lio_head_freelast = NULL; + lio_head_chunksize = 0; + _lio_alloc = 0; + _lio_free = 0; + (void) mutex_init(&__lio_mutex, USYNC_THREAD, NULL); + + (void) mutex_init(&__aio_initlock, USYNC_THREAD, NULL); + (void) cond_init(&__aio_initcv, USYNC_THREAD, NULL); + __aio_initbusy = 0; + + (void) mutex_init(&__aio_mutex, USYNC_THREAD, NULL); + (void) cond_init(&_aio_iowait_cv, USYNC_THREAD, NULL); + (void) cond_init(&_aio_waitn_cv, USYNC_THREAD, NULL); + + _kaio_ok = 0; + __uaio_ok = 0; + + _kaiowp = NULL; + + __workers_rw = NULL; + __nextworker_rw = NULL; + __rw_workerscnt = 0; + + __workers_no = NULL; + __nextworker_no = NULL; + __no_workerscnt = 0; + + _aio_worker_cnt = 0; + + _aio_done_head = NULL; + _aio_done_tail = NULL; + _aio_donecnt = 0; + + _aio_doneq = NULL; + _aio_doneq_cnt = 0; + + _aio_waitncnt = 0; + _aio_outstand_cnt = 0; + _kaio_outstand_cnt = 0; + _aio_req_done_cnt = 0; + _aio_kernel_suspend = 0; + _aio_suscv_cnt = 0; + + _aiowait_flag = 0; + _aio_flags = 0; +} + +#define DISPLAY(var) \ + (void) fprintf(stderr, #var "\t= %d\n", var) + +static void +_aio_exit_info(void) +{ + if ((_kaio_ok | __uaio_ok) == 0) + return; + (void) fprintf(stderr, "\n"); + DISPLAY(_aio_freelist_cnt); + DISPLAY(_aio_allocated_cnt); + DISPLAY(_lio_alloc); + DISPLAY(_lio_free); + DISPLAY(__rw_workerscnt); + DISPLAY(__no_workerscnt); + DISPLAY(_aio_worker_cnt); + DISPLAY(_aio_donecnt); + DISPLAY(_aio_doneq_cnt); + DISPLAY(_aio_waitncnt); + DISPLAY(_aio_outstand_cnt); + DISPLAY(_kaio_outstand_cnt); + DISPLAY(_aio_req_done_cnt); + DISPLAY(_aio_kernel_suspend); + DISPLAY(_aio_suscv_cnt); + DISPLAY(_aiowait_flag); + DISPLAY(_aio_flags); +} + +void +init_aio(void) +{ + char *str; + + (void) pthread_key_create(&_aio_key, _aio_worker_free); + if ((str = getenv("_AIO_MIN_WORKERS")) != NULL) { + if ((_min_workers = atoi(str)) <= 0) + _min_workers = 4; + } + if ((str = getenv("_AIO_MAX_WORKERS")) != NULL) { + if ((_max_workers = atoi(str)) <= 0) + _max_workers = 256; + if (_max_workers < _min_workers + 1) + _max_workers = _min_workers + 1; + } + if ((str = getenv("_AIO_EXIT_INFO")) != NULL && atoi(str) != 0) + (void) atexit(_aio_exit_info); +} diff --git a/usr/src/lib/libc/port/aio/posix_aio.c b/usr/src/lib/libc/port/aio/posix_aio.c new file mode 100644 index 0000000000..5e3c3ac41d --- /dev/null +++ b/usr/src/lib/libc/port/aio/posix_aio.c @@ -0,0 +1,1758 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * posix_aio.c implements the POSIX async. I/O functions. + * + * aio_read + * aio_write + * aio_error + * aio_return + * aio_suspend + * lio_listio + * aio_fsync + * aio_cancel + */ + +#include "synonyms.h" +#include "thr_uberdata.h" +#include "asyncio.h" +#include <atomic.h> +#include <sys/file.h> +#include <sys/port.h> + +extern int __fdsync(int, int); + +cond_t _aio_waitn_cv = DEFAULTCV; /* wait for end of aio_waitn */ + +static int _aio_check_timeout(const timespec_t *, timespec_t *, int *); + +/* defines for timedwait in __aio_waitn() and __aio_suspend() */ +#define AIO_TIMEOUT_INDEF -1 +#define AIO_TIMEOUT_POLL 0 +#define AIO_TIMEOUT_WAIT 1 +#define AIO_TIMEOUT_UNDEF 2 + +/* + * List I/O stuff + */ +static void _lio_list_decr(aio_lio_t *); +static long aio_list_max = 0; + +int +aio_read(aiocb_t *aiocbp) +{ + if (aiocbp == NULL || aiocbp->aio_reqprio < 0) { + errno = EINVAL; + return (-1); + } + if (_aio_hash_find(&aiocbp->aio_resultp) != NULL) { + errno = EBUSY; + return (-1); + } + if (_aio_sigev_thread(aiocbp) != 0) + return (-1); + aiocbp->aio_lio_opcode = LIO_READ; + return (_aio_rw(aiocbp, NULL, &__nextworker_rw, AIOAREAD, + (AIO_KAIO | AIO_NO_DUPS))); +} + +int +aio_write(aiocb_t *aiocbp) +{ + if (aiocbp == NULL || aiocbp->aio_reqprio < 0) { + errno = EINVAL; + return (-1); + } + if (_aio_hash_find(&aiocbp->aio_resultp) != NULL) { + errno = EBUSY; + return (-1); + } + if (_aio_sigev_thread(aiocbp) != 0) + return (-1); + aiocbp->aio_lio_opcode = LIO_WRITE; + return (_aio_rw(aiocbp, NULL, &__nextworker_rw, AIOAWRITE, + (AIO_KAIO | AIO_NO_DUPS))); +} + +/* + * __lio_listio() cancellation handler. 
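+ * It is installed around the sig_cond_wait() in lio_listio() below
+ * (pthread_cleanup_push(_lio_listio_cleanup, head)), so a thread
+ * cancelled while waiting for list completion drops lio_mutex and,
+ * if it held the last reference, frees the list head.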
+ */ +/* ARGSUSED */ +static void +_lio_listio_cleanup(aio_lio_t *head) +{ + int freeit = 0; + + ASSERT(MUTEX_HELD(&head->lio_mutex)); + if (head->lio_refcnt == 0) { + ASSERT(head->lio_nent == 0); + freeit = 1; + } + head->lio_waiting = 0; + sig_mutex_unlock(&head->lio_mutex); + if (freeit) + _aio_lio_free(head); +} + +int +lio_listio(int mode, aiocb_t *_RESTRICT_KYWD const *_RESTRICT_KYWD list, + int nent, struct sigevent *_RESTRICT_KYWD sigevp) +{ + int aio_ufs = 0; + int oerrno = 0; + aio_lio_t *head = NULL; + aiocb_t *aiocbp; + int state = 0; + int EIOflg = 0; + int rw; + int do_kaio = 0; + int error; + int i; + + if (!_kaio_ok) + _kaio_init(); + + if (aio_list_max == 0) + aio_list_max = sysconf(_SC_AIO_LISTIO_MAX); + + if (nent <= 0 || nent > aio_list_max) { + errno = EINVAL; + return (-1); + } + + switch (mode) { + case LIO_WAIT: + state = NOCHECK; + break; + case LIO_NOWAIT: + state = CHECK; + break; + default: + errno = EINVAL; + return (-1); + } + + for (i = 0; i < nent; i++) { + if ((aiocbp = list[i]) == NULL) + continue; + if (_aio_hash_find(&aiocbp->aio_resultp) != NULL) { + errno = EBUSY; + return (-1); + } + if (_aio_sigev_thread(aiocbp) != 0) + return (-1); + if (aiocbp->aio_lio_opcode == LIO_NOP) + aiocbp->aio_state = NOCHECK; + else { + aiocbp->aio_state = state; + if (KAIO_SUPPORTED(aiocbp->aio_fildes)) + do_kaio++; + else + aiocbp->aio_resultp.aio_errno = ENOTSUP; + } + } + if (_aio_sigev_thread_init(sigevp) != 0) + return (-1); + + if (do_kaio) { + error = (int)_kaio(AIOLIO, mode, list, nent, sigevp); + if (error == 0) + return (0); + oerrno = errno; + } else { + oerrno = errno = ENOTSUP; + error = -1; + } + + if (error == -1 && errno == ENOTSUP) { + error = errno = 0; + /* + * If LIO_WAIT, or notification required, allocate a list head. 
+ */ + if (mode == LIO_WAIT || + (sigevp != NULL && + (sigevp->sigev_notify == SIGEV_SIGNAL || + sigevp->sigev_notify == SIGEV_THREAD || + sigevp->sigev_notify == SIGEV_PORT))) + head = _aio_lio_alloc(); + if (head) { + sig_mutex_lock(&head->lio_mutex); + head->lio_mode = mode; + head->lio_largefile = 0; + if (mode == LIO_NOWAIT && sigevp != NULL) { + if (sigevp->sigev_notify == SIGEV_THREAD) { + head->lio_port = sigevp->sigev_signo; + head->lio_event = AIOLIO; + head->lio_sigevent = sigevp; + head->lio_sigval.sival_ptr = + sigevp->sigev_value.sival_ptr; + } else if (sigevp->sigev_notify == SIGEV_PORT) { + port_notify_t *pn = + sigevp->sigev_value.sival_ptr; + head->lio_port = pn->portnfy_port; + head->lio_event = AIOLIO; + head->lio_sigevent = sigevp; + head->lio_sigval.sival_ptr = + pn->portnfy_user; + } else { /* SIGEV_SIGNAL */ + head->lio_signo = sigevp->sigev_signo; + head->lio_sigval.sival_ptr = + sigevp->sigev_value.sival_ptr; + } + } + head->lio_nent = head->lio_refcnt = nent; + sig_mutex_unlock(&head->lio_mutex); + } + /* + * find UFS requests, errno == ENOTSUP/EBADFD, + */ + for (i = 0; i < nent; i++) { + if ((aiocbp = list[i]) == NULL || + aiocbp->aio_lio_opcode == LIO_NOP || + (aiocbp->aio_resultp.aio_errno != ENOTSUP && + aiocbp->aio_resultp.aio_errno != EBADFD)) { + if (head) + _lio_list_decr(head); + continue; + } + if (aiocbp->aio_resultp.aio_errno == EBADFD) + SET_KAIO_NOT_SUPPORTED(aiocbp->aio_fildes); + if (aiocbp->aio_reqprio < 0) { + aiocbp->aio_resultp.aio_errno = EINVAL; + aiocbp->aio_resultp.aio_return = -1; + EIOflg = 1; + if (head) + _lio_list_decr(head); + continue; + } + /* + * submit an AIO request with flags AIO_NO_KAIO + * to avoid the kaio() syscall in _aio_rw() + */ + switch (aiocbp->aio_lio_opcode) { + case LIO_READ: + rw = AIOAREAD; + break; + case LIO_WRITE: + rw = AIOAWRITE; + break; + } + error = _aio_rw(aiocbp, head, &__nextworker_rw, rw, + (AIO_NO_KAIO | AIO_NO_DUPS)); + if (error == 0) + aio_ufs++; + else { + if (head) + _lio_list_decr(head); + aiocbp->aio_resultp.aio_errno = error; + EIOflg = 1; + } + } + } + if (EIOflg) { + errno = EIO; + return (-1); + } + if (mode == LIO_WAIT && oerrno == ENOTSUP) { + /* + * call kaio(AIOLIOWAIT) to get all outstanding + * kernel AIO requests + */ + if ((nent - aio_ufs) > 0) + (void) _kaio(AIOLIOWAIT, mode, list, nent, sigevp); + if (head != NULL && head->lio_nent > 0) { + sig_mutex_lock(&head->lio_mutex); + while (head->lio_refcnt > 0) { + int err; + head->lio_waiting = 1; + pthread_cleanup_push(_lio_listio_cleanup, head); + err = sig_cond_wait(&head->lio_cond_cv, + &head->lio_mutex); + pthread_cleanup_pop(0); + head->lio_waiting = 0; + if (err && head->lio_nent > 0) { + sig_mutex_unlock(&head->lio_mutex); + errno = err; + return (-1); + } + } + sig_mutex_unlock(&head->lio_mutex); + ASSERT(head->lio_nent == 0 && head->lio_refcnt == 0); + _aio_lio_free(head); + for (i = 0; i < nent; i++) { + if ((aiocbp = list[i]) != NULL && + aiocbp->aio_resultp.aio_errno) { + errno = EIO; + return (-1); + } + } + } + return (0); + } + return (error); +} + +static void +_lio_list_decr(aio_lio_t *head) +{ + sig_mutex_lock(&head->lio_mutex); + head->lio_nent--; + head->lio_refcnt--; + sig_mutex_unlock(&head->lio_mutex); +} + +/* + * __aio_suspend() cancellation handler. 
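+ * Runs if the caller is cancelled while blocked in _kaio(AIOSUSPEND)
+ * or in sig_cond_wait(); it decrements whichever counter was pushed
+ * (_aio_kernel_suspend or _aio_suscv_cnt) and drops __aio_mutex.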
+ */
+/* ARGSUSED */
+static void
+_aio_suspend_cleanup(int *counter)
+{
+	ASSERT(MUTEX_HELD(&__aio_mutex));
+	(*counter)--;		/* _aio_kernel_suspend or _aio_suscv_cnt */
+	sig_mutex_unlock(&__aio_mutex);
+}
+
+static int
+__aio_suspend(void **list, int nent, const timespec_t *timo, int largefile)
+{
+	int		cv_err;	/* error code from cond_xxx() */
+	int		kerr;	/* error code from _kaio(AIOSUSPEND) */
+	int		i;
+	timespec_t	twait;	/* copy of timo for internal calculations */
+	timespec_t	*wait = NULL;
+	int		timedwait;
+	int		req_outstanding;
+	aiocb_t		**listp;
+	aiocb_t		*aiocbp;
+#if !defined(_LP64)
+	aiocb64_t	**listp64;
+	aiocb64_t	*aiocbp64;
+#endif
+	hrtime_t	hrtstart;
+	hrtime_t	hrtend;
+	hrtime_t	hrtres;
+
+#if defined(_LP64)
+	if (largefile)
+		aio_panic("__aio_suspend: largefile set when _LP64 defined");
+#endif
+
+	if (nent <= 0) {
+		errno = EINVAL;
+		return (-1);
+	}
+
+	if (timo) {
+		if (timo->tv_sec < 0 || timo->tv_nsec < 0 ||
+		    timo->tv_nsec >= NANOSEC) {
+			errno = EINVAL;
+			return (-1);
+		}
+		/* Initialize start time if time monitoring desired */
+		if (timo->tv_sec > 0 || timo->tv_nsec > 0) {
+			timedwait = AIO_TIMEOUT_WAIT;
+			hrtstart = gethrtime();
+		} else {
+			/* content of timeout = 0 : polling */
+			timedwait = AIO_TIMEOUT_POLL;
+		}
+	} else {
+		/* timeout pointer = NULL : wait indefinitely */
+		timedwait = AIO_TIMEOUT_INDEF;
+	}
+
+#if !defined(_LP64)
+	if (largefile) {
+		listp64 = (aiocb64_t **)list;
+		for (i = 0; i < nent; i++) {
+			if ((aiocbp64 = listp64[i]) != NULL &&
+			    aiocbp64->aio_state == CHECK)
+				aiocbp64->aio_state = CHECKED;
+		}
+	} else
+#endif	/* !_LP64 */
+	{
+		listp = (aiocb_t **)list;
+		for (i = 0; i < nent; i++) {
+			if ((aiocbp = listp[i]) != NULL &&
+			    aiocbp->aio_state == CHECK)
+				aiocbp->aio_state = CHECKED;
+		}
+	}
+
+	sig_mutex_lock(&__aio_mutex);
+
+	/*
+	 * The next "if" block is required to accelerate the
+	 * access to completed RAW-IO requests.
+	 */
+	if ((_aio_doneq_cnt + _aio_outstand_cnt) == 0) {
+		/* Only kernel requests pending */
+
+		/*
+		 * _aio_kernel_suspend is used to detect completed non RAW-IO
+		 * requests.
+		 * As long as this thread resides in the kernel (_kaio) further
+		 * asynchronous non RAW-IO requests could be submitted.
+		 */
+		_aio_kernel_suspend++;
+
+		/*
+		 * Always do the kaio() call without using the KAIO_SUPPORTED()
+		 * checks because it is not mandatory to have a valid fd
+		 * set in the list entries, only the resultp must be set.
+		 *
+		 * _kaio(AIOSUSPEND ...) return values :
+		 * 0:  everything ok, completed request found
+		 * -1: error
+		 * 1:  no error : _aiodone awakened the _kaio(AIOSUSPEND,,)
+		 *	system call using _kaio(AIONOTIFY). It means that some
+		 *	non RAW-IOs completed in between.
+		 */
+
+		pthread_cleanup_push(_aio_suspend_cleanup,
+		    &_aio_kernel_suspend);
+		pthread_cleanup_push(sig_mutex_lock, &__aio_mutex);
+		sig_mutex_unlock(&__aio_mutex);
+		_cancel_prologue();
+		kerr = (int)_kaio(largefile? AIOSUSPEND64 : AIOSUSPEND,
+		    list, nent, timo, -1);
+		_cancel_epilogue();
+		pthread_cleanup_pop(1);	/* sig_mutex_lock(&__aio_mutex) */
+		pthread_cleanup_pop(0);
+
+		_aio_kernel_suspend--;
+
+		if (!kerr) {
+			sig_mutex_unlock(&__aio_mutex);
+			return (0);
+		}
+	} else {
+		kerr = 1;	/* simulation: _kaio detected AIONOTIFY */
+	}
+
+	/*
+	 * Return kernel error code if no other IOs are outstanding.
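+	 * A kerr of 1 only means that _aiodone() interrupted the kernel
+	 * wait via _kaio(AIONOTIFY); the library (non RAW-IO) requests
+	 * still have to be scanned below.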
+ */ + req_outstanding = _aio_doneq_cnt + _aio_outstand_cnt; + + sig_mutex_unlock(&__aio_mutex); + + if (req_outstanding == 0) { + /* no IOs outstanding in the thread pool */ + if (kerr == 1) + /* return "no IOs completed" */ + errno = EAGAIN; + return (-1); + } + + /* + * IOs using the thread pool are outstanding. + */ + if (timedwait == AIO_TIMEOUT_WAIT) { + /* time monitoring */ + hrtend = hrtstart + (hrtime_t)timo->tv_sec * (hrtime_t)NANOSEC + + (hrtime_t)timo->tv_nsec; + hrtres = hrtend - gethrtime(); + if (hrtres <= 0) + hrtres = 1; + twait.tv_sec = hrtres / (hrtime_t)NANOSEC; + twait.tv_nsec = hrtres % (hrtime_t)NANOSEC; + wait = &twait; + } else if (timedwait == AIO_TIMEOUT_POLL) { + twait = *timo; /* content of timo = 0 : polling */ + wait = &twait; + } + + for (;;) { + int error; + int inprogress; + + /* first scan file system requests */ + inprogress = 0; + for (i = 0; i < nent; i++) { +#if !defined(_LP64) + if (largefile) { + if ((aiocbp64 = listp64[i]) == NULL) + continue; + error = aiocbp64->aio_resultp.aio_errno; + } else +#endif + { + if ((aiocbp = listp[i]) == NULL) + continue; + error = aiocbp->aio_resultp.aio_errno; + } + if (error == EINPROGRESS) + inprogress = 1; + else if (error != ECANCELED) { + errno = 0; + return (0); + } + } + + sig_mutex_lock(&__aio_mutex); + + /* + * If there aren't outstanding I/Os in the thread pool then + * we have to return here, provided that all kernel RAW-IOs + * also completed. + * If the kernel was notified to return, then we have to check + * possible pending RAW-IOs. + */ + if (_aio_outstand_cnt == 0 && inprogress == 0 && kerr != 1) { + sig_mutex_unlock(&__aio_mutex); + errno = EAGAIN; + break; + } + + /* + * There are outstanding IOs in the thread pool or the kernel + * was notified to return. + * Check pending RAW-IOs first. + */ + if (kerr == 1) { + /* + * _aiodone just notified the kernel about + * completed non RAW-IOs (AIONOTIFY was detected). + */ + if (timedwait == AIO_TIMEOUT_WAIT) { + /* Update remaining timeout for the kernel */ + hrtres = hrtend - gethrtime(); + if (hrtres <= 0) { + /* timer expired */ + sig_mutex_unlock(&__aio_mutex); + errno = EAGAIN; + break; + } + wait->tv_sec = hrtres / (hrtime_t)NANOSEC; + wait->tv_nsec = hrtres % (hrtime_t)NANOSEC; + } + _aio_kernel_suspend++; + + pthread_cleanup_push(_aio_suspend_cleanup, + &_aio_kernel_suspend); + pthread_cleanup_push(sig_mutex_lock, &__aio_mutex); + sig_mutex_unlock(&__aio_mutex); + _cancel_prologue(); + kerr = (int)_kaio(largefile? 
AIOSUSPEND64 : AIOSUSPEND,
+			    list, nent, wait, -1);
+			_cancel_epilogue();
+			pthread_cleanup_pop(1);
+			pthread_cleanup_pop(0);
+
+			_aio_kernel_suspend--;
+
+			if (!kerr) {
+				sig_mutex_unlock(&__aio_mutex);
+				return (0);
+			}
+		}
+
+		if (timedwait == AIO_TIMEOUT_POLL) {
+			sig_mutex_unlock(&__aio_mutex);
+			errno = EAGAIN;
+			break;
+		}
+
+		if (timedwait == AIO_TIMEOUT_WAIT) {
+			/* Update remaining timeout */
+			hrtres = hrtend - gethrtime();
+			if (hrtres <= 0) {
+				/* timer expired */
+				sig_mutex_unlock(&__aio_mutex);
+				errno = EAGAIN;
+				break;
+			}
+			wait->tv_sec = hrtres / (hrtime_t)NANOSEC;
+			wait->tv_nsec = hrtres % (hrtime_t)NANOSEC;
+		}
+
+		if (_aio_outstand_cnt == 0) {
+			sig_mutex_unlock(&__aio_mutex);
+			continue;
+		}
+
+		_aio_suscv_cnt++;	/* ID for _aiodone (wake up) */
+
+		pthread_cleanup_push(_aio_suspend_cleanup, &_aio_suscv_cnt);
+		if (timedwait == AIO_TIMEOUT_WAIT) {
+			cv_err = sig_cond_reltimedwait(&_aio_iowait_cv,
+			    &__aio_mutex, wait);
+			if (cv_err == ETIME)
+				cv_err = EAGAIN;
+		} else {
+			/* wait indefinitely */
+			cv_err = sig_cond_wait(&_aio_iowait_cv, &__aio_mutex);
+		}
+		/* this decrements _aio_suscv_cnt and drops __aio_mutex */
+		pthread_cleanup_pop(1);
+
+		if (cv_err) {
+			errno = cv_err;
+			break;
+		}
+	}
+	return (-1);
+}
+
+int
+aio_suspend(const aiocb_t * const list[], int nent,
+	const timespec_t *timeout)
+{
+	return (__aio_suspend((void **)list, nent, timeout, 0));
+}
+
+int
+aio_error(const aiocb_t *aiocbp)
+{
+	const aio_result_t *resultp = &aiocbp->aio_resultp;
+	int error;
+
+	if ((error = resultp->aio_errno) == EINPROGRESS) {
+		if (aiocbp->aio_state == CHECK) {
+			/*
+			 * Always do the kaio() call without using the
+			 * KAIO_SUPPORTED() checks because it is not
+			 * mandatory to have a valid fd set in the
+			 * aiocb, only the resultp must be set.
+			 */
+			if ((int)_kaio(AIOERROR, aiocbp) == EINVAL) {
+				errno = EINVAL;
+				return (-1);
+			}
+			error = resultp->aio_errno;
+		} else if (aiocbp->aio_state == CHECKED) {
+			((aiocb_t *)aiocbp)->aio_state = CHECK;
+		}
+	}
+	return (error);
+}
+
+ssize_t
+aio_return(aiocb_t *aiocbp)
+{
+	aio_result_t *resultp = &aiocbp->aio_resultp;
+	aio_req_t *reqp;
+	int error;
+	ssize_t retval;
+
+	/*
+	 * The _aiodone() function stores resultp->aio_return before
+	 * storing resultp->aio_errno (with a membar_producer() in
+	 * between).  We use membar_consumer() below to ensure proper
+	 * memory ordering between _aiodone() and ourselves.
+	 */
+	error = resultp->aio_errno;
+	membar_consumer();
+	retval = resultp->aio_return;
+
+	/*
+	 * We use this condition to indicate either that aio_return()
+	 * has already been called or that it should not have been
+	 * called yet.
+	 */
+	if ((retval == -1 && error == EINVAL) || error == EINPROGRESS) {
+		errno = error;
+		return (-1);
+	}
+
+	/*
+	 * Before we return, mark the result as being returned so that later
+	 * calls to aio_return() will return the fact that the result has
+	 * already been returned.
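+	 * The sentinel state is aio_return == -1 paired with aio_errno ==
+	 * EINVAL; it is retested under __aio_mutex below in case several
+	 * threads race into aio_return() for the same aiocb.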
+ */ + sig_mutex_lock(&__aio_mutex); + /* retest, in case more than one thread actually got in here */ + if (resultp->aio_return == -1 && resultp->aio_errno == EINVAL) { + sig_mutex_unlock(&__aio_mutex); + errno = EINVAL; + return (-1); + } + resultp->aio_return = -1; + resultp->aio_errno = EINVAL; + if ((reqp = _aio_hash_del(resultp)) == NULL) + sig_mutex_unlock(&__aio_mutex); + else { + aiocbp->aio_state = NOCHECK; + ASSERT(reqp->req_head == NULL); + (void) _aio_req_remove(reqp); + sig_mutex_unlock(&__aio_mutex); + _aio_req_free(reqp); + } + + if (retval == -1) + errno = error; + return (retval); +} + +void +_lio_remove(aio_req_t *reqp) +{ + aio_lio_t *head; + int refcnt; + + if ((head = reqp->req_head) != NULL) { + sig_mutex_lock(&head->lio_mutex); + ASSERT(head->lio_refcnt == head->lio_nent); + refcnt = --head->lio_nent; + head->lio_refcnt--; + sig_mutex_unlock(&head->lio_mutex); + if (refcnt == 0) + _aio_lio_free(head); + reqp->req_head = NULL; + } +} + +/* + * This function returns the number of asynchronous I/O requests submitted. + */ +static int +__aio_fsync_bar(aiocb_t *aiocbp, aio_lio_t *head, aio_worker_t *aiowp, + int workerscnt) +{ + int i; + int error; + aio_worker_t *next = aiowp; + + for (i = 0; i < workerscnt; i++) { + error = _aio_rw(aiocbp, head, &next, AIOFSYNC, AIO_NO_KAIO); + if (error != 0) { + sig_mutex_lock(&head->lio_mutex); + head->lio_mode = LIO_DESTROY; /* ignore fsync */ + head->lio_nent -= workerscnt - i; + head->lio_refcnt -= workerscnt - i; + sig_mutex_unlock(&head->lio_mutex); + errno = EAGAIN; + return (i); + } + next = next->work_forw; + } + return (i); +} + +int +aio_fsync(int op, aiocb_t *aiocbp) +{ + aio_lio_t *head; + struct stat statb; + int fret; + + if (aiocbp == NULL) + return (0); + if (aiocbp->aio_reqprio < 0 || (op != O_DSYNC && op != O_SYNC)) { + errno = EINVAL; + return (-1); + } + if (_aio_hash_find(&aiocbp->aio_resultp) != NULL) { + errno = EBUSY; + return (-1); + } + if (fstat(aiocbp->aio_fildes, &statb) < 0) + return (-1); + if (_aio_sigev_thread(aiocbp) != 0) + return (-1); + + /* + * Kernel aio_fsync() is not supported. + * We force user-level aio_fsync() just + * for the notification side-effect. + */ + if (!__uaio_ok && __uaio_init() == -1) + return (-1); + + /* + * The first asynchronous I/O request in the current process will + * create a bunch of workers (via __uaio_init()). If the number + * of workers is zero then the number of pending asynchronous I/O + * requests is zero. In such a case only execute the standard + * fsync(3C) or fdatasync(3RT) as appropriate. + */ + if (__rw_workerscnt == 0) { + if (op == O_DSYNC) + return (__fdsync(aiocbp->aio_fildes, FDSYNC)); + else + return (__fdsync(aiocbp->aio_fildes, FSYNC)); + } + + /* + * re-use aio_offset as the op field. + * O_DSYNC - fdatasync() + * O_SYNC - fsync() + */ + aiocbp->aio_offset = op; + aiocbp->aio_lio_opcode = AIOFSYNC; + + /* + * Create a list of fsync requests. The worker that + * gets the last request will do the fsync request. + */ + head = _aio_lio_alloc(); + if (head == NULL) { + errno = EAGAIN; + return (-1); + } + head->lio_mode = LIO_FSYNC; + head->lio_nent = head->lio_refcnt = __rw_workerscnt; + head->lio_largefile = 0; + + /* + * Insert an fsync request on every worker's queue. + */ + fret = __aio_fsync_bar(aiocbp, head, __workers_rw, __rw_workerscnt); + if (fret != __rw_workerscnt) { + /* + * Fewer fsync requests than workers means that it was + * not possible to submit fsync requests to all workers. 
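+		 * (__aio_fsync_bar() has already dropped the unsubmitted
+		 * requests from lio_nent and lio_refcnt.)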
+		 * Actions:
+		 * a) number of fsync requests submitted is 0:
+		 *	=> free allocated memory (aio_lio_t).
+		 * b) number of fsync requests submitted is > 0:
+		 *	=> the last worker executing the fsync request
+		 *	   will free the aio_lio_t struct.
+		 */
+		if (fret == 0)
+			_aio_lio_free(head);
+		return (-1);
+	}
+	return (0);
+}
+
+int
+aio_cancel(int fd, aiocb_t *aiocbp)
+{
+	aio_req_t *reqp;
+	aio_worker_t *aiowp;
+	int done = 0;
+	int canceled = 0;
+	struct stat buf;
+
+	if (fstat(fd, &buf) < 0)
+		return (-1);
+
+	if (aiocbp != NULL) {
+		if (fd != aiocbp->aio_fildes) {
+			errno = EINVAL;
+			return (-1);
+		}
+		if (aiocbp->aio_state == USERAIO) {
+			sig_mutex_lock(&__aio_mutex);
+			reqp = _aio_hash_find(&aiocbp->aio_resultp);
+			if (reqp == NULL) {
+				sig_mutex_unlock(&__aio_mutex);
+				return (AIO_ALLDONE);
+			}
+			aiowp = reqp->req_worker;
+			sig_mutex_lock(&aiowp->work_qlock1);
+			(void) _aio_cancel_req(aiowp, reqp, &canceled, &done);
+			sig_mutex_unlock(&aiowp->work_qlock1);
+			sig_mutex_unlock(&__aio_mutex);
+			if (done)
+				return (AIO_ALLDONE);
+			if (canceled)
+				return (AIO_CANCELED);
+			return (AIO_NOTCANCELED);
+		}
+		if (aiocbp->aio_state == USERAIO_DONE)
+			return (AIO_ALLDONE);
+		return ((int)_kaio(AIOCANCEL, fd, aiocbp));
+	}
+
+	return (aiocancel_all(fd));
+}
+
+/*
+ * __aio_waitn() cancellation handler.
+ */
+/* ARGSUSED */
+static void
+_aio_waitn_cleanup(void *arg)
+{
+	ASSERT(MUTEX_HELD(&__aio_mutex));
+
+	/* check for pending aio_waitn() calls */
+	_aio_flags &= ~(AIO_LIB_WAITN | AIO_WAIT_INPROGRESS | AIO_IO_WAITING);
+	if (_aio_flags & AIO_LIB_WAITN_PENDING) {
+		_aio_flags &= ~AIO_LIB_WAITN_PENDING;
+		(void) cond_signal(&_aio_waitn_cv);
+	}
+
+	sig_mutex_unlock(&__aio_mutex);
+}
+
+/*
+ * aio_waitn can be used to reap the results of several I/O operations that
+ * were submitted asynchronously.  The submission of I/Os can be done using
+ * existing POSIX interfaces: lio_listio, aio_write or aio_read.
+ * aio_waitn waits until "nwait" I/Os (supplied as a parameter) have
+ * completed and it returns the descriptors for these I/Os in "list".  The
+ * maximum size of this list is given by "nent" and the actual number of I/Os
+ * completed is returned in "nwait".  aio_waitn might also return early
+ * if the timeout expires.  Additionally, aio_waitn returns 0 if
+ * successful or -1 if an error occurred.
+ */
+static int
+__aio_waitn(void **list, uint_t nent, uint_t *nwait, const timespec_t *utimo)
+{
+	int error = 0;
+	uint_t dnwait = 0;	/* number of requests in the waitn-done list */
+	uint_t kwaitcnt;	/* expected "done" requests from kernel */
+	uint_t knentcnt;	/* max. expected "done" requests from kernel */
+	int uerrno = 0;
+	int kerrno = 0;		/* save errno from _kaio() call */
+	int timedwait = AIO_TIMEOUT_UNDEF;
+	aio_req_t *reqp;
+	timespec_t end;
+	timespec_t twait;	/* copy of utimo for internal calculations */
+	timespec_t *wait = NULL;
+
+	if (nent == 0 || *nwait == 0 || *nwait > nent) {
+		errno = EINVAL;
+		return (-1);
+	}
+
+	/*
+	 * Only one running aio_waitn call per process is allowed.
+	 * Further calls will be blocked here until the running
+	 * call finishes.
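+	 * A polling call (zero timeout) returns immediately with *nwait
+	 * set to 0; the others set AIO_LIB_WAITN_PENDING and sleep on
+	 * _aio_waitn_cv until the active call clears AIO_LIB_WAITN in
+	 * _aio_waitn_cleanup().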
+ */ + + sig_mutex_lock(&__aio_mutex); + + while (_aio_flags & AIO_LIB_WAITN) { + if (utimo && utimo->tv_sec == 0 && utimo->tv_nsec == 0) { + sig_mutex_unlock(&__aio_mutex); + *nwait = 0; + return (0); + } + _aio_flags |= AIO_LIB_WAITN_PENDING; + pthread_cleanup_push(sig_mutex_unlock, &__aio_mutex); + error = sig_cond_wait(&_aio_waitn_cv, &__aio_mutex); + pthread_cleanup_pop(0); + if (error != 0) { + sig_mutex_unlock(&__aio_mutex); + *nwait = 0; + errno = error; + return (-1); + } + } + + pthread_cleanup_push(_aio_waitn_cleanup, NULL); + + _aio_flags |= AIO_LIB_WAITN; + + if (*nwait >= AIO_WAITN_MAXIOCBS) { + if (_aio_check_timeout(utimo, &end, &timedwait) != 0) { + error = -1; + dnwait = 0; + goto out; + } + if (timedwait != AIO_TIMEOUT_INDEF) { + twait = *utimo; + wait = &twait; + } + } + + /* + * If both counters are still set to zero, then only + * kernel requests are currently outstanding (raw-I/Os). + */ + if ((_aio_doneq_cnt + _aio_outstand_cnt) == 0) { + for (;;) { + kwaitcnt = *nwait - dnwait; + knentcnt = nent - dnwait; + if (knentcnt > AIO_WAITN_MAXIOCBS) + knentcnt = AIO_WAITN_MAXIOCBS; + kwaitcnt = (kwaitcnt > knentcnt) ? knentcnt : kwaitcnt; + + pthread_cleanup_push(sig_mutex_lock, &__aio_mutex); + sig_mutex_unlock(&__aio_mutex); + _cancel_prologue(); + error = (int)_kaio(AIOWAITN, &list[dnwait], knentcnt, + &kwaitcnt, wait); + _cancel_epilogue(); + pthread_cleanup_pop(1); + + if (error == 0) { + dnwait += kwaitcnt; + if (dnwait >= *nwait || + *nwait < AIO_WAITN_MAXIOCBS) + break; + if (timedwait == AIO_TIMEOUT_WAIT) { + error = _aio_get_timedelta(&end, wait); + if (error == -1) { + /* timer expired */ + errno = ETIME; + break; + } + } + continue; + } + if (errno == EAGAIN) { + if (dnwait > 0) + error = 0; + break; + } + if (errno == ETIME || errno == EINTR) { + dnwait += kwaitcnt; + break; + } + /* fatal error */ + break; + } + + goto out; + } + + /* File system I/Os outstanding ... */ + + if (timedwait == AIO_TIMEOUT_UNDEF) { + if (_aio_check_timeout(utimo, &end, &timedwait) != 0) { + error = -1; + dnwait = 0; + goto out; + } + if (timedwait != AIO_TIMEOUT_INDEF) { + twait = *utimo; + wait = &twait; + } + } + + for (;;) { + uint_t sum_reqs; + + /* + * Calculate sum of active non RAW-IO requests (sum_reqs). + * If the expected amount of completed requests (*nwait) is + * greater than the calculated sum (sum_reqs) then + * use _kaio to check pending RAW-IO requests. + */ + sum_reqs = _aio_doneq_cnt + dnwait + _aio_outstand_cnt; + kwaitcnt = (*nwait > sum_reqs) ? *nwait - sum_reqs : 0; + + if (kwaitcnt != 0) { + /* possibly some kernel I/Os outstanding */ + knentcnt = nent - dnwait; + if (knentcnt > AIO_WAITN_MAXIOCBS) + knentcnt = AIO_WAITN_MAXIOCBS; + kwaitcnt = (kwaitcnt > knentcnt) ? 
knentcnt : kwaitcnt; + + _aio_flags |= AIO_WAIT_INPROGRESS; + + pthread_cleanup_push(sig_mutex_lock, &__aio_mutex); + sig_mutex_unlock(&__aio_mutex); + _cancel_prologue(); + error = (int)_kaio(AIOWAITN, &list[dnwait], knentcnt, + &kwaitcnt, wait); + _cancel_epilogue(); + pthread_cleanup_pop(1); + + _aio_flags &= ~AIO_WAIT_INPROGRESS; + + if (error == 0) { + dnwait += kwaitcnt; + } else { + switch (errno) { + case EINVAL: + case EAGAIN: + /* don't wait for kernel I/Os */ + kerrno = 0; /* ignore _kaio() errno */ + *nwait = _aio_doneq_cnt + + _aio_outstand_cnt + dnwait; + error = 0; + break; + case EINTR: + case ETIME: + /* just scan for completed LIB I/Os */ + dnwait += kwaitcnt; + timedwait = AIO_TIMEOUT_POLL; + kerrno = errno; /* save _kaio() errno */ + error = 0; + break; + default: + kerrno = errno; /* save _kaio() errno */ + break; + } + } + if (error) + break; /* fatal kernel error */ + } + + /* check completed FS requests in the "done" queue */ + + while (_aio_doneq_cnt && dnwait < nent) { + /* get done requests */ + if ((reqp = _aio_req_remove(NULL)) != NULL) { + (void) _aio_hash_del(reqp->req_resultp); + list[dnwait++] = reqp->req_aiocbp; + _aio_req_mark_done(reqp); + _lio_remove(reqp); + _aio_req_free(reqp); + } + } + + if (dnwait >= *nwait) { + /* min. requested amount of completed I/Os satisfied */ + break; + } + if (timedwait == AIO_TIMEOUT_WAIT && + (error = _aio_get_timedelta(&end, wait)) == -1) { + /* timer expired */ + uerrno = ETIME; + break; + } + + /* + * If some I/Os are outstanding and we have to wait for them, + * then sleep here. _aiodone() will call _aio_waitn_wakeup() + * to wakeup this thread as soon as the required amount of + * completed I/Os is done. + */ + if (_aio_outstand_cnt > 0 && timedwait != AIO_TIMEOUT_POLL) { + /* + * _aio_waitn_wakeup() will wake up this thread when: + * - _aio_waitncnt requests are completed or + * - _aio_outstand_cnt becomes zero. + * sig_cond_reltimedwait() could also return with + * a timeout error (ETIME). + */ + if (*nwait < _aio_outstand_cnt) + _aio_waitncnt = *nwait; + else + _aio_waitncnt = _aio_outstand_cnt; + + _aio_flags |= AIO_IO_WAITING; + + if (wait) + uerrno = sig_cond_reltimedwait(&_aio_iowait_cv, + &__aio_mutex, wait); + else + uerrno = sig_cond_wait(&_aio_iowait_cv, + &__aio_mutex); + + _aio_flags &= ~AIO_IO_WAITING; + + if (uerrno == ETIME) { + timedwait = AIO_TIMEOUT_POLL; + continue; + } + if (uerrno != 0) + timedwait = AIO_TIMEOUT_POLL; + } + + if (timedwait == AIO_TIMEOUT_POLL) { + /* polling or timer expired */ + break; + } + } + + errno = uerrno == 0 ? kerrno : uerrno; + if (errno) + error = -1; + else + error = 0; + +out: + *nwait = dnwait; + + pthread_cleanup_pop(1); /* drops __aio_mutex */ + + return (error); +} + +int +aio_waitn(aiocb_t *list[], uint_t nent, uint_t *nwait, + const timespec_t *timeout) +{ + return (__aio_waitn((void **)list, nent, nwait, timeout)); +} + +void +_aio_waitn_wakeup(void) +{ + /* + * __aio_waitn() sets AIO_IO_WAITING to notify _aiodone() that + * it is waiting for completed I/Os. The number of required + * completed I/Os is stored into "_aio_waitncnt". + * aio_waitn() is woken up when + * - there are no further outstanding I/Os + * (_aio_outstand_cnt == 0) or + * - the expected number of I/Os has completed. + * Only one __aio_waitn() function waits for completed I/Os at + * a time. + * + * __aio_suspend() increments "_aio_suscv_cnt" to notify + * _aiodone() that at least one __aio_suspend() call is + * waiting for completed I/Os. 
+ * There could be more than one __aio_suspend() function + * waiting for completed I/Os. Because every function should + * be waiting for different I/Os, _aiodone() has to wake up all + * __aio_suspend() functions each time. + * Every __aio_suspend() function will compare the recently + * completed I/O with its own list. + */ + ASSERT(MUTEX_HELD(&__aio_mutex)); + if (_aio_flags & AIO_IO_WAITING) { + if (_aio_waitncnt > 0) + _aio_waitncnt--; + if (_aio_outstand_cnt == 0 || _aio_waitncnt == 0 || + _aio_suscv_cnt > 0) + (void) cond_broadcast(&_aio_iowait_cv); + } else { + /* Wake up waiting aio_suspend calls */ + if (_aio_suscv_cnt > 0) + (void) cond_broadcast(&_aio_iowait_cv); + } +} + +/* + * timedwait values : + * AIO_TIMEOUT_POLL : polling + * AIO_TIMEOUT_WAIT : timeout + * AIO_TIMEOUT_INDEF : wait indefinitely + */ +static int +_aio_check_timeout(const timespec_t *utimo, timespec_t *end, int *timedwait) +{ + struct timeval curtime; + + if (utimo) { + if (utimo->tv_sec < 0 || utimo->tv_nsec < 0 || + utimo->tv_nsec >= NANOSEC) { + errno = EINVAL; + return (-1); + } + if (utimo->tv_sec > 0 || utimo->tv_nsec > 0) { + (void) gettimeofday(&curtime, NULL); + end->tv_sec = utimo->tv_sec + curtime.tv_sec; + end->tv_nsec = utimo->tv_nsec + 1000 * curtime.tv_usec; + if (end->tv_nsec >= NANOSEC) { + end->tv_nsec -= NANOSEC; + end->tv_sec += 1; + } + *timedwait = AIO_TIMEOUT_WAIT; + } else { + /* polling */ + *timedwait = AIO_TIMEOUT_POLL; + } + } else { + *timedwait = AIO_TIMEOUT_INDEF; /* wait indefinitely */ + } + return (0); +} + +#if !defined(_LP64) + +int +aio_read64(aiocb64_t *aiocbp) +{ + if (aiocbp == NULL || aiocbp->aio_reqprio < 0) { + errno = EINVAL; + return (-1); + } + if (_aio_hash_find(&aiocbp->aio_resultp) != NULL) { + errno = EBUSY; + return (-1); + } + if (_aio_sigev_thread64(aiocbp) != 0) + return (-1); + aiocbp->aio_lio_opcode = LIO_READ; + return (_aio_rw64(aiocbp, NULL, &__nextworker_rw, AIOAREAD64, + (AIO_KAIO | AIO_NO_DUPS))); +} + +int +aio_write64(aiocb64_t *aiocbp) +{ + if (aiocbp == NULL || aiocbp->aio_reqprio < 0) { + errno = EINVAL; + return (-1); + } + if (_aio_hash_find(&aiocbp->aio_resultp) != NULL) { + errno = EBUSY; + return (-1); + } + if (_aio_sigev_thread64(aiocbp) != 0) + return (-1); + aiocbp->aio_lio_opcode = LIO_WRITE; + return (_aio_rw64(aiocbp, NULL, &__nextworker_rw, AIOAWRITE64, + (AIO_KAIO | AIO_NO_DUPS))); +} + +int +lio_listio64(int mode, aiocb64_t *_RESTRICT_KYWD const *_RESTRICT_KYWD list, + int nent, struct sigevent *_RESTRICT_KYWD sigevp) +{ + int aio_ufs = 0; + int oerrno = 0; + aio_lio_t *head = NULL; + aiocb64_t *aiocbp; + int state = 0; + int EIOflg = 0; + int rw; + int do_kaio = 0; + int error; + int i; + + if (!_kaio_ok) + _kaio_init(); + + if (aio_list_max == 0) + aio_list_max = sysconf(_SC_AIO_LISTIO_MAX); + + if (nent <= 0 || nent > aio_list_max) { + errno = EINVAL; + return (-1); + } + + switch (mode) { + case LIO_WAIT: + state = NOCHECK; + break; + case LIO_NOWAIT: + state = CHECK; + break; + default: + errno = EINVAL; + return (-1); + } + + for (i = 0; i < nent; i++) { + if ((aiocbp = list[i]) == NULL) + continue; + if (_aio_hash_find(&aiocbp->aio_resultp) != NULL) { + errno = EBUSY; + return (-1); + } + if (_aio_sigev_thread64(aiocbp) != 0) + return (-1); + if (aiocbp->aio_lio_opcode == LIO_NOP) + aiocbp->aio_state = NOCHECK; + else { + aiocbp->aio_state = state; + if (KAIO_SUPPORTED(aiocbp->aio_fildes)) + do_kaio++; + else + aiocbp->aio_resultp.aio_errno = ENOTSUP; + } + } + if (_aio_sigev_thread_init(sigevp) != 0) + return (-1); + 
+ if (do_kaio) { + error = (int)_kaio(AIOLIO64, mode, list, nent, sigevp); + if (error == 0) + return (0); + oerrno = errno; + } else { + oerrno = errno = ENOTSUP; + error = -1; + } + + if (error == -1 && errno == ENOTSUP) { + error = errno = 0; + /* + * If LIO_WAIT, or notification required, allocate a list head. + */ + if (mode == LIO_WAIT || + (sigevp != NULL && + (sigevp->sigev_notify == SIGEV_SIGNAL || + sigevp->sigev_notify == SIGEV_THREAD || + sigevp->sigev_notify == SIGEV_PORT))) + head = _aio_lio_alloc(); + if (head) { + sig_mutex_lock(&head->lio_mutex); + head->lio_mode = mode; + head->lio_largefile = 1; + if (mode == LIO_NOWAIT && sigevp != NULL) { + if (sigevp->sigev_notify == SIGEV_THREAD) { + head->lio_port = sigevp->sigev_signo; + head->lio_event = AIOLIO64; + head->lio_sigevent = sigevp; + head->lio_sigval.sival_ptr = + sigevp->sigev_value.sival_ptr; + } else if (sigevp->sigev_notify == SIGEV_PORT) { + port_notify_t *pn = + sigevp->sigev_value.sival_ptr; + head->lio_port = pn->portnfy_port; + head->lio_event = AIOLIO64; + head->lio_sigevent = sigevp; + head->lio_sigval.sival_ptr = + pn->portnfy_user; + } else { /* SIGEV_SIGNAL */ + head->lio_signo = sigevp->sigev_signo; + head->lio_sigval.sival_ptr = + sigevp->sigev_value.sival_ptr; + } + } + head->lio_nent = head->lio_refcnt = nent; + sig_mutex_unlock(&head->lio_mutex); + } + /* + * find UFS requests, errno == ENOTSUP/EBADFD, + */ + for (i = 0; i < nent; i++) { + if ((aiocbp = list[i]) == NULL || + aiocbp->aio_lio_opcode == LIO_NOP || + (aiocbp->aio_resultp.aio_errno != ENOTSUP && + aiocbp->aio_resultp.aio_errno != EBADFD)) { + if (head) + _lio_list_decr(head); + continue; + } + if (aiocbp->aio_resultp.aio_errno == EBADFD) + SET_KAIO_NOT_SUPPORTED(aiocbp->aio_fildes); + if (aiocbp->aio_reqprio < 0) { + aiocbp->aio_resultp.aio_errno = EINVAL; + aiocbp->aio_resultp.aio_return = -1; + EIOflg = 1; + if (head) + _lio_list_decr(head); + continue; + } + /* + * submit an AIO request with flags AIO_NO_KAIO + * to avoid the kaio() syscall in _aio_rw() + */ + switch (aiocbp->aio_lio_opcode) { + case LIO_READ: + rw = AIOAREAD64; + break; + case LIO_WRITE: + rw = AIOAWRITE64; + break; + } + error = _aio_rw64(aiocbp, head, &__nextworker_rw, rw, + (AIO_NO_KAIO | AIO_NO_DUPS)); + if (error == 0) + aio_ufs++; + else { + if (head) + _lio_list_decr(head); + aiocbp->aio_resultp.aio_errno = error; + EIOflg = 1; + } + } + } + if (EIOflg) { + errno = EIO; + return (-1); + } + if (mode == LIO_WAIT && oerrno == ENOTSUP) { + /* + * call kaio(AIOLIOWAIT) to get all outstanding + * kernel AIO requests + */ + if ((nent - aio_ufs) > 0) + (void) _kaio(AIOLIOWAIT, mode, list, nent, sigevp); + if (head != NULL && head->lio_nent > 0) { + sig_mutex_lock(&head->lio_mutex); + while (head->lio_refcnt > 0) { + int err; + head->lio_waiting = 1; + pthread_cleanup_push(_lio_listio_cleanup, head); + err = sig_cond_wait(&head->lio_cond_cv, + &head->lio_mutex); + pthread_cleanup_pop(0); + head->lio_waiting = 0; + if (err && head->lio_nent > 0) { + sig_mutex_unlock(&head->lio_mutex); + errno = err; + return (-1); + } + } + sig_mutex_unlock(&head->lio_mutex); + ASSERT(head->lio_nent == 0 && head->lio_refcnt == 0); + _aio_lio_free(head); + for (i = 0; i < nent; i++) { + if ((aiocbp = list[i]) != NULL && + aiocbp->aio_resultp.aio_errno) { + errno = EIO; + return (-1); + } + } + } + return (0); + } + return (error); +} + +int +aio_suspend64(const aiocb64_t * const list[], int nent, + const timespec_t *timeout) +{ + return (__aio_suspend((void **)list, nent, timeout, 1)); 
+}
+
+int
+aio_error64(const aiocb64_t *aiocbp)
+{
+	const aio_result_t *resultp = &aiocbp->aio_resultp;
+	int error;
+
+	if ((error = resultp->aio_errno) == EINPROGRESS) {
+		if (aiocbp->aio_state == CHECK) {
+			/*
+			 * Always do the kaio() call without using the
+			 * KAIO_SUPPORTED() checks because it is not
+			 * mandatory to have a valid fd set in the
+			 * aiocb, only the resultp must be set.
+			 */
+			if ((int)_kaio(AIOERROR64, aiocbp) == EINVAL) {
+				errno = EINVAL;
+				return (-1);
+			}
+			error = resultp->aio_errno;
+		} else if (aiocbp->aio_state == CHECKED) {
+			((aiocb64_t *)aiocbp)->aio_state = CHECK;
+		}
+	}
+	return (error);
+}
+
+ssize_t
+aio_return64(aiocb64_t *aiocbp)
+{
+	aio_result_t *resultp = &aiocbp->aio_resultp;
+	aio_req_t *reqp;
+	int error;
+	ssize_t retval;
+
+	/*
+	 * The _aiodone() function stores resultp->aio_return before
+	 * storing resultp->aio_errno (with a membar_producer() in
+	 * between).  We use membar_consumer() below to ensure proper
+	 * memory ordering between _aiodone() and ourselves.
+	 */
+	error = resultp->aio_errno;
+	membar_consumer();
+	retval = resultp->aio_return;
+
+	/*
+	 * We use this condition to indicate either that aio_return()
+	 * has already been called or that it should not have been
+	 * called yet.
+	 */
+	if ((retval == -1 && error == EINVAL) || error == EINPROGRESS) {
+		errno = error;
+		return (-1);
+	}
+
+	/*
+	 * Before we return, mark the result as being returned so that later
+	 * calls to aio_return() will return the fact that the result has
+	 * already been returned.
+	 */
+	sig_mutex_lock(&__aio_mutex);
+	/* retest, in case more than one thread actually got in here */
+	if (resultp->aio_return == -1 && resultp->aio_errno == EINVAL) {
+		sig_mutex_unlock(&__aio_mutex);
+		errno = EINVAL;
+		return (-1);
+	}
+	resultp->aio_return = -1;
+	resultp->aio_errno = EINVAL;
+	if ((reqp = _aio_hash_del(resultp)) == NULL)
+		sig_mutex_unlock(&__aio_mutex);
+	else {
+		aiocbp->aio_state = NOCHECK;
+		ASSERT(reqp->req_head == NULL);
+		(void) _aio_req_remove(reqp);
+		sig_mutex_unlock(&__aio_mutex);
+		_aio_req_free(reqp);
+	}
+
+	if (retval == -1)
+		errno = error;
+	return (retval);
+}
+
+static int
+__aio_fsync_bar64(aiocb64_t *aiocbp, aio_lio_t *head, aio_worker_t *aiowp,
+	int workerscnt)
+{
+	int i;
+	int error;
+	aio_worker_t *next = aiowp;
+
+	for (i = 0; i < workerscnt; i++) {
+		error = _aio_rw64(aiocbp, head, &next, AIOFSYNC, AIO_NO_KAIO);
+		if (error != 0) {
+			sig_mutex_lock(&head->lio_mutex);
+			head->lio_mode = LIO_DESTROY;	/* ignore fsync */
+			head->lio_nent -= workerscnt - i;
+			head->lio_refcnt -= workerscnt - i;
+			sig_mutex_unlock(&head->lio_mutex);
+			errno = EAGAIN;
+			return (i);
+		}
+		next = next->work_forw;
+	}
+	return (i);
+}
+
+int
+aio_fsync64(int op, aiocb64_t *aiocbp)
+{
+	aio_lio_t *head;
+	struct stat statb;
+	int fret;
+
+	if (aiocbp == NULL)
+		return (0);
+	if (aiocbp->aio_reqprio < 0 || (op != O_DSYNC && op != O_SYNC)) {
+		errno = EINVAL;
+		return (-1);
+	}
+	if (_aio_hash_find(&aiocbp->aio_resultp) != NULL) {
+		errno = EBUSY;
+		return (-1);
+	}
+	if (fstat(aiocbp->aio_fildes, &statb) < 0)
+		return (-1);
+	if (_aio_sigev_thread64(aiocbp) != 0)
+		return (-1);
+
+	/*
+	 * Kernel aio_fsync() is not supported.
+	 * We force user-level aio_fsync() just
+	 * for the notification side-effect.
+	 */
+	if (!__uaio_ok && __uaio_init() == -1)
+		return (-1);
+
+	/*
+	 * The first asynchronous I/O request in the current process will
+	 * create a bunch of workers (via __uaio_init()).
If the number + * of workers is zero then the number of pending asynchronous I/O + * requests is zero. In such a case only execute the standard + * fsync(3C) or fdatasync(3RT) as appropriate. + */ + if (__rw_workerscnt == 0) { + if (op == O_DSYNC) + return (__fdsync(aiocbp->aio_fildes, FDSYNC)); + else + return (__fdsync(aiocbp->aio_fildes, FSYNC)); + } + + /* + * re-use aio_offset as the op field. + * O_DSYNC - fdatasync() + * O_SYNC - fsync() + */ + aiocbp->aio_offset = op; + aiocbp->aio_lio_opcode = AIOFSYNC; + + /* + * Create a list of fsync requests. The worker that + * gets the last request will do the fsync request. + */ + head = _aio_lio_alloc(); + if (head == NULL) { + errno = EAGAIN; + return (-1); + } + head->lio_mode = LIO_FSYNC; + head->lio_nent = head->lio_refcnt = __rw_workerscnt; + head->lio_largefile = 1; + + /* + * Insert an fsync request on every worker's queue. + */ + fret = __aio_fsync_bar64(aiocbp, head, __workers_rw, __rw_workerscnt); + if (fret != __rw_workerscnt) { + /* + * Fewer fsync requests than workers means that it was + * not possible to submit fsync requests to all workers. + * Actions: + * a) number of fsync requests submitted is 0: + * => free allocated memory (aio_lio_t). + * b) number of fsync requests submitted is > 0: + * => the last worker executing the fsync request + * will free the aio_lio_t struct. + */ + if (fret == 0) + _aio_lio_free(head); + return (-1); + } + return (0); +} + +int +aio_cancel64(int fd, aiocb64_t *aiocbp) +{ + aio_req_t *reqp; + aio_worker_t *aiowp; + int done = 0; + int canceled = 0; + struct stat buf; + + if (fstat(fd, &buf) < 0) + return (-1); + + if (aiocbp != NULL) { + if (fd != aiocbp->aio_fildes) { + errno = EINVAL; + return (-1); + } + if (aiocbp->aio_state == USERAIO) { + sig_mutex_lock(&__aio_mutex); + reqp = _aio_hash_find(&aiocbp->aio_resultp); + if (reqp == NULL) { + sig_mutex_unlock(&__aio_mutex); + return (AIO_ALLDONE); + } + aiowp = reqp->req_worker; + sig_mutex_lock(&aiowp->work_qlock1); + (void) _aio_cancel_req(aiowp, reqp, &canceled, &done); + sig_mutex_unlock(&aiowp->work_qlock1); + sig_mutex_unlock(&__aio_mutex); + if (done) + return (AIO_ALLDONE); + if (canceled) + return (AIO_CANCELED); + return (AIO_NOTCANCELED); + } + if (aiocbp->aio_state == USERAIO_DONE) + return (AIO_ALLDONE); + return ((int)_kaio(AIOCANCEL, fd, aiocbp)); + } + + return (aiocancel_all(fd)); +} + +int +aio_waitn64(aiocb64_t *list[], uint_t nent, uint_t *nwait, + const timespec_t *timeout) +{ + return (__aio_waitn((void **)list, nent, nwait, timeout)); +} + +#endif /* !defined(_LP64) */ diff --git a/usr/src/lib/libc/port/gen/event_port.c b/usr/src/lib/libc/port/gen/event_port.c index 84ade99164..f4eb057dec 100644 --- a/usr/src/lib/libc/port/gen/event_port.c +++ b/usr/src/lib/libc/port/gen/event_port.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,8 +18,9 @@ * * CDDL HEADER END */ + /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -34,7 +34,6 @@ #pragma weak port_get = _port_get #pragma weak port_getn = _port_getn #pragma weak port_alert = _port_alert -#pragma weak port_dispatch = _port_dispatch #include "lint.h" #include <sys/types.h> @@ -128,7 +127,7 @@ _port_send(int port, int events, void *user) } /* - * port_dispatch() will block if there are not resources available to + * _port_dispatch() will block if there are not resources available to * satisfy the request. */ diff --git a/usr/src/lib/libc/port/llib-lc b/usr/src/lib/libc/port/llib-lc index 0c213a116c..502aa4cb33 100644 --- a/usr/src/lib/libc/port/llib-lc +++ b/usr/src/lib/libc/port/llib-lc @@ -29,6 +29,7 @@ /* LINTLIBRARY */ /* PROTOLIB1 */ +#include <aio.h> #include <alloca.h> #include <atomic.h> #include <ctype.h> @@ -51,6 +52,7 @@ #include <locale.h> #include <memory.h> #include <mon.h> +#include <mqueue.h> #include <nan.h> #include <ndbm.h> #include <limits.h> @@ -61,7 +63,9 @@ #include <rctl.h> #include <regex.h> #include <rpcsvc/ypclnt.h> +#include <sched.h> #include <search.h> +#include <semaphore.h> #include <setjmp.h> #include <shadow.h> #include <siginfo.h> @@ -80,6 +84,7 @@ #include <synch.h> #include <sys/acctctl.h> #include <sys/acl.h> +#include <sys/asynch.h> #include <sys/byteorder.h> #include <sys/cladm.h> #include <sys/corectl.h> diff --git a/usr/src/lib/libc/port/rt/clock_timer.c b/usr/src/lib/libc/port/rt/clock_timer.c new file mode 100644 index 0000000000..8dfb35be91 --- /dev/null +++ b/usr/src/lib/libc/port/rt/clock_timer.c @@ -0,0 +1,179 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#pragma weak clock_getres = _clock_getres +#pragma weak clock_gettime = _clock_gettime +#pragma weak clock_settime = _clock_settime +#pragma weak timer_create = _timer_create +#pragma weak timer_delete = _timer_delete +#pragma weak timer_getoverrun = _timer_getoverrun +#pragma weak timer_gettime = _timer_gettime +#pragma weak timer_settime = _timer_settime + +#include "synonyms.h" +#include <time.h> +#include <sys/types.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include "sigev_thread.h" + +/* + * System call wrappers found elsewhere in libc (common/sys/__clock_timer.s). 
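+ * The _clock_*() and _timer_*() functions below are thin wrappers over
+ * these; _timer_create() additionally intercepts SIGEV_THREAD requests
+ * and redirects the notification to an event port so that a spawner
+ * thread (see "sigev_thread.h") can invoke the callback function.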
+ */ +extern int __clock_getres(clockid_t, timespec_t *); +extern int __clock_gettime(clockid_t, timespec_t *); +extern int __clock_settime(clockid_t, const timespec_t *); +extern int __timer_create(clockid_t, struct sigevent *, timer_t *); +extern int __timer_delete(timer_t); +extern int __timer_getoverrun(timer_t); +extern int __timer_gettime(timer_t, itimerspec_t *); +extern int __timer_settime(timer_t, int, const itimerspec_t *, itimerspec_t *); + +/* + * Array of pointers to tcd's, indexed by timer id. + * No more than 'timer_max' timers can be created by any process. + */ +int timer_max = 0; +thread_communication_data_t **timer_tcd; +static pthread_once_t timer_once = PTHREAD_ONCE_INIT; + +static void +timer_init(void) +{ + timer_max = (int)_sysconf(_SC_TIMER_MAX); + timer_tcd = malloc(timer_max * sizeof (*timer_tcd)); + (void) memset(timer_tcd, 0, timer_max * sizeof (*timer_tcd)); +} + +int +_clock_getres(clockid_t clock_id, timespec_t *res) +{ + return (__clock_getres(clock_id, res)); +} + +int +_clock_gettime(clockid_t clock_id, timespec_t *tp) +{ + return (__clock_gettime(clock_id, tp)); +} + +int +_clock_settime(clockid_t clock_id, const timespec_t *tp) +{ + return (__clock_settime(clock_id, tp)); +} + +int +_timer_create(clockid_t clock_id, struct sigevent *sigevp, timer_t *timerid) +{ + struct sigevent sigevent; + port_notify_t port_notify; + thread_communication_data_t *tcdp; + int sigev_thread = 0; + int rc; + + (void) pthread_once(&timer_once, timer_init); + + if (sigevp != NULL && + sigevp->sigev_notify == SIGEV_THREAD && + sigevp->sigev_notify_function != NULL) { + sigev_thread = 1; + tcdp = setup_sigev_handler(sigevp, TIMER); + if (tcdp == NULL) + return (-1); + /* copy the sigevent structure so we can modify it */ + sigevent = *sigevp; + sigevp = &sigevent; + port_notify.portnfy_port = tcdp->tcd_port; + port_notify.portnfy_user = NULL; + sigevp->sigev_value.sival_ptr = &port_notify; + } + + rc = __timer_create(clock_id, sigevp, timerid); + + if (sigev_thread) { + if (rc == 0) { + if ((rc = launch_spawner(tcdp)) != 0) + __timer_delete(*timerid); + else + timer_tcd[*timerid] = tcdp; + } + if (rc != 0) + free_sigev_handler(tcdp); + } + + return (rc); +} + +int +_timer_delete(timer_t timerid) +{ + int rc; + + if ((rc = del_sigev_timer(timerid)) == 0) + return (__timer_delete(timerid)); + else + return (rc); +} + +int +_timer_getoverrun(timer_t timerid) +{ + return (__timer_getoverrun(timerid) + sigev_timer_getoverrun(timerid)); +} + +int +_timer_gettime(timer_t timerid, itimerspec_t *value) +{ + return (__timer_gettime(timerid, value)); +} + +int +_timer_settime(timer_t timerid, int flags, const itimerspec_t *value, + itimerspec_t *ovalue) +{ + return (__timer_settime(timerid, flags, value, ovalue)); +} + +/* + * Cleanup after fork1() in the child process. + */ +void +postfork1_child_sigev_timer(void) +{ + thread_communication_data_t *tcdp; + int timer; + + for (timer = 0; timer < timer_max; timer++) { + if ((tcdp = timer_tcd[timer]) != NULL) { + timer_tcd[timer] = NULL; + tcd_teardown(tcdp); + } + } +} diff --git a/usr/src/lib/libc/port/rt/fallocate.c b/usr/src/lib/libc/port/rt/fallocate.c new file mode 100644 index 0000000000..17b9088052 --- /dev/null +++ b/usr/src/lib/libc/port/rt/fallocate.c @@ -0,0 +1,72 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include "synonyms.h" +#include <errno.h> +#include <fcntl.h> +#include <sys/types.h> + +#include <stdio.h> + +int +posix_fallocate(int fd, off_t offset, off_t len) +{ + struct flock lck; + + lck.l_whence = 0; + lck.l_start = offset; + lck.l_len = len; + lck.l_type = F_WRLCK; + + if (fcntl(fd, F_ALLOCSP, &lck) == -1) { + return (-1); + } + + return (0); +} + +#if !defined(_LP64) + +int +posix_fallocate64(int fd, off64_t offset, off64_t len) +{ + struct flock64 lck; + + lck.l_whence = 0; + lck.l_start = offset; + lck.l_len = len; + lck.l_type = F_WRLCK; + + if (fcntl(fd, F_ALLOCSP64, &lck) == -1) { + return (-1); + } + + return (0); +} + +#endif diff --git a/usr/src/lib/libc/port/rt/mqueue.c b/usr/src/lib/libc/port/rt/mqueue.c new file mode 100644 index 0000000000..ebab58a259 --- /dev/null +++ b/usr/src/lib/libc/port/rt/mqueue.c @@ -0,0 +1,1101 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#pragma weak mq_open = _mq_open +#pragma weak mq_close = _mq_close +#pragma weak mq_unlink = _mq_unlink +#pragma weak mq_send = _mq_send +#pragma weak mq_timedsend = _mq_timedsend +#pragma weak mq_reltimedsend_np = _mq_reltimedsend_np +#pragma weak mq_receive = _mq_receive +#pragma weak mq_timedreceive = _mq_timedreceive +#pragma weak mq_reltimedreceive_np = _mq_reltimedreceive_np +#pragma weak mq_notify = _mq_notify +#pragma weak mq_setattr = _mq_setattr +#pragma weak mq_getattr = _mq_getattr + +#include "synonyms.h" +#include "mtlib.h" +#define _KMEMUSER +#include <sys/param.h> /* _MQ_OPEN_MAX, _MQ_PRIO_MAX, _SEM_VALUE_MAX */ +#undef _KMEMUSER +#include <mqueue.h> +#include <sys/types.h> +#include <sys/file.h> +#include <sys/mman.h> +#include <errno.h> +#include <stdarg.h> +#include <limits.h> +#include <pthread.h> +#include <assert.h> +#include <string.h> +#include <unistd.h> +#include <stdlib.h> +#include <sys/stat.h> +#include <inttypes.h> +#include "sigev_thread.h" +#include "pos4obj.h" + +/* + * Default values per message queue + */ +#define MQ_MAXMSG 128 +#define MQ_MAXSIZE 1024 + +#define MQ_MAGIC 0x4d534751 /* "MSGQ" */ + +/* + * Message header which is part of messages in link list + */ +typedef struct { + uint64_t msg_next; /* offset of next message in the link */ + uint64_t msg_len; /* length of the message */ +} msghdr_t; + +/* + * message queue description + */ +struct mq_dn { + size_t mqdn_flags; /* open description flags */ +}; + +/* + * message queue descriptor structure + */ +typedef struct mq_des { + struct mq_des *mqd_next; /* list of all open mq descriptors, */ + struct mq_des *mqd_prev; /* needed for fork-safety */ + int mqd_magic; /* magic # to identify mq_des */ + int mqd_flags; /* operation flag per open */ + struct mq_header *mqd_mq; /* address pointer of message Q */ + struct mq_dn *mqd_mqdn; /* open description */ + thread_communication_data_t *mqd_tcd; /* SIGEV_THREAD notification */ +} mqdes_t; + +/* + * message queue common header, part of the mmap()ed file. + * Since message queues may be shared between 32- and 64-bit processes, + * care must be taken to make sure that the elements of this structure + * are identical for both _LP64 and _ILP32 cases. + */ +typedef struct mq_header { + /* first field must be mq_totsize, DO NOT insert before this */ + int64_t mq_totsize; /* total size of the Queue */ + int64_t mq_maxsz; /* max size of each message */ + uint32_t mq_maxmsg; /* max messages in the queue */ + uint32_t mq_maxprio; /* maximum mqueue priority */ + uint32_t mq_curmaxprio; /* current maximum MQ priority */ + uint32_t mq_mask; /* priority bitmask */ + uint64_t mq_freep; /* free message's head pointer */ + uint64_t mq_headpp; /* pointer to head pointers */ + uint64_t mq_tailpp; /* pointer to tail pointers */ + signotify_id_t mq_sigid; /* notification id (3 int's) */ + uint32_t mq_ntype; /* notification type (SIGEV_*) */ + uint64_t mq_des; /* pointer to msg Q descriptor */ + mutex_t mq_exclusive; /* acquire for exclusive access */ + sem_t mq_rblocked; /* number of processes rblocked */ + sem_t mq_notfull; /* mq_send()'s block on this */ + sem_t mq_notempty; /* mq_receive()'s block on this */ + sem_t mq_spawner; /* spawner thread blocks on this */ +} mqhdr_t; + +/* + * The code assumes that _MQ_OPEN_MAX == -1 or "no fixed implementation limit". + * If this assumption is somehow invalidated, mq_open() needs to be changed + * back to the old version which kept a count and enforced a limit. 
+ * We make sure that this is pointed out to those changing <sys/param.h>
+ * by checking _MQ_OPEN_MAX at compile time.
+ */
+#if _MQ_OPEN_MAX != -1
+#error "mq_open() no longer enforces _MQ_OPEN_MAX and needs fixing."
+#endif
+
+#define	MQ_ALIGNSIZE	8	/* 64-bit alignment */
+
+#ifdef DEBUG
+#define	MQ_ASSERT(x)	assert(x);
+
+#define	MQ_ASSERT_PTR(_m, _p) \
+	assert((_p) != NULL && !((uintptr_t)(_p) & (MQ_ALIGNSIZE -1)) && \
+	    !((uintptr_t)_m + (uintptr_t)(_p) >= (uintptr_t)_m + \
+	    _m->mq_totsize));
+
+#define	MQ_ASSERT_SEMVAL_LEQ(sem, val) { \
+	int _val; \
+	(void) sem_getvalue((sem), &_val); \
+	assert((_val) <= val); }
+#else
+#define	MQ_ASSERT(x)
+#define	MQ_ASSERT_PTR(_m, _p)
+#define	MQ_ASSERT_SEMVAL_LEQ(sem, val)
+#endif
+
+#define	MQ_PTR(m, n)	((msghdr_t *)((uintptr_t)m + (uintptr_t)n))
+#define	HEAD_PTR(m, n)	((uint64_t *)((uintptr_t)m + \
+			(uintptr_t)m->mq_headpp + n * sizeof (uint64_t)))
+#define	TAIL_PTR(m, n)	((uint64_t *)((uintptr_t)m + \
+			(uintptr_t)m->mq_tailpp + n * sizeof (uint64_t)))
+
+#define	MQ_RESERVED	((mqdes_t *)-1)
+
+#define	ABS_TIME	0
+#define	REL_TIME	1
+
+static mutex_t mq_list_lock = DEFAULTMUTEX;
+static mqdes_t *mq_list = NULL;
+
+extern int __signotify(int cmd, siginfo_t *siginfo, signotify_id_t *sn_id);
+
+static int
+mq_is_valid(mqdes_t *mqdp)
+{
+	/*
+	 * Any use of a message queue after it was closed is
+	 * undefined.  But the standard strongly favours EBADF
+	 * returns.  Before we dereference it, which could be fatal,
+	 * we first do some pointer sanity checks.
+	 */
+	if (mqdp != NULL && mqdp != MQ_RESERVED &&
+	    ((uintptr_t)mqdp & 0x7) == 0) {
+		return (mqdp->mqd_magic == MQ_MAGIC);
+	}
+
+	return (0);
+}
+
+static void
+mq_init(mqhdr_t *mqhp, size_t msgsize, ssize_t maxmsg)
+{
+	int		i;
+	uint64_t	temp;
+	uint64_t	currentp;
+	uint64_t	nextp;
+
+	/*
+	 * We only need to initialize the non-zero fields.  The use of
+	 * ftruncate() on the message queue file assures that the
+	 * pages will be zfod.
+	 */
+	(void) mutex_init(&mqhp->mq_exclusive, USYNC_PROCESS, NULL);
+	(void) sem_init(&mqhp->mq_rblocked, 1, 0);
+	(void) sem_init(&mqhp->mq_notempty, 1, 0);
+	(void) sem_init(&mqhp->mq_spawner, 1, 0);
+	(void) sem_init(&mqhp->mq_notfull, 1, (uint_t)maxmsg);
+
+	mqhp->mq_maxsz = msgsize;
+	mqhp->mq_maxmsg = maxmsg;
+
+	/*
+	 * As of this writing (1997), there are 32 message queue priorities.
+	 * If this is to change, then the size of the mq_mask will
+	 * also have to change.  If DEBUG is defined, assert that
+	 * _MQ_PRIO_MAX hasn't changed.
+	 */
+	mqhp->mq_maxprio = _MQ_PRIO_MAX;
+#if defined(DEBUG)
+	/* LINTED always true */
+	MQ_ASSERT(sizeof (mqhp->mq_mask) * 8 >= _MQ_PRIO_MAX);
+#endif
+
+	/*
+	 * Since the message queue can be mapped into different
+	 * virtual address ranges by different processes, we don't
+	 * keep track of pointers, only offsets into the shared region.
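+	 * The resulting layout is: the _MQ_PRIO_MAX head pointers
+	 * directly after the header (at mq_headpp), the tail pointers
+	 * after those (at mq_tailpp), and the message buffers themselves
+	 * after that (starting at mq_freep); MQ_PTR() converts any such
+	 * offset back into an address in the current mapping.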
+ */ + mqhp->mq_headpp = sizeof (mqhdr_t); + mqhp->mq_tailpp = mqhp->mq_headpp + + mqhp->mq_maxprio * sizeof (uint64_t); + mqhp->mq_freep = mqhp->mq_tailpp + + mqhp->mq_maxprio * sizeof (uint64_t); + + currentp = mqhp->mq_freep; + MQ_PTR(mqhp, currentp)->msg_next = 0; + + temp = (mqhp->mq_maxsz + MQ_ALIGNSIZE - 1) & ~(MQ_ALIGNSIZE - 1); + for (i = 1; i < mqhp->mq_maxmsg; i++) { + nextp = currentp + sizeof (msghdr_t) + temp; + MQ_PTR(mqhp, currentp)->msg_next = nextp; + MQ_PTR(mqhp, nextp)->msg_next = 0; + currentp = nextp; + } +} + +static size_t +mq_getmsg(mqhdr_t *mqhp, char *msgp, uint_t *msg_prio) +{ + uint64_t currentp; + msghdr_t *curbuf; + uint64_t *headpp; + uint64_t *tailpp; + + MQ_ASSERT(MUTEX_HELD(&mqhp->mq_exclusive)); + + /* + * Get the head and tail pointers for the queue of maximum + * priority. We shouldn't be here unless there is a message for + * us, so it's fair to assert that both the head and tail + * pointers are non-NULL. + */ + headpp = HEAD_PTR(mqhp, mqhp->mq_curmaxprio); + tailpp = TAIL_PTR(mqhp, mqhp->mq_curmaxprio); + + if (msg_prio != NULL) + *msg_prio = mqhp->mq_curmaxprio; + + currentp = *headpp; + MQ_ASSERT_PTR(mqhp, currentp); + curbuf = MQ_PTR(mqhp, currentp); + + if ((*headpp = curbuf->msg_next) == NULL) { + /* + * We just nuked the last message in this priority's queue. + * Twiddle this priority's bit, and then find the next bit + * tipped. + */ + uint_t prio = mqhp->mq_curmaxprio; + + mqhp->mq_mask &= ~(1u << prio); + + for (; prio != 0; prio--) + if (mqhp->mq_mask & (1u << prio)) + break; + mqhp->mq_curmaxprio = prio; + + *tailpp = NULL; + } + + /* + * Copy the message, and put the buffer back on the free list. + */ + (void) memcpy(msgp, (char *)&curbuf[1], curbuf->msg_len); + curbuf->msg_next = mqhp->mq_freep; + mqhp->mq_freep = currentp; + + return (curbuf->msg_len); +} + + +static void +mq_putmsg(mqhdr_t *mqhp, const char *msgp, ssize_t len, uint_t prio) +{ + uint64_t currentp; + msghdr_t *curbuf; + uint64_t *headpp; + uint64_t *tailpp; + + MQ_ASSERT(MUTEX_HELD(&mqhp->mq_exclusive)); + + /* + * Grab a free message block, and link it in. We shouldn't + * be here unless there is room in the queue for us; it's + * fair to assert that the free pointer is non-NULL. + */ + currentp = mqhp->mq_freep; + MQ_ASSERT_PTR(mqhp, currentp); + curbuf = MQ_PTR(mqhp, currentp); + + /* + * Remove a message from the free list, and copy in the new contents. + */ + mqhp->mq_freep = curbuf->msg_next; + curbuf->msg_next = NULL; + (void) memcpy((char *)&curbuf[1], msgp, len); + curbuf->msg_len = len; + + headpp = HEAD_PTR(mqhp, prio); + tailpp = TAIL_PTR(mqhp, prio); + + if (*tailpp == 0) { + /* + * This is the first message on this queue. Set the + * head and tail pointers, and tip the appropriate bit + * in the priority mask. + */ + *headpp = currentp; + *tailpp = currentp; + mqhp->mq_mask |= (1u << prio); + if (prio > mqhp->mq_curmaxprio) + mqhp->mq_curmaxprio = prio; + } else { + MQ_ASSERT_PTR(mqhp, *tailpp); + MQ_PTR(mqhp, *tailpp)->msg_next = currentp; + *tailpp = currentp; + } +} + +mqd_t +_mq_open(const char *path, int oflag, /* mode_t mode, mq_attr *attr */ ...) 
+{ + va_list ap; + mode_t mode; + struct mq_attr *attr; + int fd; + int err; + int cr_flag = 0; + int locked = 0; + uint64_t total_size; + size_t msgsize; + ssize_t maxmsg; + uint64_t temp; + void *ptr; + mqdes_t *mqdp; + mqhdr_t *mqhp; + struct mq_dn *mqdnp; + + if (__pos4obj_check(path) == -1) + return ((mqd_t)-1); + + /* acquire MSGQ lock to have atomic operation */ + if (__pos4obj_lock(path, MQ_LOCK_TYPE) < 0) + goto out; + locked = 1; + + va_start(ap, oflag); + /* filter oflag to have READ/WRITE/CREATE modes only */ + oflag = oflag & (O_RDONLY|O_WRONLY|O_RDWR|O_CREAT|O_EXCL|O_NONBLOCK); + if ((oflag & O_CREAT) != 0) { + mode = va_arg(ap, mode_t); + attr = va_arg(ap, struct mq_attr *); + } + va_end(ap); + + if ((fd = __pos4obj_open(path, MQ_PERM_TYPE, oflag, + mode, &cr_flag)) < 0) + goto out; + + /* closing permission file */ + (void) __close_nc(fd); + + /* Try to open/create data file */ + if (cr_flag) { + cr_flag = PFILE_CREATE; + if (attr == NULL) { + maxmsg = MQ_MAXMSG; + msgsize = MQ_MAXSIZE; + } else if (attr->mq_maxmsg <= 0 || attr->mq_msgsize <= 0) { + errno = EINVAL; + goto out; + } else if (attr->mq_maxmsg > _SEM_VALUE_MAX) { + errno = ENOSPC; + goto out; + } else { + maxmsg = attr->mq_maxmsg; + msgsize = attr->mq_msgsize; + } + + /* adjust for message size at word boundary */ + temp = (msgsize + MQ_ALIGNSIZE - 1) & ~(MQ_ALIGNSIZE - 1); + + total_size = sizeof (mqhdr_t) + + maxmsg * (temp + sizeof (msghdr_t)) + + 2 * _MQ_PRIO_MAX * sizeof (uint64_t); + + if (total_size > SSIZE_MAX) { + errno = ENOSPC; + goto out; + } + + /* + * data file is opened with read/write to those + * who have read or write permission + */ + mode = mode | (mode & 0444) >> 1 | (mode & 0222) << 1; + if ((fd = __pos4obj_open(path, MQ_DATA_TYPE, + (O_RDWR|O_CREAT|O_EXCL), mode, &err)) < 0) + goto out; + + cr_flag |= DFILE_CREATE | DFILE_OPEN; + + /* force permissions to avoid umask effect */ + if (fchmod(fd, mode) < 0) + goto out; + + if (ftruncate64(fd, (off64_t)total_size) < 0) + goto out; + } else { + if ((fd = __pos4obj_open(path, MQ_DATA_TYPE, + O_RDWR, 0666, &err)) < 0) + goto out; + cr_flag = DFILE_OPEN; + + /* Message queue has not been initialized yet */ + if (read(fd, &total_size, sizeof (total_size)) != + sizeof (total_size) || total_size == 0) { + errno = ENOENT; + goto out; + } + + /* Message queue too big for this process to handle */ + if (total_size > SSIZE_MAX) { + errno = EFBIG; + goto out; + } + } + + if ((mqdp = (mqdes_t *)malloc(sizeof (mqdes_t))) == NULL) { + errno = ENOMEM; + goto out; + } + cr_flag |= ALLOC_MEM; + + if ((ptr = mmap64(NULL, total_size, PROT_READ|PROT_WRITE, + MAP_SHARED, fd, (off64_t)0)) == MAP_FAILED) + goto out; + mqhp = ptr; + cr_flag |= DFILE_MMAP; + + /* closing data file */ + (void) __close_nc(fd); + cr_flag &= ~DFILE_OPEN; + + /* + * create, unlink, size, mmap, and close description file + * all for a flag word in anonymous shared memory + */ + if ((fd = __pos4obj_open(path, MQ_DSCN_TYPE, O_RDWR | O_CREAT, + 0666, &err)) < 0) + goto out; + cr_flag |= DFILE_OPEN; + (void) __pos4obj_unlink(path, MQ_DSCN_TYPE); + if (ftruncate64(fd, (off64_t)sizeof (struct mq_dn)) < 0) + goto out; + + if ((ptr = mmap64(NULL, sizeof (struct mq_dn), + PROT_READ | PROT_WRITE, MAP_SHARED, fd, (off64_t)0)) == MAP_FAILED) + goto out; + mqdnp = ptr; + cr_flag |= MQDNP_MMAP; + + (void) __close_nc(fd); + cr_flag &= ~DFILE_OPEN; + + /* + * we follow the same strategy as filesystem open() routine, + * where fcntl.h flags are changed to flags defined in file.h. 
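+	 * For example, with the historical <sys/file.h> values (where
+	 * FOPEN acts as -1, FREAD is 0x1, and FWRITE is 0x2):
+	 *	O_RDONLY (0) - FOPEN == FREAD
+	 *	O_WRONLY (1) - FOPEN == FWRITE
+	 *	O_RDWR   (2) - FOPEN == FREAD|FWRITE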
+ */ + mqdp->mqd_flags = (oflag - FOPEN) & (FREAD|FWRITE); + mqdnp->mqdn_flags = (oflag - FOPEN) & (FNONBLOCK); + + /* new message queue requires initialization */ + if ((cr_flag & DFILE_CREATE) != 0) { + /* message queue header has to be initialized */ + mq_init(mqhp, msgsize, maxmsg); + mqhp->mq_totsize = total_size; + } + mqdp->mqd_mq = mqhp; + mqdp->mqd_mqdn = mqdnp; + mqdp->mqd_magic = MQ_MAGIC; + mqdp->mqd_tcd = NULL; + if (__pos4obj_unlock(path, MQ_LOCK_TYPE) == 0) { + lmutex_lock(&mq_list_lock); + mqdp->mqd_next = mq_list; + mqdp->mqd_prev = NULL; + if (mq_list) + mq_list->mqd_prev = mqdp; + mq_list = mqdp; + lmutex_unlock(&mq_list_lock); + return ((mqd_t)mqdp); + } + + locked = 0; /* fall into the error case */ +out: + err = errno; + if ((cr_flag & DFILE_OPEN) != 0) + (void) __close_nc(fd); + if ((cr_flag & DFILE_CREATE) != 0) + (void) __pos4obj_unlink(path, MQ_DATA_TYPE); + if ((cr_flag & PFILE_CREATE) != 0) + (void) __pos4obj_unlink(path, MQ_PERM_TYPE); + if ((cr_flag & ALLOC_MEM) != 0) + free((void *)mqdp); + if ((cr_flag & DFILE_MMAP) != 0) + (void) munmap((caddr_t)mqhp, (size_t)total_size); + if ((cr_flag & MQDNP_MMAP) != 0) + (void) munmap((caddr_t)mqdnp, sizeof (struct mq_dn)); + if (locked) + (void) __pos4obj_unlock(path, MQ_LOCK_TYPE); + errno = err; + return ((mqd_t)-1); +} + +static void +mq_close_cleanup(mqdes_t *mqdp) +{ + mqhdr_t *mqhp = mqdp->mqd_mq; + struct mq_dn *mqdnp = mqdp->mqd_mqdn; + + /* invalidate the descriptor before freeing it */ + mqdp->mqd_magic = 0; + (void) mutex_unlock(&mqhp->mq_exclusive); + + lmutex_lock(&mq_list_lock); + if (mqdp->mqd_next) + mqdp->mqd_next->mqd_prev = mqdp->mqd_prev; + if (mqdp->mqd_prev) + mqdp->mqd_prev->mqd_next = mqdp->mqd_next; + if (mq_list == mqdp) + mq_list = mqdp->mqd_next; + lmutex_unlock(&mq_list_lock); + + free(mqdp); + (void) munmap((caddr_t)mqdnp, sizeof (struct mq_dn)); + (void) munmap((caddr_t)mqhp, (size_t)mqhp->mq_totsize); +} + +int +_mq_close(mqd_t mqdes) +{ + mqdes_t *mqdp = (mqdes_t *)mqdes; + mqhdr_t *mqhp; + thread_communication_data_t *tcdp; + + if (!mq_is_valid(mqdp)) { + errno = EBADF; + return (-1); + } + + mqhp = mqdp->mqd_mq; + (void) mutex_lock(&mqhp->mq_exclusive); + + if (mqhp->mq_des == (uintptr_t)mqdp && + mqhp->mq_sigid.sn_pid == getpid()) { + /* notification is set for this descriptor, remove it */ + (void) __signotify(SN_CANCEL, NULL, &mqhp->mq_sigid); + mqhp->mq_ntype = 0; + mqhp->mq_des = 0; + } + + pthread_cleanup_push(mq_close_cleanup, mqdp); + if ((tcdp = mqdp->mqd_tcd) != NULL) { + mqdp->mqd_tcd = NULL; + del_sigev_mq(tcdp); /* possible cancellation point */ + } + pthread_cleanup_pop(1); /* finish in the cleanup handler */ + + return (0); +} + +int +_mq_unlink(const char *path) +{ + int err; + + if (__pos4obj_check(path) < 0) + return (-1); + + if (__pos4obj_lock(path, MQ_LOCK_TYPE) < 0) { + return (-1); + } + + err = __pos4obj_unlink(path, MQ_PERM_TYPE); + + if (err == 0 || (err == -1 && errno == EEXIST)) { + errno = 0; + err = __pos4obj_unlink(path, MQ_DATA_TYPE); + } + + if (__pos4obj_unlock(path, MQ_LOCK_TYPE) < 0) + return (-1); + + return (err); + +} + +static int +__mq_timedsend(mqd_t mqdes, const char *msg_ptr, size_t msg_len, + uint_t msg_prio, const timespec_t *timeout, int abs_rel) +{ + mqdes_t *mqdp = (mqdes_t *)mqdes; + mqhdr_t *mqhp; + int err; + int notify = 0; + + /* + * sem_*wait() does cancellation, if called. + * pthread_testcancel() ensures that cancellation takes place if + * there is a cancellation pending when mq_*send() is called. 
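+	 *
+	 * A caller wanting a bounded wait would typically use the
+	 * absolute form with CLOCK_REALTIME; a minimal sketch
+	 * (names hypothetical):
+	 *	struct timespec ts;
+	 *	(void) clock_gettime(CLOCK_REALTIME, &ts);
+	 *	ts.tv_sec += 5;		(give up after ~5 seconds)
+	 *	if (mq_timedsend(mqd, buf, len, prio, &ts) == -1 &&
+	 *	    errno == ETIMEDOUT)
+	 *		... the queue stayed full the whole time ...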
+ */ + pthread_testcancel(); + + if (!mq_is_valid(mqdp) || (mqdp->mqd_flags & FWRITE) == 0) { + errno = EBADF; + return (-1); + } + + mqhp = mqdp->mqd_mq; + + if (msg_prio >= mqhp->mq_maxprio) { + errno = EINVAL; + return (-1); + } + if (msg_len > mqhp->mq_maxsz) { + errno = EMSGSIZE; + return (-1); + } + + if (mqdp->mqd_mqdn->mqdn_flags & O_NONBLOCK) + err = sem_trywait(&mqhp->mq_notfull); + else { + /* + * We might get cancelled here... + */ + if (timeout == NULL) + err = sem_wait(&mqhp->mq_notfull); + else if (abs_rel == ABS_TIME) + err = sem_timedwait(&mqhp->mq_notfull, timeout); + else + err = sem_reltimedwait_np(&mqhp->mq_notfull, timeout); + } + if (err == -1) { + /* + * errno has been set to EAGAIN / EINTR / ETIMEDOUT + * by sem_*wait(), so we can just return. + */ + return (-1); + } + + /* + * By the time we're here, we know that we've got the capacity + * to add to the queue...now acquire the exclusive lock. + */ + (void) mutex_lock(&mqhp->mq_exclusive); + + /* + * Now determine if we want to kick the notification. POSIX + * requires that if a process has registered for notification, + * we must kick it when the queue makes an empty to non-empty + * transition, and there are no blocked receivers. Note that + * this mechanism does _not_ guarantee that the kicked process + * will be able to receive a message without blocking; + * another receiver could intervene in the meantime. Thus, + * the notification mechanism is inherently racy; all we can + * do is hope to minimize the window as much as possible. + * In general, we want to avoid kicking the notification when + * there are clearly receivers blocked. We'll determine if + * we want to kick the notification before the mq_putmsg(), + * but the actual signotify() won't be done until the message + * is on the queue. 
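+	 * (For example, another receiver may consume the message as
+	 * soon as mq_exclusive is dropped, before the notified process
+	 * gets to run; a notified process should therefore receive
+	 * with O_NONBLOCK set rather than assume a message is still
+	 * waiting.)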
+ */ + if (mqhp->mq_sigid.sn_pid != 0) { + int nmessages, nblocked; + + (void) sem_getvalue(&mqhp->mq_notempty, &nmessages); + (void) sem_getvalue(&mqhp->mq_rblocked, &nblocked); + + if (nmessages == 0 && nblocked == 0) + notify = 1; + } + + mq_putmsg(mqhp, msg_ptr, (ssize_t)msg_len, msg_prio); + (void) sem_post(&mqhp->mq_notempty); + + if (notify) { + /* notify and also delete the registration */ + (void) __signotify(SN_SEND, NULL, &mqhp->mq_sigid); + if (mqhp->mq_ntype == SIGEV_THREAD || + mqhp->mq_ntype == SIGEV_PORT) + (void) sem_post(&mqhp->mq_spawner); + mqhp->mq_ntype = 0; + mqhp->mq_des = 0; + } + + MQ_ASSERT_SEMVAL_LEQ(&mqhp->mq_notempty, ((int)mqhp->mq_maxmsg)); + (void) mutex_unlock(&mqhp->mq_exclusive); + + return (0); +} + +int +_mq_send(mqd_t mqdes, const char *msg_ptr, size_t msg_len, uint_t msg_prio) +{ + return (__mq_timedsend(mqdes, msg_ptr, msg_len, msg_prio, + NULL, ABS_TIME)); +} + +int +_mq_timedsend(mqd_t mqdes, const char *msg_ptr, size_t msg_len, + uint_t msg_prio, const timespec_t *abs_timeout) +{ + return (__mq_timedsend(mqdes, msg_ptr, msg_len, msg_prio, + abs_timeout, ABS_TIME)); +} + +int +_mq_reltimedsend_np(mqd_t mqdes, const char *msg_ptr, size_t msg_len, + uint_t msg_prio, const timespec_t *rel_timeout) +{ + return (__mq_timedsend(mqdes, msg_ptr, msg_len, msg_prio, + rel_timeout, REL_TIME)); +} + +static void +decrement_rblocked(mqhdr_t *mqhp) +{ + int canstate; + + (void) pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &canstate); + while (sem_wait(&mqhp->mq_rblocked) == -1) + continue; + (void) pthread_setcancelstate(canstate, NULL); +} + +static ssize_t +__mq_timedreceive(mqd_t mqdes, char *msg_ptr, size_t msg_len, + uint_t *msg_prio, const timespec_t *timeout, int abs_rel) +{ + mqdes_t *mqdp = (mqdes_t *)mqdes; + mqhdr_t *mqhp; + ssize_t msg_size; + int err; + + /* + * sem_*wait() does cancellation, if called. + * pthread_testcancel() ensures that cancellation takes place if + * there is a cancellation pending when mq_*receive() is called. + */ + pthread_testcancel(); + + if (!mq_is_valid(mqdp) || (mqdp->mqd_flags & FREAD) == 0) { + errno = EBADF; + return (ssize_t)(-1); + } + + mqhp = mqdp->mqd_mq; + + if (msg_len < mqhp->mq_maxsz) { + errno = EMSGSIZE; + return (ssize_t)(-1); + } + + /* + * The semaphoring scheme for mq_[timed]receive is a little hairy + * thanks to POSIX.1b's arcane notification mechanism. First, + * we try to take the common case and do a sem_trywait(). + * If that doesn't work, and O_NONBLOCK hasn't been set, + * then note that we're going to sleep by incrementing the rblocked + * semaphore. We decrement that semaphore after waking up. + */ + if (sem_trywait(&mqhp->mq_notempty) == -1) { + if ((mqdp->mqd_mqdn->mqdn_flags & O_NONBLOCK) != 0) { + /* + * errno has been set to EAGAIN or EINTR by + * sem_trywait(), so we can just return. + */ + return (-1); + } + /* + * If we're here, then we're probably going to block... + * increment the rblocked semaphore. If we get + * cancelled, decrement_rblocked() will decrement it. + */ + (void) sem_post(&mqhp->mq_rblocked); + + pthread_cleanup_push(decrement_rblocked, mqhp); + if (timeout == NULL) + err = sem_wait(&mqhp->mq_notempty); + else if (abs_rel == ABS_TIME) + err = sem_timedwait(&mqhp->mq_notempty, timeout); + else + err = sem_reltimedwait_np(&mqhp->mq_notempty, timeout); + pthread_cleanup_pop(1); + + if (err == -1) { + /* + * We took a signal or timeout while waiting + * on mq_notempty... 
+ */ + return (-1); + } + } + + (void) mutex_lock(&mqhp->mq_exclusive); + msg_size = mq_getmsg(mqhp, msg_ptr, msg_prio); + (void) sem_post(&mqhp->mq_notfull); + MQ_ASSERT_SEMVAL_LEQ(&mqhp->mq_notfull, ((int)mqhp->mq_maxmsg)); + (void) mutex_unlock(&mqhp->mq_exclusive); + + return (msg_size); +} + +ssize_t +_mq_receive(mqd_t mqdes, char *msg_ptr, size_t msg_len, uint_t *msg_prio) +{ + return (__mq_timedreceive(mqdes, msg_ptr, msg_len, msg_prio, + NULL, ABS_TIME)); +} + +ssize_t +_mq_timedreceive(mqd_t mqdes, char *msg_ptr, size_t msg_len, + uint_t *msg_prio, const timespec_t *abs_timeout) +{ + return (__mq_timedreceive(mqdes, msg_ptr, msg_len, msg_prio, + abs_timeout, ABS_TIME)); +} + +ssize_t +_mq_reltimedreceive_np(mqd_t mqdes, char *msg_ptr, size_t msg_len, + uint_t *msg_prio, const timespec_t *rel_timeout) +{ + return (__mq_timedreceive(mqdes, msg_ptr, msg_len, msg_prio, + rel_timeout, REL_TIME)); +} + +/* + * Only used below, in _mq_notify(). + * We already have a spawner thread. + * Verify that the attributes match; cancel it if necessary. + */ +static int +cancel_if_necessary(thread_communication_data_t *tcdp, + const struct sigevent *sigevp) +{ + int do_cancel = !_pthread_attr_equal(tcdp->tcd_attrp, + sigevp->sigev_notify_attributes); + + if (do_cancel) { + /* + * Attributes don't match, cancel the spawner thread. + */ + (void) pthread_cancel(tcdp->tcd_server_id); + } else { + /* + * Reuse the existing spawner thread with possibly + * changed notification function and value. + */ + tcdp->tcd_notif.sigev_notify = SIGEV_THREAD; + tcdp->tcd_notif.sigev_signo = 0; + tcdp->tcd_notif.sigev_value = sigevp->sigev_value; + tcdp->tcd_notif.sigev_notify_function = + sigevp->sigev_notify_function; + } + + return (do_cancel); +} + +int +_mq_notify(mqd_t mqdes, const struct sigevent *sigevp) +{ + mqdes_t *mqdp = (mqdes_t *)mqdes; + mqhdr_t *mqhp; + thread_communication_data_t *tcdp; + siginfo_t mq_siginfo; + struct sigevent sigevent; + struct stat64 statb; + port_notify_t *pn; + void *userval; + int rval = -1; + int ntype; + int port; + + if (!mq_is_valid(mqdp)) { + errno = EBADF; + return (-1); + } + + mqhp = mqdp->mqd_mq; + + (void) mutex_lock(&mqhp->mq_exclusive); + + if (sigevp == NULL) { /* remove notification */ + if (mqhp->mq_des == (uintptr_t)mqdp && + mqhp->mq_sigid.sn_pid == getpid()) { + /* notification is set for this descriptor, remove it */ + (void) __signotify(SN_CANCEL, NULL, &mqhp->mq_sigid); + if ((tcdp = mqdp->mqd_tcd) != NULL) { + sig_mutex_lock(&tcdp->tcd_lock); + if (tcdp->tcd_msg_enabled) { + /* cancel the spawner thread */ + tcdp = mqdp->mqd_tcd; + mqdp->mqd_tcd = NULL; + (void) pthread_cancel( + tcdp->tcd_server_id); + } + sig_mutex_unlock(&tcdp->tcd_lock); + } + mqhp->mq_ntype = 0; + mqhp->mq_des = 0; + } else { + /* notification is not set for this descriptor */ + errno = EBUSY; + goto bad; + } + } else { /* register notification with this process */ + switch (ntype = sigevp->sigev_notify) { + case SIGEV_THREAD: + userval = sigevp->sigev_value.sival_ptr; + port = -1; + break; + case SIGEV_PORT: + pn = sigevp->sigev_value.sival_ptr; + userval = pn->portnfy_user; + port = pn->portnfy_port; + if (fstat64(port, &statb) != 0 || + !S_ISPORT(statb.st_mode)) { + errno = EBADF; + goto bad; + } + (void) memset(&sigevent, 0, sizeof (sigevent)); + sigevent.sigev_notify = SIGEV_PORT; + sigevp = &sigevent; + break; + } + switch (ntype) { + case SIGEV_NONE: + mq_siginfo.si_signo = 0; + mq_siginfo.si_code = SI_MESGQ; + break; + case SIGEV_SIGNAL: + mq_siginfo.si_signo = 
sigevp->sigev_signo; + mq_siginfo.si_value = sigevp->sigev_value; + mq_siginfo.si_code = SI_MESGQ; + break; + case SIGEV_THREAD: + if ((tcdp = mqdp->mqd_tcd) != NULL && + cancel_if_necessary(tcdp, sigevp)) + mqdp->mqd_tcd = NULL; + /* FALLTHROUGH */ + case SIGEV_PORT: + if ((tcdp = mqdp->mqd_tcd) == NULL) { + /* we must create a spawner thread */ + tcdp = setup_sigev_handler(sigevp, MQ); + if (tcdp == NULL) { + errno = EBADF; + goto bad; + } + tcdp->tcd_msg_enabled = 0; + tcdp->tcd_msg_closing = 0; + tcdp->tcd_msg_avail = &mqhp->mq_spawner; + if (launch_spawner(tcdp) != 0) { + free_sigev_handler(tcdp); + goto bad; + } + mqdp->mqd_tcd = tcdp; + } + mq_siginfo.si_signo = 0; + mq_siginfo.si_code = SI_MESGQ; + break; + default: + errno = EINVAL; + goto bad; + } + + /* register notification */ + if (__signotify(SN_PROC, &mq_siginfo, &mqhp->mq_sigid) < 0) + goto bad; + mqhp->mq_ntype = ntype; + mqhp->mq_des = (uintptr_t)mqdp; + switch (ntype) { + case SIGEV_THREAD: + case SIGEV_PORT: + tcdp->tcd_port = port; + tcdp->tcd_msg_object = mqdp; + tcdp->tcd_msg_userval = userval; + sig_mutex_lock(&tcdp->tcd_lock); + tcdp->tcd_msg_enabled = ntype; + sig_mutex_unlock(&tcdp->tcd_lock); + (void) cond_broadcast(&tcdp->tcd_cv); + break; + } + } + + rval = 0; /* success */ +bad: + (void) mutex_unlock(&mqhp->mq_exclusive); + return (rval); +} + +int +_mq_setattr(mqd_t mqdes, const struct mq_attr *mqstat, struct mq_attr *omqstat) +{ + mqdes_t *mqdp = (mqdes_t *)mqdes; + mqhdr_t *mqhp; + uint_t flag = 0; + + if (!mq_is_valid(mqdp)) { + errno = EBADF; + return (-1); + } + + /* store current attributes */ + if (omqstat != NULL) { + int count; + + mqhp = mqdp->mqd_mq; + omqstat->mq_flags = mqdp->mqd_mqdn->mqdn_flags; + omqstat->mq_maxmsg = (long)mqhp->mq_maxmsg; + omqstat->mq_msgsize = (long)mqhp->mq_maxsz; + (void) sem_getvalue(&mqhp->mq_notempty, &count); + omqstat->mq_curmsgs = count; + } + + /* set description attributes */ + if ((mqstat->mq_flags & O_NONBLOCK) != 0) + flag = FNONBLOCK; + mqdp->mqd_mqdn->mqdn_flags = flag; + + return (0); +} + +int +_mq_getattr(mqd_t mqdes, struct mq_attr *mqstat) +{ + mqdes_t *mqdp = (mqdes_t *)mqdes; + mqhdr_t *mqhp; + int count; + + if (!mq_is_valid(mqdp)) { + errno = EBADF; + return (-1); + } + + mqhp = mqdp->mqd_mq; + + mqstat->mq_flags = mqdp->mqd_mqdn->mqdn_flags; + mqstat->mq_maxmsg = (long)mqhp->mq_maxmsg; + mqstat->mq_msgsize = (long)mqhp->mq_maxsz; + (void) sem_getvalue(&mqhp->mq_notempty, &count); + mqstat->mq_curmsgs = count; + return (0); +} + +/* + * Cleanup after fork1() in the child process. + */ +void +postfork1_child_sigev_mq(void) +{ + thread_communication_data_t *tcdp; + mqdes_t *mqdp; + + for (mqdp = mq_list; mqdp; mqdp = mqdp->mqd_next) { + if ((tcdp = mqdp->mqd_tcd) != NULL) { + mqdp->mqd_tcd = NULL; + tcd_teardown(tcdp); + } + } +} diff --git a/usr/src/lib/libc/port/rt/pos4obj.c b/usr/src/lib/libc/port/rt/pos4obj.c new file mode 100644 index 0000000000..86f5a07595 --- /dev/null +++ b/usr/src/lib/libc/port/rt/pos4obj.c @@ -0,0 +1,482 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include "synonyms.h"
+#include "mtlib.h"
+#include <sys/types.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <pthread.h>
+#include <thread.h>
+#include <string.h>
+#include <dirent.h>
+#include <stdio.h>
+#include <dlfcn.h>
+#include <md5.h>
+#include "pos4obj.h"
+
+#define	HASHSTRLEN	32
+
+static	char	*__pos4obj_name(const char *, const char *);
+static	void	__pos4obj_md5toa(unsigned char *, unsigned char *);
+static	void	__pos4obj_clean(char *);
+
+static	char	objroot[] = "/tmp/";
+static	long int	name_max = 0;
+
+int
+__open_nc(const char *path, int oflag, mode_t mode)
+{
+	int canstate, val;
+	struct stat64 statbuf;
+
+	/*
+	 * Ensure path is not a symlink to somewhere else.  This provides
+	 * a modest amount of protection against easy security attacks.
+	 */
+	if (lstat64(path, &statbuf) == 0) {
+		if (S_ISLNK(statbuf.st_mode)) {
+			errno = EINVAL;
+			return (-1);
+		}
+	}
+
+	(void) pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &canstate);
+	val = open64(path, oflag, mode);
+	(void) pthread_setcancelstate(canstate, &canstate);
+
+	return (val);
+}
+
+int
+__close_nc(int fildes)
+{
+	int canstate, val;
+
+	(void) pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &canstate);
+	val = close(fildes);
+	(void) pthread_setcancelstate(canstate, &canstate);
+
+	return (val);
+}
+
+/*
+ * This is to avoid loading libmd.so.1 unless we absolutely have to.
+ */
+typedef void (*md5_calc_t)(unsigned char *, unsigned char *, unsigned int);
+static void *md5_handle = NULL;
+static md5_calc_t real_md5_calc = NULL;
+static mutex_t md5_lock = DEFAULTMUTEX;
+
+static void
+load_md5_calc(void)
+{
+	lmutex_lock(&md5_lock);
+	if (real_md5_calc == NULL) {
+		md5_handle = dlopen("libmd.so.1", RTLD_LAZY);
+		if (md5_handle == NULL)
+			real_md5_calc = (md5_calc_t)(-1);
+		else {
+			real_md5_calc =
+			    (md5_calc_t)dlsym(md5_handle, "md5_calc");
+			if (real_md5_calc == NULL) {
+				(void) dlclose(md5_handle);
+				md5_handle = NULL;
+				real_md5_calc = (md5_calc_t)(-1);
+			}
+		}
+	}
+	lmutex_unlock(&md5_lock);
+}
+
+static char *
+__pos4obj_name(const char *path, const char *type)
+{
+	int	shortpath = 1;
+	int	olderrno;
+	size_t	len;
+	char	*dfile;
+	unsigned char	hashbuf[HASHSTRLEN + 1];
+	unsigned char	md5_digest[MD5_DIGEST_LENGTH];
+
+	/*
+	 * If the path is name_max - strlen(type) characters or less,
+	 * the name of the file to use will be the path prefixed by
+	 * the type.
+	 *
+	 * In the special case where the path is longer than
+	 * name_max - strlen(type) characters, we create a string based on the
+	 * MD5 hash of the path.  We prefix that string with a '.' to
+	 * make it obscure, and create a directory in objroot with
+	 * that name.  In that directory, we create a directory named
+	 * after the type of object requested.  Inside the type
+	 * directory, the filename will be the path of the object.  This
+	 * prevents collisions in all namespaces.
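+	 *
+	 * In the short-path case the name is a simple concatenation;
+	 * e.g. path = "/foo" and type = ".MQD" yield "/tmp/.MQDfoo".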
+ * + * Example: + * Let objroot = "/tmp/", path = "/<longpath>", and type = ".MQD" + * Let the MD5 hash of "<longpath>" = "<hash>" + * + * The desired file is /tmp/.<hash>/.MQD/<longpath> + */ + + /* + * Do not include the leading '/' in the path length. + * Assumes __pos4obj_check(path) has already been called. + */ + if ((strlen(path) - 1) > (name_max - strlen(type))) + shortpath = 0; + + if (shortpath) { + /* + * strlen(path) includes leading slash as space for NUL. + */ + len = strlen(objroot) + strlen(type) + strlen(path); + } else { + /* + * Long path name. Add 3 for extra '/', '.' and '\0' + */ + len = strlen(objroot) + HASHSTRLEN + strlen(type) + + strlen(path) + 3; + } + + if ((dfile = malloc(len)) == NULL) + return (NULL); + + (void) memset(dfile, 0, len); + (void) strcpy(dfile, objroot); + + if (shortpath) { + (void) strcat(dfile, type); + (void) strcat(dfile, path + 1); + return (dfile); + } + + /* + * If we can successfully load it, call md5_calc(). + * Otherwise, (this "can't happen") return NULL. + */ + if (real_md5_calc == NULL) + load_md5_calc(); + if (real_md5_calc == (md5_calc_t)(-1)) { + free(dfile); + return (NULL); + } + + real_md5_calc(md5_digest, (unsigned char *)path + 1, strlen(path + 1)); + __pos4obj_md5toa(hashbuf, md5_digest); + (void) strcat(dfile, "."); + (void) strcat(dfile, (const char *)hashbuf); + + /* + * Errno must be preserved across the following calls to + * mkdir. This needs to be done to prevent incorrect error + * reporting in certain cases. When we attempt to open a + * non-existent object without the O_CREAT flag, it will + * always create a lock file first. The lock file is created + * and then the open is attempted, but fails with ENOENT. The + * lock file is then destroyed. In the following code path, we + * are finding the absolute path to the lock file after + * already having attempted the open (which set errno to + * ENOENT). The following calls to mkdir will return -1 and + * set errno to EEXIST, since the hash and type directories + * were created when the lock file was created. The correct + * errno is the ENOENT from the attempted open of the desired + * object. + */ + olderrno = errno; + + /* + * Create hash directory. Use 777 permissions so everyone can use it. + */ + if (mkdir(dfile, S_IRWXU|S_IRWXG|S_IRWXO) == 0) { + if (chmod(dfile, S_IRWXU|S_IRWXG|S_IRWXO) == -1) { + free(dfile); + return (NULL); + } + } else { + if (errno != EEXIST) { + free(dfile); + return (NULL); + } + } + + (void) strcat(dfile, "/"); + (void) strcat(dfile, type); + + /* + * Create directory for requested type. Use 777 perms so everyone + * can use it. + */ + if (mkdir(dfile, S_IRWXU|S_IRWXG|S_IRWXO) == 0) { + if (chmod(dfile, S_IRWXU|S_IRWXG|S_IRWXO) == -1) { + free(dfile); + return (NULL); + } + } else { + if (errno != EEXIST) { + free(dfile); + return (NULL); + } + } + + errno = olderrno; + (void) strcat(dfile, path); + return (dfile); +} + +/* + * Takes a 128-bit MD5 digest and transforms to a sequence of 32 ASCII + * characters. Output is the hexadecimal representation of the digest. + * + * The output buffer must be at least HASHSTRLEN + 1 characters + * long. HASHSTRLEN is the size of the MD5 digest (128 bits) + * divided by the number of bits used per char of output (4). The + * extra character at the end is for the NUL terminating character. 
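+ *
+ * Note (an observation about the code below, not an interface
+ * guarantee): the digest is read as four native-endian 32-bit words,
+ * so the resulting string can differ between big- and little-endian
+ * machines.  That is fine here; the name only has to be consistent
+ * on a single machine.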
+ */
+
+static void
+__pos4obj_md5toa(unsigned char *dest, unsigned char *src)
+{
+	int i;
+	uint32_t *p;
+
+	/* LINTED pointer cast may result in improper alignment */
+	p = (uint32_t *)src;
+
+	for (i = 0; i < (MD5_DIGEST_LENGTH / 4); i++)
+		(void) snprintf((char *)dest + (i * 8), 9, "%.8x", *p++);
+
+	dest[HASHSTRLEN] = '\0';
+}
+
+/*
+ * This open function assumes that no simultaneous open/unlink
+ * operations are going on.  The caller is expected to ensure that
+ * opens in O_CREAT mode happen atomically.
+ * It sets *crflag to 1 if the file was created, else 0.
+ */
+int
+__pos4obj_open(const char *name, char *type, int oflag,
+    mode_t mode, int *crflag)
+{
+	int fd;
+	char *dfile;
+
+	errno = 0;
+	*crflag = 0;
+
+	if ((dfile = __pos4obj_name(name, type)) == NULL) {
+		return (-1);
+	}
+
+	if (!(oflag & O_CREAT)) {
+		if ((fd = __open_nc(dfile, oflag, mode)) == -1)
+			__pos4obj_clean(dfile);
+
+		free(dfile);
+		return (fd);
+	}
+
+	/*
+	 * We need to make sure that crflag is set iff we actually create
+	 * the file.  We do this by or'ing in O_EXCL, and attempting an
+	 * open.  If that fails with an EEXIST, and O_EXCL wasn't specified
+	 * by the caller, then the file seems to exist; we'll try an
+	 * open with O_CREAT cleared.  If that succeeds, then the file
+	 * did indeed exist.  If that fails with an ENOENT, however, the
+	 * file was removed between the opens; we need to take another
+	 * lap.
+	 */
+	for (;;) {
+		if ((fd = __open_nc(dfile, (oflag | O_EXCL), mode)) == -1) {
+			if (errno == EEXIST && !(oflag & O_EXCL)) {
+				fd = __open_nc(dfile, oflag & ~O_CREAT, mode);
+
+				if (fd == -1 && errno == ENOENT)
+					continue;
+				break;
+			}
+		} else {
+			*crflag = 1;
+		}
+		break;
+	}
+
+	free(dfile);
+	return (fd);
+}
+
+int
+__pos4obj_unlink(const char *name, const char *type)
+{
+	int err;
+	char *dfile;
+
+	if ((dfile = __pos4obj_name(name, type)) == NULL) {
+		return (-1);
+	}
+
+	err = unlink(dfile);
+
+	__pos4obj_clean(dfile);
+
+	free(dfile);
+
+	return (err);
+}
+
+/*
+ * This function opens the lock file for each named object;
+ * the presence of this file in the file system is the lock.
+ */
+int
+__pos4obj_lock(const char *name, const char *ltype)
+{
+	char *dfile;
+	int fd;
+	int limit = 64;
+
+	if ((dfile = __pos4obj_name(name, ltype)) == NULL) {
+		return (-1);
+	}
+
+	while (limit-- > 0) {
+		if ((fd = __open_nc(dfile, O_RDWR | O_CREAT | O_EXCL, 0666))
+		    < 0) {
+			if (errno != EEXIST)
+				break;
+			(void) sleep(1);
+			continue;
+		}
+
+		(void) __close_nc(fd);
+		free(dfile);
+		return (1);
+	}
+
+	free(dfile);
+	return (-1);
+}
+
+/*
+ * Unlocks the file by unlinking it from the filesystem
+ */
+int
+__pos4obj_unlock(const char *path, const char *type)
+{
+	return (__pos4obj_unlink(path, type));
+}
+
+/*
+ * Removes unused hash and type directories that may exist in the
+ * specified path.
+ */
+static void
+__pos4obj_clean(char *path)
+{
+	char *p;
+	int olderrno;
+
+	/*
+	 * path is either
+	 * 1) /<objroot>/<type><path>  or
+	 * 2) /<objroot>/.<hash>/<type>/<path>
+	 *
+	 * In case 1, there is nothing to clean.
+	 *
+	 * Detect case 2 by looking for a '/' after /objroot/ and
+	 * remove the two trailing directories, if empty.
+	 */
+	if (strchr(path + strlen(objroot), '/') == NULL)
+		return;
+
+	/*
+	 * Preserve errno across calls to rmdir.  See block comment in
+	 * __pos4obj_name() for explanation.
+ */ + olderrno = errno; + + if ((p = strrchr(path, '/')) == NULL) + return; + *p = '\0'; + + (void) rmdir(path); + + if ((p = strrchr(path, '/')) == NULL) + return; + *p = '\0'; + + (void) rmdir(path); + + errno = olderrno; +} + + +/* + * Check that path starts with a /, does not contain a / within it + * and is not longer than PATH_MAX or NAME_MAX + */ +int +__pos4obj_check(const char *path) +{ + long int i; + + /* + * This assumes that __pos4obj_check() is called before + * any of the other functions in this file + */ + if (name_max == 0 || name_max == -1) { + name_max = pathconf(objroot, _PC_NAME_MAX); + if (name_max == -1) + return (-1); + } + + if (*path++ != '/') { + errno = EINVAL; + return (-1); + } + + for (i = 0; *path != '\0'; i++) { + if (*path++ == '/') { + errno = EINVAL; + return (-1); + } + } + + if (i > PATH_MAX || i > name_max) { + errno = ENAMETOOLONG; + return (-1); + } + + return (0); +} diff --git a/usr/src/lib/libc/port/rt/pos4obj.h b/usr/src/lib/libc/port/rt/pos4obj.h new file mode 100644 index 0000000000..609a43f64c --- /dev/null +++ b/usr/src/lib/libc/port/rt/pos4obj.h @@ -0,0 +1,78 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _POS4OBJ_H +#define _POS4OBJ_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * pos4obj.h - Header file for POSIX.4 related object names + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* flags used to indicate current state of open */ +#define DFILE_CREATE 0x01 +#define DFILE_OPEN 0x02 +#define ALLOC_MEM 0x04 +#define DFILE_MMAP 0x08 +#define PFILE_CREATE 0x10 +#define NFILE_CREATE 0x20 +#define MQDNP_MMAP 0x40 + +/* semaphore object types - used in constructing file name */ +#define SEM_DATA_TYPE ".SEMD" +#define SEM_LOCK_TYPE ".SEML" + +/* message queue object types - used in constructing file name */ +#define MQ_DATA_TYPE ".MQD" +#define MQ_PERM_TYPE ".MQP" +#define MQ_DSCN_TYPE ".MQN" +#define MQ_LOCK_TYPE ".MQL" + +/* shared memory object types - used in constructing file name */ +#define SHM_DATA_TYPE ".SHMD" +#define SHM_LOCK_TYPE ".SHML" + +/* functions defined related to object names in POSIX.4 */ +extern int __pos4obj_lock(const char *, const char *); +extern int __pos4obj_unlock(const char *, const char *); +extern int __pos4obj_unlink(const char *, const char *); +extern int __pos4obj_open(const char *, char *, int, mode_t, int *); +extern int __pos4obj_check(const char *); + +/* non-cancelable file operations */ +int __open_nc(const char *, int, mode_t); +int __close_nc(int); + +#ifdef __cplusplus +} +#endif + +#endif /* _POS4OBJ_H */ diff --git a/usr/src/lib/libc/port/rt/sched.c b/usr/src/lib/libc/port/rt/sched.c new file mode 100644 index 0000000000..58b793f2e2 --- /dev/null +++ b/usr/src/lib/libc/port/rt/sched.c @@ -0,0 +1,552 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include "synonyms.h" +#include "mtlib.h" +#include <sys/types.h> +#include <sched.h> +#include <errno.h> +#include <limits.h> +#include <unistd.h> +#include <sys/priocntl.h> +#include <sys/rtpriocntl.h> +#include <sys/tspriocntl.h> +#include <sys/rt.h> +#include <sys/ts.h> +#include <thread.h> +#include <string.h> +#include <stdlib.h> +#include "rtsched.h" + +/* + * The following variables are used for caching information + * for priocntl scheduling classes. 
+ */ +struct pcclass ts_class; +struct pcclass rt_class; +struct pcclass ia_class; +struct pcclass sys_class; + +static rtdpent_t *rt_dptbl; /* RT class parameter table */ + +typedef struct { /* type definition for generic class-specific parameters */ + int pc_clparms[PC_CLINFOSZ]; +} pc_clparms_t; + +static int map_gp_to_rtpri(pri_t); + +/* + * cache priocntl information on scheduling classes by policy + */ +int +get_info_by_policy(int policy) +{ + char *pccname; + struct pcclass *pccp; + + if (policy < 0) { + errno = EINVAL; + return (-1); + } + + switch (policy) { + case SCHED_FIFO: + case SCHED_RR: + pccp = &rt_class; + pccname = "RT"; + break; + case SCHED_OTHER: + pccp = &ts_class; + pccname = "TS"; + break; + case SCHED_SYS: + pccp = &sys_class; + pccname = "sys"; + break; + case SCHED_IA: + pccp = &ia_class; + pccname = "IA"; + break; + default: + return (policy); + } + if (pccp->pcc_state != 0) { + if (pccp->pcc_state < 0) + errno = ENOSYS; + return (pccp->pcc_state); + } + + /* get class's info */ + (void) strcpy(pccp->pcc_info.pc_clname, pccname); + if (policy == SCHED_SYS) + pccp->pcc_info.pc_cid = 0; + else if (priocntl(P_PID, 0, PC_GETCID, (caddr_t)&(pccp->pcc_info)) < 0) + return (-1); + + if (policy == SCHED_FIFO || policy == SCHED_RR) { + pcadmin_t pcadmin; + rtadmin_t rtadmin; + size_t rtdpsize; + + /* get RT class dispatch table in rt_dptbl */ + pcadmin.pc_cid = rt_class.pcc_info.pc_cid; + pcadmin.pc_cladmin = (caddr_t)&rtadmin; + rtadmin.rt_cmd = RT_GETDPSIZE; + if (priocntl(P_PID, 0, PC_ADMIN, (caddr_t)&pcadmin) < 0) + return (-1); + rtdpsize = (size_t)(rtadmin.rt_ndpents * sizeof (rtdpent_t)); + if (rt_dptbl == NULL && + (rt_dptbl = lmalloc(rtdpsize)) == NULL) { + errno = EAGAIN; + return (-1); + } + rtadmin.rt_dpents = rt_dptbl; + rtadmin.rt_cmd = RT_GETDPTBL; + if (priocntl(P_PID, 0, PC_ADMIN, (caddr_t)&pcadmin) < 0) + return (-1); + pccp->pcc_primin = 0; + pccp->pcc_primax = ((rtinfo_t *)rt_class.pcc_info.pc_clinfo)-> + rt_maxpri; + } else if (policy == SCHED_OTHER) { + pri_t prio; + + prio = ((tsinfo_t *)ts_class.pcc_info.pc_clinfo)->ts_maxupri/3; + pccp->pcc_primin = -prio; + pccp->pcc_primax = prio; + } else { + /* non-RT scheduling class */ + pcpri_t pcpri; + + /* need RT class info before we can translate priorities */ + if (get_info_by_policy(SCHED_FIFO) < 0) + return (-1); + /* + * get class's global priority's min, max, and + * translate them into RT priority level (index) via rt_dptbl. + */ + pcpri.pc_cid = pccp->pcc_info.pc_cid; + if (priocntl(0, 0, PC_GETPRIRANGE, (caddr_t)&pcpri) < 0) + return (-1); + pccp->pcc_primax = map_gp_to_rtpri(pcpri.pc_clpmax); + pccp->pcc_primin = map_gp_to_rtpri(pcpri.pc_clpmin); + } + + pccp->pcc_state = 1; + return (1); +} + +/* + * Translate global scheduling priority to RT class's user priority. + * Use the gp values in the rt_dptbl to do a reverse mapping + * of a given gpri value relative to the index range of rt_dptbl. 
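+ *
+ * For example, with a hypothetical table in which
+ * rt_dptbl[i].rt_globpri == 100 + i for i in 0..59, a global
+ * priority of 110 maps back to RT user priority 10.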
+ */
+static int
+map_gp_to_rtpri(pri_t gpri)
+{
+	rtdpent_t	*rtdp;
+	pri_t		pri;
+
+	if (gpri <= rt_dptbl[rt_class.pcc_primin].rt_globpri) {
+		pri = gpri - rt_dptbl[rt_class.pcc_primin].rt_globpri +
+		    rt_class.pcc_primin;
+	} else if (gpri >= rt_dptbl[rt_class.pcc_primax].rt_globpri) {
+		pri = gpri - rt_dptbl[rt_class.pcc_primax].rt_globpri +
+		    rt_class.pcc_primax;
+	} else {
+		pri = rt_class.pcc_primin + 1;
+		for (rtdp = rt_dptbl+1; rtdp->rt_globpri < gpri; ++rtdp, ++pri)
+			;
+		if (rtdp->rt_globpri > gpri)
+			--pri;
+	}
+
+	return (pri);
+}
+
+/*
+ * Translate RT class's user priority to global scheduling priority.
+ */
+pri_t
+map_rtpri_to_gp(pri_t pri)
+{
+	rtdpent_t	*rtdp;
+	pri_t		gpri;
+
+	if (rt_class.pcc_state == 0)
+		(void) get_info_by_policy(SCHED_FIFO);
+
+	/* The first case is the default; the other two are seldom taken */
+	if (pri <= rt_dptbl[rt_class.pcc_primin].rt_globpri) {
+		gpri = pri + rt_dptbl[rt_class.pcc_primin].rt_globpri -
+		    rt_class.pcc_primin;
+	} else if (pri >= rt_dptbl[rt_class.pcc_primax].rt_globpri) {
+		gpri = pri + rt_dptbl[rt_class.pcc_primax].rt_globpri -
+		    rt_class.pcc_primax;
+	} else {
+		gpri = rt_dptbl[rt_class.pcc_primin].rt_globpri + 1;
+		for (rtdp = rt_dptbl+1; rtdp->rt_globpri < pri; ++rtdp, ++gpri)
+			;
+		if (rtdp->rt_globpri > pri)
+			--gpri;
+	}
+	return (gpri);
+}
+
+static int
+get_info_by_class(id_t classid)
+{
+	pcinfo_t	pcinfo;
+
+	/* determine if we already know this classid */
+	if (rt_class.pcc_state > 0 && rt_class.pcc_info.pc_cid == classid)
+		return (1);
+	if (ts_class.pcc_state > 0 && ts_class.pcc_info.pc_cid == classid)
+		return (1);
+	if (sys_class.pcc_state > 0 && sys_class.pcc_info.pc_cid == classid)
+		return (1);
+	if (ia_class.pcc_state > 0 && ia_class.pcc_info.pc_cid == classid)
+		return (1);
+
+	pcinfo.pc_cid = classid;
+	if (priocntl(0, 0, PC_GETCLINFO, (caddr_t)&pcinfo) < 0) {
+		if (classid == 0)	/* no kernel info for sys class */
+			return (get_info_by_policy(SCHED_SYS));
+		return (-1);
+	}
+
+	if (rt_class.pcc_state == 0 && strcmp(pcinfo.pc_clname, "RT") == 0)
+		return (get_info_by_policy(SCHED_FIFO));
+	if (ts_class.pcc_state == 0 && strcmp(pcinfo.pc_clname, "TS") == 0)
+		return (get_info_by_policy(SCHED_OTHER));
+	if (ia_class.pcc_state == 0 && strcmp(pcinfo.pc_clname, "IA") == 0)
+		return (get_info_by_policy(SCHED_IA));
+
+	return (1);
+}
+
+int
+sched_setparam(pid_t pid, const struct sched_param *param)
+{
+	pri_t		prio = param->sched_priority;
+	pcparms_t	pcparm;
+	tsparms_t	*tsp;
+	tsinfo_t	*tsi;
+	int		scale;
+
+	if (pid < 0) {
+		errno = ESRCH;
+		return (-1);
+	}
+	if (pid == 0)
+		pid = P_MYID;
+
+	/* get process's current scheduling policy */
+	pcparm.pc_cid = PC_CLNULL;
+	if (priocntl(P_PID, pid, PC_GETPARMS, (caddr_t)&pcparm) == -1)
+		return (-1);
+	if (get_info_by_class(pcparm.pc_cid) < 0)
+		return (-1);
+
+	if (pcparm.pc_cid == rt_class.pcc_info.pc_cid) {
+		/* SCHED_FIFO or SCHED_RR policy */
+		if (prio < rt_class.pcc_primin || prio > rt_class.pcc_primax) {
+			errno = EINVAL;
+			return (-1);
+		}
+		((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs = RT_NOCHANGE;
+		((rtparms_t *)pcparm.pc_clparms)->rt_pri = prio;
+	} else if (pcparm.pc_cid == ts_class.pcc_info.pc_cid) {
+		/* SCHED_OTHER policy */
+		tsi = (tsinfo_t *)ts_class.pcc_info.pc_clinfo;
+		scale = tsi->ts_maxupri;
+		tsp = (tsparms_t *)pcparm.pc_clparms;
+		tsp->ts_uprilim = tsp->ts_upri = -(scale * prio) / 20;
+	} else {
+		/*
+		 * policy is not defined by POSIX.4.
+		 * just pass parameter data through to priocntl.
+ * param should contain an image of class-specific parameters + * (after the sched_priority member). + */ + *((pc_clparms_t *)pcparm.pc_clparms) = + *((pc_clparms_t *)(&(param->sched_priority)+1)); + } + + return ((int)priocntl(P_PID, pid, PC_SETPARMS, (caddr_t)&pcparm)); +} + +int +sched_getparam(pid_t pid, struct sched_param *param) +{ + pcparms_t pcparm; + pri_t prio; + int scale; + tsinfo_t *tsi; + + if (pid < 0) { + errno = ESRCH; + return (-1); + } + if (pid == 0) + pid = P_MYID; + + pcparm.pc_cid = PC_CLNULL; + if (priocntl(P_PID, pid, PC_GETPARMS, (caddr_t)&pcparm) == -1) + return (-1); + if (get_info_by_class(pcparm.pc_cid) < 0) + return (-1); + + if (pcparm.pc_cid == rt_class.pcc_info.pc_cid) { + param->sched_priority = + ((rtparms_t *)pcparm.pc_clparms)->rt_pri; + } else if (pcparm.pc_cid == ts_class.pcc_info.pc_cid) { + param->sched_nicelim = + ((tsparms_t *)pcparm.pc_clparms)->ts_uprilim; + prio = param->sched_nice = + ((tsparms_t *)pcparm.pc_clparms)->ts_upri; + tsi = (tsinfo_t *)ts_class.pcc_info.pc_clinfo; + scale = tsi->ts_maxupri; + if (scale == 0) + param->sched_priority = 0; + else + param->sched_priority = -(prio * 20) / scale; + } else { + /* + * policy is not defined by POSIX.4 + * just return a copy of pcparams_t image in param. + */ + *((pc_clparms_t *)(&(param->sched_priority)+1)) = + *((pc_clparms_t *)pcparm.pc_clparms); + param->sched_priority = + sched_get_priority_min((int)(pcparm.pc_cid + _SCHED_NEXT)); + } + + return (0); +} + +int +sched_setscheduler(pid_t pid, int policy, const struct sched_param *param) +{ + pri_t prio = param->sched_priority; + pcparms_t pcparm; + int oldpolicy; + tsinfo_t *tsi; + tsparms_t *tsp; + int scale; + + if ((oldpolicy = sched_getscheduler(pid)) < 0) + return (-1); + + if (pid == 0) + pid = P_MYID; + + if (get_info_by_policy(policy) < 0) { + errno = EINVAL; + return (-1); + } + + switch (policy) { + case SCHED_FIFO: + case SCHED_RR: + if (prio < rt_class.pcc_primin || prio > rt_class.pcc_primax) { + errno = EINVAL; + return (-1); + } + pcparm.pc_cid = rt_class.pcc_info.pc_cid; + ((rtparms_t *)pcparm.pc_clparms)->rt_pri = prio; + ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs = + (policy == SCHED_RR ? RT_TQDEF : RT_TQINF); + break; + + case SCHED_OTHER: + pcparm.pc_cid = ts_class.pcc_info.pc_cid; + tsi = (tsinfo_t *)ts_class.pcc_info.pc_clinfo; + scale = tsi->ts_maxupri; + tsp = (tsparms_t *)pcparm.pc_clparms; + tsp->ts_uprilim = tsp->ts_upri = -(scale * prio) / 20; + break; + + default: + switch (policy) { + case SCHED_SYS: + pcparm.pc_cid = sys_class.pcc_info.pc_cid; + break; + case SCHED_IA: + pcparm.pc_cid = ia_class.pcc_info.pc_cid; + break; + default: + pcparm.pc_cid = policy - _SCHED_NEXT; + break; + } + /* + * policy is not defined by POSIX.4. + * just pass parameter data through to priocntl. + * param should contain an image of class-specific parameters + * (after the sched_priority member). 
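+	 * That is, the class-specific image is taken to begin one int
+	 * past sched_priority, at (pc_clparms_t *)(&(param->sched_priority)+1).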
+	 */
+	*((pc_clparms_t *)pcparm.pc_clparms) =
+	    *((pc_clparms_t *)(&(param->sched_priority)+1));
+	}
+
+	/* setting scheduling policy & parameters for the process */
+	if (priocntl(P_PID, pid, PC_SETPARMS, (caddr_t)&pcparm) == -1)
+		return (-1);
+
+	return (oldpolicy);
+}
+
+int
+sched_getscheduler(pid_t pid)
+{
+	pcparms_t	pcparm;
+	int		policy;
+
+	if (pid < 0) {
+		errno = ESRCH;
+		return (-1);
+	}
+	if (pid == 0)
+		pid = P_MYID;
+
+	/* get scheduling policy & parameters for the process */
+	pcparm.pc_cid = PC_CLNULL;
+	if (priocntl(P_PID, pid, PC_GETPARMS, (caddr_t)&pcparm) == -1)
+		return (-1);
+	if (get_info_by_class(pcparm.pc_cid) < 0)
+		return (-1);
+
+	if (pcparm.pc_cid == rt_class.pcc_info.pc_cid)
+		policy = ((((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs ==
+		    RT_TQINF ? SCHED_FIFO : SCHED_RR));
+	else if (pcparm.pc_cid == ts_class.pcc_info.pc_cid)
+		policy = SCHED_OTHER;
+	else if (pcparm.pc_cid == sys_class.pcc_info.pc_cid)
+		policy = SCHED_SYS;
+	else if (pcparm.pc_cid == ia_class.pcc_info.pc_cid)
+		policy = SCHED_IA;
+	else {
+		/*
+		 * policy is not defined by POSIX.4
+		 * return a unique dot4 policy id.
+		 */
+		policy = (int)(_SCHED_NEXT + pcparm.pc_cid);
+	}
+
+	return (policy);
+}
+
+int
+sched_yield(void)
+{
+	thr_yield();
+	return (0);
+}
+
+int
+sched_get_priority_max(int policy)
+{
+	pcpri_t pcpri;
+
+	if (get_info_by_policy(policy) < 0)
+		return (-1);
+
+	if (policy == SCHED_FIFO || policy == SCHED_RR)
+		return (rt_class.pcc_primax);
+	else if (policy == SCHED_OTHER)
+		return (ts_class.pcc_primax);
+	else if (policy == SCHED_SYS)
+		return (sys_class.pcc_primax);
+	else if (policy == SCHED_IA)
+		return (ia_class.pcc_primax);
+	else {	/* policy not in POSIX.4 */
+		pcpri.pc_cid = policy - _SCHED_NEXT;
+		if (priocntl(0, 0, PC_GETPRIRANGE, (caddr_t)&pcpri) == 0)
+			return (map_gp_to_rtpri(pcpri.pc_clpmax));
+	}
+
+	errno = EINVAL;
+	return (-1);
+}
+
+int
+sched_get_priority_min(int policy)
+{
+	pcpri_t pcpri;
+
+	if (get_info_by_policy(policy) < 0)
+		return (-1);
+
+	if (policy == SCHED_FIFO || policy == SCHED_RR)
+		return (rt_class.pcc_primin);
+	else if (policy == SCHED_OTHER)
+		return (ts_class.pcc_primin);
+	else if (policy == SCHED_SYS)
+		return (sys_class.pcc_primin);
+	else if (policy == SCHED_IA)
+		return (ia_class.pcc_primin);
+	else {	/* policy not in POSIX.4 */
+		pcpri.pc_cid = policy - _SCHED_NEXT;
+		if (priocntl(0, 0, PC_GETPRIRANGE, (caddr_t)&pcpri) == 0)
+			return (map_gp_to_rtpri(pcpri.pc_clpmin));
+	}
+
+	errno = EINVAL;
+	return (-1);
+}
+
+int
+sched_rr_get_interval(pid_t pid, timespec_t *interval)
+{
+	pcparms_t pcparm;
+
+	if (pid < 0) {
+		errno = ESRCH;
+		return (-1);
+	}
+	if (pid == 0)
+		pid = P_MYID;
+
+	if (get_info_by_policy(SCHED_RR) < 0)
+		return (-1);
+
+	pcparm.pc_cid = PC_CLNULL;
+	if (priocntl(P_PID, pid, PC_GETPARMS, (caddr_t)&pcparm) == -1)
+		return (-1);
+
+	if (pcparm.pc_cid == rt_class.pcc_info.pc_cid &&
+	    (((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs != RT_TQINF)) {
+		/* SCHED_RR */
+		interval->tv_sec = ((rtparms_t *)pcparm.pc_clparms)->rt_tqsecs;
+		interval->tv_nsec =
+		    ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs;
+		return (0);
+	}
+
+	errno = EINVAL;
+	return (-1);
+}
diff --git a/usr/src/lib/libc/port/rt/sem.c b/usr/src/lib/libc/port/rt/sem.c
new file mode 100644
index 0000000000..af3bdcc06a
--- /dev/null
+++ b/usr/src/lib/libc/port/rt/sem.c
@@ -0,0 +1,367 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#pragma weak sem_open = _sem_open +#pragma weak sem_close = _sem_close +#pragma weak sem_unlink = _sem_unlink +#pragma weak sem_init = _sem_init +#pragma weak sem_destroy = _sem_destroy +#pragma weak sem_wait = _sem_wait +#pragma weak sem_timedwait = _sem_timedwait +#pragma weak sem_reltimedwait_np = _sem_reltimedwait_np +#pragma weak sem_trywait = _sem_trywait +#pragma weak sem_post = _sem_post +#pragma weak sem_getvalue = _sem_getvalue + +#include "synonyms.h" +#include "mtlib.h" +#include <sys/types.h> +#include <semaphore.h> +#include <synch.h> +#include <errno.h> +#include <stdarg.h> +#include <limits.h> +#include <stdlib.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <unistd.h> +#include <thread.h> +#include "pos4obj.h" + +typedef struct semaddr { + struct semaddr *sad_next; /* next in the link */ + char sad_name[PATH_MAX + 1]; /* name of sem object */ + sem_t *sad_addr; /* mmapped address of semaphore */ + ino64_t sad_inode; /* inode # of the mmapped file */ +} semaddr_t; + +static long semvaluemax = 0; +static semaddr_t *semheadp = NULL; +static mutex_t semlock = DEFAULTMUTEX; + +sem_t * +_sem_open(const char *path, int oflag, /* mode_t mode, int value */ ...) 
+{
+	va_list		ap;
+	mode_t		crmode = 0;
+	sem_t		*sem = NULL;
+	struct stat64	statbuf;
+	semaddr_t	*next = NULL;
+	int		fd = 0;
+	int		error = 0;
+	int		cr_flag = 0;
+	uint_t		value = 0;
+
+	if (__pos4obj_check(path) == -1)
+		return (SEM_FAILED);
+
+	/* acquire semaphore lock to have atomic operation */
+	if (__pos4obj_lock(path, SEM_LOCK_TYPE) < 0)
+		return (SEM_FAILED);
+
+	/* modify oflag to have RDWR and filter CREATE mode only */
+	oflag = (oflag & (O_CREAT|O_EXCL)) | (O_RDWR);
+	if (oflag & O_CREAT) {
+		if (semvaluemax == 0 &&
+		    (semvaluemax = _sysconf(_SC_SEM_VALUE_MAX)) <= 0)
+			semvaluemax = -1;
+		va_start(ap, oflag);
+		crmode = va_arg(ap, mode_t);
+		value = va_arg(ap, uint_t);
+		va_end(ap);
+		/* check value < the max for a named semaphore */
+		if (semvaluemax < 0 ||
+		    (ulong_t)value > (ulong_t)semvaluemax) {
+			errno = EINVAL;
+			goto out;
+		}
+	}
+
+	errno = 0;
+
+	if ((fd = __pos4obj_open(path, SEM_DATA_TYPE,
+	    oflag, crmode, &cr_flag)) < 0)
+		goto out;
+
+	if (cr_flag)
+		cr_flag = DFILE_CREATE | DFILE_OPEN;
+	else
+		cr_flag = DFILE_OPEN;
+
+	/* find out inode # for the opened file */
+	if (fstat64(fd, &statbuf) < 0)
+		goto out;
+
+	/* if we created the file, extend it to its full size */
+	if ((cr_flag & DFILE_CREATE) != 0) {
+		if (ftruncate64(fd, (off64_t)sizeof (sem_t)) < 0)
+			goto out;
+	} else {
+		/*
+		 * If this semaphore has already been opened, the inode
+		 * number will match; return the already-mapped
+		 * semaphore address.
+		 */
+		lmutex_lock(&semlock);
+		for (next = semheadp; next != NULL; next = next->sad_next) {
+			if (statbuf.st_ino == next->sad_inode &&
+			    strcmp(path, next->sad_name) == 0) {
+				(void) __close_nc(fd);
+				lmutex_unlock(&semlock);
+				(void) __pos4obj_unlock(path, SEM_LOCK_TYPE);
+				return (next->sad_addr);
+			}
+		}
+		lmutex_unlock(&semlock);
+	}
+
+	/* new sem descriptor to be allocated and new address to be mapped */
+	if ((next = malloc(sizeof (semaddr_t))) == NULL) {
+		errno = ENOMEM;
+		goto out;
+	}
+	cr_flag |= ALLOC_MEM;
+
+	/* LINTED */
+	sem = (sem_t *)mmap64(NULL, sizeof (sem_t), PROT_READ|PROT_WRITE,
+	    MAP_SHARED, fd, (off64_t)0);
+	(void) __close_nc(fd);
+	cr_flag &= ~DFILE_OPEN;
+	if (sem == MAP_FAILED)
+		goto out;
+	cr_flag |= DFILE_MMAP;
+
+	/* if created, initialize */
+	if (cr_flag & DFILE_CREATE) {
+		error = sema_init((sema_t *)sem, value, USYNC_PROCESS, 0);
+		if (error) {
+			errno = error;
+			goto out;
+		}
+	}
+
+	if (__pos4obj_unlock(path, SEM_LOCK_TYPE) == 0) {
+		/* add to the list pointed to by semheadp */
+		lmutex_lock(&semlock);
+		next->sad_next = semheadp;
+		semheadp = next;
+		next->sad_addr = sem;
+		next->sad_inode = statbuf.st_ino;
+		(void) strcpy(next->sad_name, path);
+		lmutex_unlock(&semlock);
+		return (sem);
+	}
+	/* fall into the error case */
+out:
+	error = errno;
+	if ((cr_flag & DFILE_OPEN) != 0)
+		(void) __close_nc(fd);
+	if ((cr_flag & DFILE_CREATE) != 0)
+		(void) __pos4obj_unlink(path, SEM_DATA_TYPE);
+	if ((cr_flag & ALLOC_MEM) != 0)
+		free(next);
+	if ((cr_flag & DFILE_MMAP) != 0)
+		(void) munmap((caddr_t)sem, sizeof (sem_t));
+	(void) __pos4obj_unlock(path, SEM_LOCK_TYPE);
+	errno = error;
+	return (SEM_FAILED);
+}
+
+int
+_sem_close(sem_t *sem)
+{
+	semaddr_t	**next;
+	semaddr_t	*freeit;
+
+	lmutex_lock(&semlock);
+	for (next = &semheadp; (freeit = *next) != NULL;
+	    next = &(freeit->sad_next)) {
+		if (freeit->sad_addr == sem) {
+			*next = freeit->sad_next;
+			lmutex_unlock(&semlock);
+			free(freeit);
+			return (munmap((caddr_t)sem, sizeof (sem_t)));
+		}
+	}
+	lmutex_unlock(&semlock);
+	errno = EINVAL;
+	return (-1);
+}
+
+int
+_sem_unlink(const char *path)
+{
+ int error; + int oerrno; + + if (__pos4obj_check(path) < 0) + return (-1); + + if (__pos4obj_lock(path, SEM_LOCK_TYPE) < 0) + return (-1); + + error = __pos4obj_unlink(path, SEM_DATA_TYPE); + + oerrno = errno; + + (void) __pos4obj_unlock(path, SEM_LOCK_TYPE); + + errno = oerrno; + + return (error); +} + +/* + * SUSV3 requires ("shall fail") an EINVAL failure for operations + * on invalid semaphores, including uninitialized unnamed semaphores. + * The best we can do is check that the magic number is correct. + * This is not perfect, but it allows the test suite to pass. + * (Standards bodies are filled with fools and idiots.) + */ +static int +sem_invalid(sem_t *sem) +{ + if (sem->sem_magic != SEMA_MAGIC) { + errno = EINVAL; + return (-1); + } + return (0); +} + +int +_sem_init(sem_t *sem, int pshared, uint_t value) +{ + int error; + + if ((error = sema_init((sema_t *)sem, value, + pshared ? USYNC_PROCESS : USYNC_THREAD, NULL)) != 0) { + errno = error; + return (-1); + } + return (0); +} + +int +_sem_destroy(sem_t *sem) +{ + int error; + + if (sem_invalid(sem)) + return (-1); + if ((error = sema_destroy((sema_t *)sem)) != 0) { + errno = error; + return (-1); + } + return (0); +} + +int +_sem_post(sem_t *sem) +{ + int error; + + if (sem_invalid(sem)) + return (-1); + if ((error = sema_post((sema_t *)sem)) != 0) { + errno = error; + return (-1); + } + return (0); +} + +int +_sem_wait(sem_t *sem) +{ + int error; + + if (sem_invalid(sem)) + return (-1); + if ((error = sema_wait((sema_t *)sem)) != 0) { + errno = error; + return (-1); + } + return (0); +} + +int +_sem_timedwait(sem_t *sem, const timespec_t *abstime) +{ + int error; + + if (sem_invalid(sem)) + return (-1); + if ((error = sema_timedwait((sema_t *)sem, abstime)) != 0) { + if (error == ETIME) + error = ETIMEDOUT; + errno = error; + return (-1); + } + return (0); +} + +int +_sem_reltimedwait_np(sem_t *sem, const timespec_t *reltime) +{ + int error; + + if (sem_invalid(sem)) + return (-1); + if ((error = sema_reltimedwait((sema_t *)sem, reltime)) != 0) { + if (error == ETIME) + error = ETIMEDOUT; + errno = error; + return (-1); + } + return (0); +} + +int +_sem_trywait(sem_t *sem) +{ + int error; + + if (sem_invalid(sem)) + return (-1); + if ((error = sema_trywait((sema_t *)sem)) != 0) { + if (error == EBUSY) + error = EAGAIN; + errno = error; + return (-1); + } + return (0); +} + +int +_sem_getvalue(sem_t *sem, int *sval) +{ + if (sem_invalid(sem)) + return (-1); + *sval = (int)sem->sem_count; + return (0); +} diff --git a/usr/src/lib/libc/port/rt/shm.c b/usr/src/lib/libc/port/rt/shm.c new file mode 100644 index 0000000000..53c59d9424 --- /dev/null +++ b/usr/src/lib/libc/port/rt/shm.c @@ -0,0 +1,95 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include "synonyms.h"
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <errno.h>
+#include "pos4obj.h"
+
+int
+shm_open(const char *path, int oflag, mode_t mode)
+{
+	int crflag;
+	int fd;
+	int flags;
+
+	if (__pos4obj_check(path) == -1)
+		return (-1);
+
+	/* acquire the shared memory lock to make the operations atomic */
+	if (__pos4obj_lock(path, SHM_LOCK_TYPE) < 0)
+		return (-1);
+
+	fd = __pos4obj_open(path, SHM_DATA_TYPE, oflag, mode, &crflag);
+
+	if (fd < 0) {
+		(void) __pos4obj_unlock(path, SHM_LOCK_TYPE);
+		return (-1);
+	}
+
+	if ((flags = fcntl(fd, F_GETFD)) < 0 ||
+	    fcntl(fd, F_SETFD, flags | FD_CLOEXEC) < 0) {
+		(void) __pos4obj_unlock(path, SHM_LOCK_TYPE);
+		(void) __close_nc(fd);
+		return (-1);
+	}
+
+	/* release the shared memory lock */
+	if (__pos4obj_unlock(path, SHM_LOCK_TYPE) < 0) {
+		(void) __close_nc(fd);
+		return (-1);
+	}
+
+	return (fd);
+}
+
+int
+shm_unlink(const char *path)
+{
+	int oerrno;
+	int err;
+
+	if (__pos4obj_check(path) < 0)
+		return (-1);
+
+	if (__pos4obj_lock(path, SHM_LOCK_TYPE) < 0)
+		return (-1);
+
+	err = __pos4obj_unlink(path, SHM_DATA_TYPE);
+
+	oerrno = errno;
+
+	(void) __pos4obj_unlock(path, SHM_LOCK_TYPE);
+
+	errno = oerrno;
+	return (err);
+}
diff --git a/usr/src/lib/libc/port/rt/sigev_thread.c b/usr/src/lib/libc/port/rt/sigev_thread.c
new file mode 100644
index 0000000000..0ab6eaccdf
--- /dev/null
+++ b/usr/src/lib/libc/port/rt/sigev_thread.c
@@ -0,0 +1,715 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include "synonyms.h"
+#include "thr_uberdata.h"
+#include <sys/types.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <thread.h>
+#include <synch.h>
+#include <port.h>
+#include <signal.h>
+#include <stdio.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <string.h>
+#include <sys/aiocb.h>
+#include <time.h>
+#include <fcntl.h>
+#include "sigev_thread.h"
+
+/*
+ * There is but one spawner for all aio operations.
+ */
+thread_communication_data_t *sigev_aio_tcd = NULL;
+
+/*
+ * Set non-zero via _RT_DEBUG to enable debugging printf's.
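+ * For example (hypothetical program name), running a client as
+ *	$ _RT_DEBUG=1 ./server
+ * makes the spawner threads report port_get() failures on stderr.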
+ */
+static int _rt_debug = 0;
+
+void
+init_sigev_thread(void)
+{
+	char *ldebug;
+
+	if ((ldebug = getenv("_RT_DEBUG")) != NULL)
+		_rt_debug = atoi(ldebug);
+}
+
+/*
+ * Routine to print debug messages:
+ * If _rt_debug is set, printf the debug message to stderr
+ * with an appropriate prefix.
+ */
+/*PRINTFLIKE1*/
+static void
+dprintf(const char *format, ...)
+{
+	if (_rt_debug) {
+		va_list alist;
+
+		va_start(alist, format);
+		flockfile(stderr);
+		(void) fputs("DEBUG: ", stderr);
+		(void) vfprintf(stderr, format, alist);
+		funlockfile(stderr);
+		va_end(alist);
+	}
+}
+
+/*
+ * The notify_thread() function can be used as the start function of a new
+ * thread but it is normally called from notifier(), below, in the context
+ * of a thread pool worker thread.  It is used as the start function of a
+ * new thread only when individual pthread attributes differ from those
+ * that are common to all workers.  This only occurs in the AIO case.
+ */
+static void *
+notify_thread(void *arg)
+{
+	sigev_thread_data_t *stdp = arg;
+	void (*function)(union sigval) = stdp->std_func;
+	union sigval argument = stdp->std_arg;
+
+	lfree(stdp, sizeof (*stdp));
+	function(argument);
+	return (NULL);
+}
+
+/*
+ * Thread pool interface to call the user-supplied notification function.
+ */
+static void
+notifier(void *arg)
+{
+	(void) notify_thread(arg);
+}
+
+/*
+ * This routine adds a new work request, described by function
+ * and argument, to the list of outstanding jobs.
+ * It returns 0 on success; a nonzero return value indicates an error.
+ */
+static int
+sigev_add_work(thread_communication_data_t *tcdp,
+	void (*function)(union sigval), union sigval argument)
+{
+	tpool_t *tpool = tcdp->tcd_poolp;
+	sigev_thread_data_t *stdp;
+
+	if (tpool == NULL)
+		return (EINVAL);
+	if ((stdp = lmalloc(sizeof (*stdp))) == NULL)
+		return (errno);
+	stdp->std_func = function;
+	stdp->std_arg = argument;
+	if (tpool_dispatch(tpool, notifier, stdp) != 0) {
+		lfree(stdp, sizeof (*stdp));
+		return (errno);
+	}
+	return (0);
+}
+
+static void
+sigev_destroy_pool(thread_communication_data_t *tcdp)
+{
+	if (tcdp->tcd_poolp != NULL)
+		tpool_abandon(tcdp->tcd_poolp);
+	tcdp->tcd_poolp = NULL;
+
+	if (tcdp->tcd_subsystem == MQ) {
+		/*
+		 * synchronize with del_sigev_mq()
+		 */
+		sig_mutex_lock(&tcdp->tcd_lock);
+		tcdp->tcd_server_id = 0;
+		if (tcdp->tcd_msg_closing) {
+			(void) cond_broadcast(&tcdp->tcd_cv);
+			sig_mutex_unlock(&tcdp->tcd_lock);
+			return;		/* del_sigev_mq() will free the tcd */
+		}
+		sig_mutex_unlock(&tcdp->tcd_lock);
+	}
+
+	/*
+	 * now delete everything
+	 */
+	free_sigev_handler(tcdp);
+}
+
+/*
+ * timer_spawner(), mqueue_spawner(), and aio_spawner() are the main
+ * functions for the daemon threads that get the event(s) for the
+ * respective SIGEV_THREAD subsystems.  There is one timer spawner for
+ * each timer_create(), one mqueue spawner for every mq_open(), and
+ * exactly one aio spawner for all aio requests.  These spawners add
+ * work requests to be done by a pool of daemon worker threads.  In case
+ * the event requires creation of a worker thread with different pthread
+ * attributes than those from the pool of workers, a new daemon thread
+ * with these attributes is spawned apart from the pool of workers.
+ * If the spawner fails to add work or fails to create an additional
+ * thread for lack of resources, it puts the event back into
+ * the kernel queue and retries some time later.
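+ * (For the aio case this retry is visible in aio_spawner() below:
+ * on EAGAIN or ENOMEM the event is pushed back with _port_dispatch()
+ * and the spawner sleeps briefly before calling port_get() again.)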
+ */ + +void * +timer_spawner(void *arg) +{ + thread_communication_data_t *tcdp = (thread_communication_data_t *)arg; + port_event_t port_event; + + /* destroy the pool if we are cancelled */ + pthread_cleanup_push(sigev_destroy_pool, tcdp); + + for (;;) { + if (port_get(tcdp->tcd_port, &port_event, NULL) != 0) { + dprintf("port_get on port %d failed with %d <%s>\n", + tcdp->tcd_port, errno, strerror(errno)); + break; + } + switch (port_event.portev_source) { + case PORT_SOURCE_TIMER: + break; + case PORT_SOURCE_ALERT: + if (port_event.portev_events != SIGEV_THREAD_TERM) + errno = EPROTO; + goto out; + default: + dprintf("port_get on port %d returned %u " + "(not PORT_SOURCE_TIMER)\n", + tcdp->tcd_port, port_event.portev_source); + errno = EPROTO; + goto out; + } + + tcdp->tcd_overruns = port_event.portev_events - 1; + if (sigev_add_work(tcdp, + tcdp->tcd_notif.sigev_notify_function, + tcdp->tcd_notif.sigev_value) != 0) + break; + /* wait until job is done before looking for another */ + tpool_wait(tcdp->tcd_poolp); + } +out: + pthread_cleanup_pop(1); + return (NULL); +} + +void * +mqueue_spawner(void *arg) +{ + thread_communication_data_t *tcdp = (thread_communication_data_t *)arg; + int ret = 0; + int ntype; + void (*function)(union sigval); + union sigval argument; + + /* destroy the pool if we are cancelled */ + pthread_cleanup_push(sigev_destroy_pool, tcdp); + + while (ret == 0) { + sig_mutex_lock(&tcdp->tcd_lock); + pthread_cleanup_push(sig_mutex_unlock, &tcdp->tcd_lock); + while ((ntype = tcdp->tcd_msg_enabled) == 0) + (void) sig_cond_wait(&tcdp->tcd_cv, &tcdp->tcd_lock); + pthread_cleanup_pop(1); + + while (sem_wait(tcdp->tcd_msg_avail) == -1) + continue; + + sig_mutex_lock(&tcdp->tcd_lock); + tcdp->tcd_msg_enabled = 0; + sig_mutex_unlock(&tcdp->tcd_lock); + + /* ASSERT(ntype == SIGEV_THREAD || ntype == SIGEV_PORT); */ + if (ntype == SIGEV_THREAD) { + function = tcdp->tcd_notif.sigev_notify_function; + argument.sival_ptr = tcdp->tcd_msg_userval; + ret = sigev_add_work(tcdp, function, argument); + } else { /* ntype == SIGEV_PORT */ + ret = _port_dispatch(tcdp->tcd_port, 0, PORT_SOURCE_MQ, + 0, (uintptr_t)tcdp->tcd_msg_object, + tcdp->tcd_msg_userval); + } + } + sig_mutex_unlock(&tcdp->tcd_lock); + + pthread_cleanup_pop(1); + return (NULL); +} + +void * +aio_spawner(void *arg) +{ + thread_communication_data_t *tcdp = (thread_communication_data_t *)arg; + int error = 0; + void (*function)(union sigval); + union sigval argument; + port_event_t port_event; + struct sigevent *sigevp; + timespec_t delta; + pthread_attr_t *attrp; + + /* destroy the pool if we are cancelled */ + pthread_cleanup_push(sigev_destroy_pool, tcdp); + + while (error == 0) { + if (port_get(tcdp->tcd_port, &port_event, NULL) != 0) { + error = errno; + dprintf("port_get on port %d failed with %d <%s>\n", + tcdp->tcd_port, error, strerror(error)); + break; + } + switch (port_event.portev_source) { + case PORT_SOURCE_AIO: + break; + case PORT_SOURCE_ALERT: + if (port_event.portev_events != SIGEV_THREAD_TERM) + errno = EPROTO; + goto out; + default: + dprintf("port_get on port %d returned %u " + "(not PORT_SOURCE_AIO)\n", + tcdp->tcd_port, port_event.portev_source); + errno = EPROTO; + goto out; + } + argument.sival_ptr = port_event.portev_user; + switch (port_event.portev_events) { + case AIOLIO: +#if !defined(_LP64) + case AIOLIO64: +#endif + sigevp = (struct sigevent *)port_event.portev_object; + function = sigevp->sigev_notify_function; + attrp = sigevp->sigev_notify_attributes; + break; + case AIOAREAD: + case 
AIOAWRITE: + case AIOFSYNC: + { + aiocb_t *aiocbp = + (aiocb_t *)port_event.portev_object; + function = aiocbp->aio_sigevent.sigev_notify_function; + attrp = aiocbp->aio_sigevent.sigev_notify_attributes; + break; + } +#if !defined(_LP64) + case AIOAREAD64: + case AIOAWRITE64: + case AIOFSYNC64: + { + aiocb64_t *aiocbp = + (aiocb64_t *)port_event.portev_object; + function = aiocbp->aio_sigevent.sigev_notify_function; + attrp = aiocbp->aio_sigevent.sigev_notify_attributes; + break; + } +#endif + default: + function = NULL; + attrp = NULL; + break; + } + + if (function == NULL) + error = EINVAL; + else if (_pthread_attr_equal(attrp, tcdp->tcd_attrp)) + error = sigev_add_work(tcdp, function, argument); + else { + /* + * The attributes don't match. + * Spawn a thread with the non-matching attributes. + */ + pthread_attr_t local_attr; + sigev_thread_data_t *stdp; + + if ((stdp = lmalloc(sizeof (*stdp))) == NULL) + error = ENOMEM; + else + error = _pthread_attr_clone(&local_attr, attrp); + + if (error == 0) { + (void) pthread_attr_setdetachstate( + &local_attr, PTHREAD_CREATE_DETACHED); + (void) _pthread_attr_setdaemonstate_np( + &local_attr, PTHREAD_CREATE_DAEMON_NP); + stdp->std_func = function; + stdp->std_arg = argument; + error = pthread_create(NULL, &local_attr, + notify_thread, stdp); + (void) pthread_attr_destroy(&local_attr); + } + if (error && stdp != NULL) + lfree(stdp, sizeof (*stdp)); + } + + if (error) { + dprintf("Cannot add work, error=%d <%s>.\n", + error, strerror(error)); + if (error == EAGAIN || error == ENOMEM) { + /* (Temporary) no resources are available. */ + if (_port_dispatch(tcdp->tcd_port, 0, + PORT_SOURCE_AIO, port_event.portev_events, + port_event.portev_object, + port_event.portev_user) != 0) + break; + error = 0; + delta.tv_sec = 0; + delta.tv_nsec = NANOSEC / 20; /* 50 msec */ + (void) nanosleep(&delta, NULL); + } + } + } +out: + pthread_cleanup_pop(1); + return (NULL); +} + +/* + * Allocate a thread_communication_data_t block. + */ +static thread_communication_data_t * +alloc_sigev_handler(subsystem_t caller) +{ + thread_communication_data_t *tcdp; + + if ((tcdp = lmalloc(sizeof (*tcdp))) != NULL) { + tcdp->tcd_subsystem = caller; + tcdp->tcd_port = -1; + (void) mutex_init(&tcdp->tcd_lock, USYNC_THREAD, NULL); + (void) cond_init(&tcdp->tcd_cv, USYNC_THREAD, NULL); + } + return (tcdp); +} + +/* + * Free a thread_communication_data_t block. + */ +void +free_sigev_handler(thread_communication_data_t *tcdp) +{ + if (tcdp->tcd_attrp) { + (void) pthread_attr_destroy(tcdp->tcd_attrp); + tcdp->tcd_attrp = NULL; + } + (void) memset(&tcdp->tcd_notif, 0, sizeof (tcdp->tcd_notif)); + + switch (tcdp->tcd_subsystem) { + case TIMER: + case AIO: + if (tcdp->tcd_port >= 0) + (void) close(tcdp->tcd_port); + break; + case MQ: + tcdp->tcd_msg_avail = NULL; + tcdp->tcd_msg_object = NULL; + tcdp->tcd_msg_userval = NULL; + tcdp->tcd_msg_enabled = 0; + break; + } + + lfree(tcdp, sizeof (*tcdp)); +} + +/* + * Initialize data structure and create the port. + */ +thread_communication_data_t * +setup_sigev_handler(const struct sigevent *sigevp, subsystem_t caller) +{ + thread_communication_data_t *tcdp; + int error; + + if (sigevp == NULL) { + errno = EINVAL; + return (NULL); + } + + if ((tcdp = alloc_sigev_handler(caller)) == NULL) { + errno = ENOMEM; + return (NULL); + } + + if (sigevp->sigev_notify_attributes == NULL) + tcdp->tcd_attrp = NULL; /* default attributes */ + else { + /* + * We cannot just copy the sigevp->sigev_notify_attributes + * pointer. 
We need to initialize a new pthread_attr_t + * structure with the values from the user-supplied + * pthread_attr_t. + */ + tcdp->tcd_attrp = &tcdp->tcd_user_attr; + error = _pthread_attr_clone(tcdp->tcd_attrp, + sigevp->sigev_notify_attributes); + if (error) { + tcdp->tcd_attrp = NULL; + free_sigev_handler(tcdp); + errno = error; + return (NULL); + } + } + tcdp->tcd_notif = *sigevp; + tcdp->tcd_notif.sigev_notify_attributes = tcdp->tcd_attrp; + + if (caller == TIMER || caller == AIO) { + if ((tcdp->tcd_port = port_create()) < 0 || + fcntl(tcdp->tcd_port, FD_CLOEXEC) == -1) { + free_sigev_handler(tcdp); + errno = EBADF; + return (NULL); + } + } + return (tcdp); +} + +/* + * Create a thread pool and launch the spawner. + */ +int +launch_spawner(thread_communication_data_t *tcdp) +{ + int ret; + int maxworkers; + void *(*spawner)(void *); + sigset_t set; + sigset_t oset; + + switch (tcdp->tcd_subsystem) { + case TIMER: + spawner = timer_spawner; + maxworkers = 1; + break; + case MQ: + spawner = mqueue_spawner; + maxworkers = 1; + break; + case AIO: + spawner = aio_spawner; + maxworkers = 100; + break; + default: + return (-1); + } + tcdp->tcd_poolp = tpool_create(1, maxworkers, 20, + tcdp->tcd_notif.sigev_notify_attributes); + if (tcdp->tcd_poolp == NULL) + return (-1); + /* create the spawner with all signals blocked */ + (void) sigfillset(&set); + (void) thr_sigsetmask(SIG_SETMASK, &set, &oset); + ret = thr_create(NULL, 0, spawner, tcdp, + THR_DETACHED | THR_DAEMON, &tcdp->tcd_server_id); + (void) thr_sigsetmask(SIG_SETMASK, &oset, NULL); + if (ret != 0) { + tpool_destroy(tcdp->tcd_poolp); + tcdp->tcd_poolp = NULL; + return (-1); + } + return (0); +} + +/* + * Delete the data associated with the sigev_thread timer, if timer is + * associated with such a notification option. + * Destroy the timer_spawner thread. + */ +int +del_sigev_timer(timer_t timer) +{ + int rc = 0; + thread_communication_data_t *tcdp; + + if ((uint_t)timer < timer_max && (tcdp = timer_tcd[timer]) != NULL) { + sig_mutex_lock(&tcdp->tcd_lock); + if (tcdp->tcd_port >= 0) { + if ((rc = port_alert(tcdp->tcd_port, + PORT_ALERT_SET, SIGEV_THREAD_TERM, NULL)) == 0) { + dprintf("del_sigev_timer(%d) OK.\n", timer); + } + } + timer_tcd[timer] = NULL; + sig_mutex_unlock(&tcdp->tcd_lock); + } + return (rc); +} + +int +sigev_timer_getoverrun(timer_t timer) +{ + thread_communication_data_t *tcdp; + + if ((uint_t)timer < timer_max && (tcdp = timer_tcd[timer]) != NULL) + return (tcdp->tcd_overruns); + return (0); +} + +static void +del_sigev_mq_cleanup(thread_communication_data_t *tcdp) +{ + sig_mutex_unlock(&tcdp->tcd_lock); + free_sigev_handler(tcdp); +} + +/* + * Delete the data associated with the sigev_thread message queue, + * if the message queue is associated with such a notification option. + * Destroy the mqueue_spawner thread. 
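+ * The tcd_msg_closing flag tells sigev_destroy_pool(), run by the
+ * cancelled spawner, to leave the tcd intact; we then wait on tcd_cv
+ * until tcd_server_id is cleared and free the tcd ourselves.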
+ */
+void
+del_sigev_mq(thread_communication_data_t *tcdp)
+{
+	pthread_t server_id;
+	int rc;
+
+	sig_mutex_lock(&tcdp->tcd_lock);
+
+	server_id = tcdp->tcd_server_id;
+	tcdp->tcd_msg_closing = 1;
+	if ((rc = pthread_cancel(server_id)) != 0) {	/* "can't happen" */
+		sig_mutex_unlock(&tcdp->tcd_lock);
+		dprintf("Failed to cancel %u with error %d <%s>.\n",
+		    server_id, rc, strerror(rc));
+		return;
+	}
+
+	/*
+	 * wait for sigev_destroy_pool() to finish
+	 */
+	pthread_cleanup_push(del_sigev_mq_cleanup, tcdp);
+	while (tcdp->tcd_server_id == server_id)
+		(void) sig_cond_wait(&tcdp->tcd_cv, &tcdp->tcd_lock);
+	pthread_cleanup_pop(1);
+}
+
+/*
+ * POSIX aio:
+ * If the notification type is SIGEV_THREAD, set up
+ * the port number for notifications.  Create the
+ * thread pool and launch the spawner if necessary.
+ * If the notification type is not SIGEV_THREAD, do nothing.
+ */
+int
+_aio_sigev_thread_init(struct sigevent *sigevp)
+{
+	static mutex_t sigev_aio_lock = DEFAULTMUTEX;
+	static cond_t sigev_aio_cv = DEFAULTCV;
+	static int sigev_aio_busy = 0;
+
+	thread_communication_data_t *tcdp;
+	int port;
+	int rc = 0;
+
+	if (sigevp == NULL ||
+	    sigevp->sigev_notify != SIGEV_THREAD ||
+	    sigevp->sigev_notify_function == NULL)
+		return (0);
+
+	lmutex_lock(&sigev_aio_lock);
+	while (sigev_aio_busy)
+		(void) _cond_wait(&sigev_aio_cv, &sigev_aio_lock);
+	if ((tcdp = sigev_aio_tcd) != NULL)
+		port = tcdp->tcd_port;
+	else {
+		sigev_aio_busy = 1;
+		lmutex_unlock(&sigev_aio_lock);
+
+		tcdp = setup_sigev_handler(sigevp, AIO);
+		if (tcdp == NULL) {
+			port = -1;
+			rc = -1;
+		} else if (launch_spawner(tcdp) != 0) {
+			free_sigev_handler(tcdp);
+			tcdp = NULL;
+			port = -1;
+			rc = -1;
+		} else {
+			port = tcdp->tcd_port;
+		}
+
+		lmutex_lock(&sigev_aio_lock);
+		sigev_aio_tcd = tcdp;
+		sigev_aio_busy = 0;
+		(void) cond_broadcast(&sigev_aio_cv);
+	}
+	lmutex_unlock(&sigev_aio_lock);
+	sigevp->sigev_signo = port;
+	return (rc);
+}
+
+int
+_aio_sigev_thread(aiocb_t *aiocbp)
+{
+	if (aiocbp == NULL)
+		return (0);
+	return (_aio_sigev_thread_init(&aiocbp->aio_sigevent));
+}
+
+#if !defined(_LP64)
+int
+_aio_sigev_thread64(aiocb64_t *aiocbp)
+{
+	if (aiocbp == NULL)
+		return (0);
+	return (_aio_sigev_thread_init(&aiocbp->aio_sigevent));
+}
+#endif
+
+/*
+ * Clean up POSIX aio after fork1() in the child process.
+ */
+void
+postfork1_child_sigev_aio(void)
+{
+	thread_communication_data_t *tcdp;
+
+	if ((tcdp = sigev_aio_tcd) != NULL) {
+		sigev_aio_tcd = NULL;
+		tcd_teardown(tcdp);
+	}
+}
+
+/*
+ * Utility function for the various postfork1_child_sigev_*() functions.
+ * Clean up the tcdp data structure and close the port.
+ */
+void
+tcd_teardown(thread_communication_data_t *tcdp)
+{
+	if (tcdp->tcd_poolp != NULL)
+		tpool_abandon(tcdp->tcd_poolp);
+	tcdp->tcd_poolp = NULL;
+	tcdp->tcd_server_id = 0;
+	free_sigev_handler(tcdp);
+}
diff --git a/usr/src/lib/libc/port/rt/sigev_thread.h b/usr/src/lib/libc/port/rt/sigev_thread.h
new file mode 100644
index 0000000000..943cb8ab23
--- /dev/null
+++ b/usr/src/lib/libc/port/rt/sigev_thread.h
@@ -0,0 +1,117 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SIGEV_THREAD_H +#define _SIGEV_THREAD_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include <signal.h> +#include <port.h> +#include <mqueue.h> +#include <time.h> +#include <limits.h> +#include <semaphore.h> +#include <thread_pool.h> + +#define SIGEV_THREAD_TERM 1 + +typedef enum {TIMER = 1, MQ, AIO} subsystem_t; /* Calling sub-system */ + +typedef struct { + void (*std_func)(union sigval); /* User-defined notification function */ + union sigval std_arg; /* Parameter of user-defined notification fct */ +} sigev_thread_data_t; + +typedef struct thread_communication_data { + struct thread_communication_data *tcd_next; + struct sigevent tcd_notif; /* encapsulates usr fct and usr vals */ + pthread_attr_t tcd_user_attr; /* copy of caller's attributes */ + pthread_attr_t *tcd_attrp; /* NULL if caller passed NULL */ + int tcd_port; /* port this spawner is controlling */ + thread_t tcd_server_id; /* thread id of server thread */ + subsystem_t tcd_subsystem; /* event generating subsystem */ + tpool_t *tcd_poolp; /* worker thread pool */ + /* for creation/termination synchronization protocol */ + mutex_t tcd_lock; + cond_t tcd_cv; + /* subsystem-specific data */ + union { + struct { + int overruns; /* number of overruns */ + } timer; + struct { + int msg_enabled; /* notification enabled */ + int msg_closing; /* mq_close() is waiting */ + sem_t *msg_avail; /* wait for message available */ + void *msg_object; /* mqd_t */ + void *msg_userval; /* notification user value */ + } mqueue; + } tcd_object; +} thread_communication_data_t; + +#define tcd_overruns tcd_object.timer.overruns + +#define tcd_msg_enabled tcd_object.mqueue.msg_enabled +#define tcd_msg_closing tcd_object.mqueue.msg_closing +#define tcd_msg_avail tcd_object.mqueue.msg_avail +#define tcd_msg_object tcd_object.mqueue.msg_object +#define tcd_msg_userval tcd_object.mqueue.msg_userval + +/* Generic functions common to all entities */ +extern thread_communication_data_t *setup_sigev_handler( + const struct sigevent *, subsystem_t); +extern void free_sigev_handler(thread_communication_data_t *); +extern int launch_spawner(thread_communication_data_t *); +extern void tcd_teardown(thread_communication_data_t *); + +/* Additional functions for different entities */ +extern void *timer_spawner(void *); +extern int del_sigev_timer(timer_t); +extern int sigev_timer_getoverrun(timer_t); +extern void *mqueue_spawner(void *); +extern void del_sigev_mq(thread_communication_data_t *); +extern void *aio_spawner(void *); + +/* Private interfaces elsewhere in libc */ +extern int _pthread_attr_clone(pthread_attr_t *, const pthread_attr_t *); +extern int _pthread_attr_equal(const pthread_attr_t *, const pthread_attr_t *); +extern int _port_dispatch(int, int, int, int, uintptr_t, void *); + +extern thread_communication_data_t *sigev_aio_tcd; + +extern int timer_max; +extern thread_communication_data_t **timer_tcd; + +#ifdef __cplusplus +} +#endif + +#endif /* _SIGEV_THREAD_H */ diff --git a/usr/src/lib/libc/port/sys/fsync.c 
b/usr/src/lib/libc/port/sys/fsync.c index d6827f60f3..f727d5914f 100644 --- a/usr/src/lib/libc/port/sys/fsync.c +++ b/usr/src/lib/libc/port/sys/fsync.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,8 +18,9 @@ * * CDDL HEADER END */ + /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -31,15 +31,20 @@ /* * fsync(int fd) - * + * fdatasync(int fd) */ #include "synonyms.h" -#include <sys/types.h> #include "libc.h" -#include "sys/file.h" +#include <sys/file.h> int _fsync(int fd) { return (__fdsync(fd, FSYNC)); } + +int +fdatasync(int fd) +{ + return (__fdsync(fd, FDSYNC)); +} diff --git a/usr/src/lib/libc/port/sys/sigstack.c b/usr/src/lib/libc/port/sys/sigstack.c index cf4335f2a2..9f34b2386a 100644 --- a/usr/src/lib/libc/port/sys/sigstack.c +++ b/usr/src/lib/libc/port/sys/sigstack.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -30,6 +29,8 @@ #pragma ident "%Z%%M% %I% %E% SMI" +#pragma weak sigstack = _sigstack + #include "synonyms.h" #include <sys/types.h> #include <sys/ucontext.h> diff --git a/usr/src/lib/libc/port/threads/assfail.c b/usr/src/lib/libc/port/threads/assfail.c index e64aaa87a5..989a36923a 100644 --- a/usr/src/lib/libc/port/threads/assfail.c +++ b/usr/src/lib/libc/port/threads/assfail.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,8 +18,9 @@ * * CDDL HEADER END */ + /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -91,8 +91,8 @@ Abort(const char *msg) * Write a panic message w/o grabbing any locks other than assert_lock. * We have no idea what locks are held at this point. 
*/ -void -thr_panic(const char *why) +static void +common_panic(const char *head, const char *why) { char msg[400]; /* no panic() message in the library is this long */ ulwp_t *self; @@ -103,7 +103,7 @@ thr_panic(const char *why) (void) _private_lwp_mutex_lock(&assert_lock); (void) _private_memset(msg, 0, sizeof (msg)); - (void) strcpy(msg, "*** libc thread failure: "); + (void) strcpy(msg, head); len1 = strlen(msg); len2 = strlen(why); if (len1 + len2 >= sizeof (msg)) @@ -116,6 +116,18 @@ thr_panic(const char *why) Abort(msg); } +void +thr_panic(const char *why) +{ + common_panic("*** libc thread failure: ", why); +} + +void +aio_panic(const char *why) +{ + common_panic("*** libc aio system failure: ", why); +} + /* * Utility function for converting a long integer to a string, avoiding stdio. * 'base' must be one of 10 or 16 @@ -370,7 +382,8 @@ thread_error(const char *msg) * We use __assfail() because the libc __assert() calls * gettext() which calls malloc() which grabs a mutex. * We do everything without calling standard i/o. - * _assfail() is an exported function, __assfail() is private to libc. + * assfail() and _assfail() are exported functions; + * __assfail() is private to libc. */ #pragma weak _assfail = __assfail void @@ -416,3 +429,17 @@ __assfail(const char *assertion, const char *filename, int line_num) */ Abort(buf); } + +/* + * We define and export this version of assfail() just because libaio + * used to define and export it, needlessly. Now that libaio is folded + * into libc, we need to continue this for ABI/version reasons. + * We don't use "#pragma weak assfail __assfail" in order to avoid + * warnings from the check_fnames utility at build time for libraries + * that define their own version of assfail(). + */ +void +assfail(const char *assertion, const char *filename, int line_num) +{ + __assfail(assertion, filename, line_num); +} diff --git a/usr/src/lib/libc/port/threads/pthr_attr.c b/usr/src/lib/libc/port/threads/pthr_attr.c index 865c573dd0..bcae664e13 100644 --- a/usr/src/lib/libc/port/threads/pthr_attr.c +++ b/usr/src/lib/libc/port/threads/pthr_attr.c @@ -88,7 +88,6 @@ _pthread_attr_destroy(pthread_attr_t *attr) /* * _pthread_attr_clone: make a copy of a pthread_attr_t. - * This is a consolidation-private interface, for librt. */ int _pthread_attr_clone(pthread_attr_t *attr, const pthread_attr_t *old_attr) @@ -231,7 +230,7 @@ _pthread_attr_getdetachstate(const pthread_attr_t *attr, int *detachstate) /* * pthread_attr_setdaemonstate_np: sets the daemon state to DAEMON or NONDAEMON. * PTHREAD_CREATE_DAEMON is equivalent to thr_create(THR_DAEMON). - * For now, this is a consolidation-private interface for librt. + * For now, this is a private interface in libc. */ int _pthread_attr_setdaemonstate_np(pthread_attr_t *attr, int daemonstate) @@ -249,7 +248,7 @@ _pthread_attr_setdaemonstate_np(pthread_attr_t *attr, int daemonstate) /* * pthread_attr_getdaemonstate_np: gets the daemon state. - * For now, this is a consolidation-private interface for librt. + * For now, this is a private interface in libc. 
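+ * (The daemon-state attribute is consumed by the thread pool code in
+ * port/tpool/thread_pool.c, which marks its worker threads as daemon
+ * threads so that they do not keep the process alive.)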
*/ int _pthread_attr_getdaemonstate_np(const pthread_attr_t *attr, int *daemonstate) diff --git a/usr/src/lib/libc/port/threads/pthread.c b/usr/src/lib/libc/port/threads/pthread.c index 5838a5aff7..2215647391 100644 --- a/usr/src/lib/libc/port/threads/pthread.c +++ b/usr/src/lib/libc/port/threads/pthread.c @@ -84,7 +84,7 @@ _pthread_create(pthread_t *thread, const pthread_attr_t *attr, return (EINVAL); mapped = 1; mappedpri = priority; - priority = _map_rtpri_to_gp(priority); + priority = map_rtpri_to_gp(priority); ASSERT(priority >= THREAD_MIN_PRIORITY && priority <= THREAD_MAX_PRIORITY); } @@ -236,7 +236,7 @@ _thread_setschedparam_main(pthread_t tid, int policy, } mapped = 1; mappedprio = prio; - prio = _map_rtpri_to_gp(prio); + prio = map_rtpri_to_gp(prio); ASSERT(prio >= THREAD_MIN_PRIORITY && prio <= THREAD_MAX_PRIORITY); } diff --git a/usr/src/lib/libc/port/threads/rtsched.c b/usr/src/lib/libc/port/threads/rtsched.c index 60d3357655..a85118dc5c 100644 --- a/usr/src/lib/libc/port/threads/rtsched.c +++ b/usr/src/lib/libc/port/threads/rtsched.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,8 +18,9 @@ * * CDDL HEADER END */ + /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -39,9 +39,6 @@ * The following variables are used for caching information * for priocntl TS and RT scheduling classs. */ -struct pcclass ts_class, rt_class; - -static rtdpent_t *rt_dptbl; /* RT class parameter table */ static int rt_rrmin; static int rt_rrmax; static int rt_fifomin; @@ -50,87 +47,6 @@ static int rt_othermin; static int rt_othermax; /* - * Get the RT class parameter table - */ -static void -_get_rt_dptbl() -{ - struct pcclass *pccp; - pcadmin_t pcadmin; - rtadmin_t rtadmin; - size_t rtdpsize; - - pccp = &ts_class; - /* get class's info */ - (void) strcpy(pccp->pcc_info.pc_clname, "TS"); - if (priocntl(P_PID, 0, PC_GETCID, (caddr_t)&(pccp->pcc_info)) < 0) - goto out; - - pccp = &rt_class; - /* get class's info */ - (void) strcpy(pccp->pcc_info.pc_clname, "RT"); - if (priocntl(P_PID, 0, PC_GETCID, (caddr_t)&(pccp->pcc_info)) < 0) - goto out; - - /* get RT class dispatch table in rt_dptbl */ - pcadmin.pc_cid = rt_class.pcc_info.pc_cid; - pcadmin.pc_cladmin = (caddr_t)&rtadmin; - rtadmin.rt_cmd = RT_GETDPSIZE; - if (priocntl(P_PID, 0, PC_ADMIN, (caddr_t)&pcadmin) < 0) - goto out; - rtdpsize = rtadmin.rt_ndpents * sizeof (rtdpent_t); - if (rt_dptbl == NULL && (rt_dptbl = lmalloc(rtdpsize)) == NULL) - goto out; - rtadmin.rt_dpents = rt_dptbl; - rtadmin.rt_cmd = RT_GETDPTBL; - if (priocntl(P_PID, 0, PC_ADMIN, (caddr_t)&pcadmin) < 0) - goto out; - pccp->pcc_primin = 0; - pccp->pcc_primax = ((rtinfo_t *)rt_class.pcc_info.pc_clinfo)->rt_maxpri; - return; -out: - thr_panic("get_rt_dptbl failed"); -} - -/* - * Translate RT class's user priority to global scheduling priority. - * This is for priorities coming from librt. 
- */ -pri_t -_map_rtpri_to_gp(pri_t pri) -{ - static mutex_t map_lock = DEFAULTMUTEX; - static int mapped = 0; - rtdpent_t *rtdp; - pri_t gpri; - - if (!mapped) { - lmutex_lock(&map_lock); - if (!mapped) { /* do this only once */ - _get_rt_dptbl(); - mapped = 1; - } - lmutex_unlock(&map_lock); - } - - /* First case is the default case, other two are seldomly taken */ - if (pri <= rt_dptbl[rt_class.pcc_primin].rt_globpri) { - gpri = pri + rt_dptbl[rt_class.pcc_primin].rt_globpri - - rt_class.pcc_primin; - } else if (pri >= rt_dptbl[rt_class.pcc_primax].rt_globpri) { - gpri = pri + rt_dptbl[rt_class.pcc_primax].rt_globpri - - rt_class.pcc_primax; - } else { - gpri = rt_dptbl[rt_class.pcc_primin].rt_globpri + 1; - for (rtdp = rt_dptbl+1; rtdp->rt_globpri < pri; ++rtdp, ++gpri) - ; - if (rtdp->rt_globpri > pri) - --gpri; - } - return (gpri); -} - -/* * Set the RT priority/policy of a lwp/thread. */ int @@ -175,30 +91,16 @@ _thrp_setlwpprio(lwpid_t lwpid, int policy, int pri) static void _init_rt_prio_ranges() { - pcinfo_t info; - - (void) strcpy(info.pc_clname, "RT"); - if (priocntl(P_PID, 0, PC_GETCID, (caddr_t)&info) == -1L) - rt_fifomin = rt_rrmin = rt_fifomax = rt_rrmax = 0; - else { - rtinfo_t *rtinfop = (rtinfo_t *)info.pc_clinfo; - rt_fifomin = rt_rrmin = 0; - rt_fifomax = rt_rrmax = rtinfop->rt_maxpri; - } - - (void) strcpy(info.pc_clname, "TS"); - if (priocntl(P_PID, 0, PC_GETCID, (caddr_t)&info) == -1L) - rt_othermin = rt_othermax = 0; - else { - tsinfo_t *tsinfop = (tsinfo_t *)info.pc_clinfo; - pri_t pri = tsinfop->ts_maxupri / 3; - rt_othermin = -pri; - rt_othermax = pri; - } + rt_rrmin = sched_get_priority_min(SCHED_RR); + rt_rrmax = sched_get_priority_max(SCHED_RR); + rt_fifomin = sched_get_priority_min(SCHED_FIFO); + rt_fifomax = sched_get_priority_max(SCHED_FIFO); + rt_othermin = sched_get_priority_min(SCHED_OTHER); + rt_othermax = sched_get_priority_max(SCHED_OTHER); } /* - * Validate priorities from librt. + * Validate priorities. */ int _validate_rt_prio(int policy, int pri) diff --git a/usr/src/lib/libc/port/threads/scalls.c b/usr/src/lib/libc/port/threads/scalls.c index b3287040f1..67a2a6341f 100644 --- a/usr/src/lib/libc/port/threads/scalls.c +++ b/usr/src/lib/libc/port/threads/scalls.c @@ -206,7 +206,7 @@ _fork1(void) self->ul_siginfo.si_signo = 0; udp->pid = _private_getpid(); /* reset the library's data structures to reflect one thread */ - _postfork1_child(); + postfork1_child(); restore_signals(self); _postfork_child_handler(); } else { @@ -375,8 +375,8 @@ _forkall(void) } /* - * Externally-callable cancellation prologue and epilogue - * functions, for cancellation points outside of libc. + * Cancellation prologue and epilogue functions, + * for cancellation points too complex to include here. 
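+ *
+ * A caller brackets its blocking operation, for example (sketch only;
+ * __blocking_call() is a stand-in, not an actual libc function):
+ *
+ *	_cancel_prologue();
+ *	rv = __blocking_call(...);
+ *	_cancel_epilogue();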
*/ void _cancel_prologue(void) @@ -504,13 +504,14 @@ __xpg4_putpmsg(int fd, const struct strbuf *ctlptr, PERFORM(_putpmsg(fd, ctlptr, dataptr, band, flags|MSG_XPG4)) } +#pragma weak nanosleep = _nanosleep int -__nanosleep(const timespec_t *rqtp, timespec_t *rmtp) +_nanosleep(const timespec_t *rqtp, timespec_t *rmtp) { int error; PROLOGUE - error = ___nanosleep(rqtp, rmtp); + error = __nanosleep(rqtp, rmtp); EPILOGUE if (error) { errno = error; @@ -519,8 +520,9 @@ __nanosleep(const timespec_t *rqtp, timespec_t *rmtp) return (0); } +#pragma weak clock_nanosleep = _clock_nanosleep int -__clock_nanosleep(clockid_t clock_id, int flags, +_clock_nanosleep(clockid_t clock_id, int flags, const timespec_t *rqtp, timespec_t *rmtp) { timespec_t reltime; @@ -550,7 +552,7 @@ __clock_nanosleep(clockid_t clock_id, int flags, } restart: PROLOGUE - error = ___nanosleep(&reltime, rmtp); + error = __nanosleep(&reltime, rmtp); EPILOGUE if (error == 0 && clock_id == CLOCK_HIGHRES) { /* @@ -607,7 +609,7 @@ _sleep(unsigned int sec) ts.tv_sec = (time_t)sec; ts.tv_nsec = 0; PROLOGUE - error = ___nanosleep(&ts, &tsr); + error = __nanosleep(&ts, &tsr); EPILOGUE if (error == EINTR) { rem = (unsigned int)tsr.tv_sec; @@ -626,7 +628,7 @@ _usleep(useconds_t usec) ts.tv_sec = usec / MICROSEC; ts.tv_nsec = (long)(usec % MICROSEC) * 1000; PROLOGUE - (void) ___nanosleep(&ts, NULL); + (void) __nanosleep(&ts, NULL); EPILOGUE return (0); } @@ -634,9 +636,11 @@ _usleep(useconds_t usec) int close(int fildes) { + extern void _aio_close(int); extern int _close(int); int rv; + _aio_close(fildes); PERFORM(_close(fildes)) } @@ -856,17 +860,17 @@ _pollsys(struct pollfd *fds, nfds_t nfd, const timespec_t *timeout, return (rv); } +#pragma weak sigtimedwait = _sigtimedwait int -__sigtimedwait(const sigset_t *set, siginfo_t *infop, - const timespec_t *timeout) +_sigtimedwait(const sigset_t *set, siginfo_t *infop, const timespec_t *timeout) { - extern int ___sigtimedwait(const sigset_t *, siginfo_t *, + extern int __sigtimedwait(const sigset_t *, siginfo_t *, const timespec_t *); siginfo_t info; int sig; PROLOGUE - sig = ___sigtimedwait(set, &info, timeout); + sig = __sigtimedwait(set, &info, timeout); if (sig == SIGCANCEL && (SI_FROMKERNEL(&info) || info.si_code == SI_LWP)) { do_sigcancel(); @@ -883,7 +887,23 @@ __sigtimedwait(const sigset_t *set, siginfo_t *infop, int _sigwait(sigset_t *set) { - return (__sigtimedwait(set, NULL, NULL)); + return (_sigtimedwait(set, NULL, NULL)); +} + +#pragma weak sigwaitinfo = _sigwaitinfo +int +_sigwaitinfo(const sigset_t *set, siginfo_t *info) +{ + return (_sigtimedwait(set, info, NULL)); +} + +#pragma weak sigqueue = _sigqueue +int +_sigqueue(pid_t pid, int signo, const union sigval value) +{ + extern int __sigqueue(pid_t pid, int signo, + /* const union sigval */ void *value, int si_code, int block); + return (__sigqueue(pid, signo, value.sival_ptr, SI_QUEUE, 0)); } int diff --git a/usr/src/lib/libc/port/threads/sigaction.c b/usr/src/lib/libc/port/threads/sigaction.c index 670598961f..101b730af3 100644 --- a/usr/src/lib/libc/port/threads/sigaction.c +++ b/usr/src/lib/libc/port/threads/sigaction.c @@ -28,6 +28,7 @@ #include "lint.h" #include "thr_uberdata.h" +#include "asyncio.h" #include <signal.h> #include <siginfo.h> #include <ucontext.h> @@ -154,6 +155,22 @@ call_user_handler(int sig, siginfo_t *sip, ucontext_t *ucp) do_sigcancel(); goto out; } + /* SIGCANCEL is ignored by default */ + if (uact.sa_sigaction == SIG_DFL || + uact.sa_sigaction == SIG_IGN) + goto out; + } + + /* + * If this thread has 
been sent SIGAIOCANCEL (SIGLWP) and + * we are an aio worker thread, cancel the aio request. + */ + if (sig == SIGAIOCANCEL) { + aio_worker_t *aiowp = _pthread_getspecific(_aio_key); + + if (sip != NULL && sip->si_code == SI_LWP && aiowp != NULL) + _siglongjmp(aiowp->work_jmp_buf, 1); + /* SIGLWP is ignored by default */ if (uact.sa_sigaction == SIG_DFL || uact.sa_sigaction == SIG_IGN) goto out; @@ -289,10 +306,9 @@ sigacthandler(int sig, siginfo_t *sip, void *uvp) thr_panic("sigacthandler(): __setcontext() returned"); } -#pragma weak sigaction = _libc_sigaction -#pragma weak _sigaction = _libc_sigaction +#pragma weak sigaction = _sigaction int -_libc_sigaction(int sig, const struct sigaction *nact, struct sigaction *oact) +_sigaction(int sig, const struct sigaction *nact, struct sigaction *oact) { ulwp_t *self = curthread; uberdata_t *udp = self->ul_uberdata; @@ -341,10 +357,11 @@ _libc_sigaction(int sig, const struct sigaction *nact, struct sigaction *oact) if (self->ul_vfork) { if (tact.sa_sigaction != SIG_IGN) tact.sa_sigaction = SIG_DFL; - } else if (sig == SIGCANCEL) { + } else if (sig == SIGCANCEL || sig == SIGAIOCANCEL) { /* - * Always catch SIGCANCEL. - * We need it for pthread_cancel() to work. + * Always catch these signals. + * We need SIGCANCEL for pthread_cancel() to work. + * We need SIGAIOCANCEL for aio_cancel() to work. */ udp->siguaction[sig].sig_uaction = tact; if (tact.sa_sigaction == SIG_DFL || @@ -372,6 +389,16 @@ _libc_sigaction(int sig, const struct sigaction *nact, struct sigaction *oact) oact->sa_sigaction != SIG_IGN) *oact = oaction; + /* + * We detect setting the disposition of SIGIO just to set the + * _sigio_enabled flag for the asynchronous i/o (aio) code. + */ + if (sig == SIGIO && rv == 0 && tactp != NULL) { + _sigio_enabled = + (tactp->sa_handler != SIG_DFL && + tactp->sa_handler != SIG_IGN); + } + if (!self->ul_vfork) lmutex_unlock(&udp->siguaction[sig].sig_lock); return (rv); @@ -619,18 +646,22 @@ do_sigcancel() } /* - * Set up the SIGCANCEL handler for threads cancellation - * (needed only when we have more than one thread). - * We need no locks here because we are called from - * finish_init() while still single-threaded. + * Set up the SIGCANCEL handler for threads cancellation, + * needed only when we have more than one thread, + * or the SIGAIOCANCEL handler for aio cancellation, + * called when aio is initialized, in __uaio_init(). */ void -init_sigcancel() +setup_cancelsig(int sig) { uberdata_t *udp = curthread->ul_uberdata; + mutex_t *mp = &udp->siguaction[sig].sig_lock; struct sigaction act; - act = udp->siguaction[SIGCANCEL].sig_uaction; + ASSERT(sig == SIGCANCEL || sig == SIGAIOCANCEL); + lmutex_lock(mp); + act = udp->siguaction[sig].sig_uaction; + lmutex_unlock(mp); if (act.sa_sigaction == SIG_DFL || act.sa_sigaction == SIG_IGN) act.sa_flags = SA_SIGINFO; @@ -640,5 +671,5 @@ init_sigcancel() } act.sa_sigaction = udp->sigacthandler; act.sa_mask = maskset; - (void) __sigaction(SIGCANCEL, &act, NULL); + (void) __sigaction(sig, &act, NULL); } diff --git a/usr/src/lib/libc/port/threads/spawn.c b/usr/src/lib/libc/port/threads/spawn.c index 18a6d68e0b..143db8cf49 100644 --- a/usr/src/lib/libc/port/threads/spawn.c +++ b/usr/src/lib/libc/port/threads/spawn.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. 
+ * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,8 +18,9 @@ * * CDDL HEADER END */ + /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -35,6 +35,7 @@ #include <sys/ts.h> #include <alloca.h> #include <spawn.h> +#include "rtsched.h" #define ALL_POSIX_SPAWN_FLAGS \ (POSIX_SPAWN_RESETIDS | \ @@ -65,6 +66,8 @@ typedef struct file_attr { int fa_newfiledes; /* new file descriptor for dup2() */ } file_attr_t; +extern struct pcclass ts_class, rt_class; + extern pid_t _vfork(void); #pragma unknown_control_flow(_vfork) extern void *_private_memset(void *, int, size_t); @@ -631,7 +634,10 @@ _posix_spawnattr_setflags( * Populate ts_class and rt_class. * We will need them in the child of vfork(). */ - (void) _map_rtpri_to_gp(0); + if (rt_class.pcc_state == 0) + (void) get_info_by_policy(SCHED_FIFO); + if (ts_class.pcc_state == 0) + (void) get_info_by_policy(SCHED_OTHER); } sap->sa_psflags = flags; diff --git a/usr/src/lib/libc/port/threads/synch.c b/usr/src/lib/libc/port/threads/synch.c index 6856ebcc6b..9c6e918620 100644 --- a/usr/src/lib/libc/port/threads/synch.c +++ b/usr/src/lib/libc/port/threads/synch.c @@ -2184,6 +2184,77 @@ lmutex_unlock(mutex_t *mp) exit_critical(self); } +/* + * For specialized code in libc, like the asynchronous i/o code, + * the following sig_*() locking primitives are used in order + * to make the code asynchronous signal safe. Signals are + * deferred while locks acquired by these functions are held. + */ +void +sig_mutex_lock(mutex_t *mp) +{ + sigoff(curthread); + (void) _private_mutex_lock(mp); +} + +void +sig_mutex_unlock(mutex_t *mp) +{ + (void) _private_mutex_unlock(mp); + sigon(curthread); +} + +int +sig_mutex_trylock(mutex_t *mp) +{ + int error; + + sigoff(curthread); + if ((error = _private_mutex_trylock(mp)) != 0) + sigon(curthread); + return (error); +} + +/* + * sig_cond_wait() is a cancellation point. + */ +int +sig_cond_wait(cond_t *cv, mutex_t *mp) +{ + int error; + + ASSERT(curthread->ul_sigdefer != 0); + _private_testcancel(); + error = _cond_wait(cv, mp); + if (error == EINTR && curthread->ul_cursig) { + sig_mutex_unlock(mp); + /* take the deferred signal here */ + sig_mutex_lock(mp); + } + _private_testcancel(); + return (error); +} + +/* + * sig_cond_reltimedwait() is a cancellation point. + */ +int +sig_cond_reltimedwait(cond_t *cv, mutex_t *mp, const timespec_t *ts) +{ + int error; + + ASSERT(curthread->ul_sigdefer != 0); + _private_testcancel(); + error = _cond_reltimedwait(cv, mp, ts); + if (error == EINTR && curthread->ul_cursig) { + sig_mutex_unlock(mp); + /* take the deferred signal here */ + sig_mutex_lock(mp); + } + _private_testcancel(); + return (error); +} + static int shared_mutex_held(mutex_t *mparg) { diff --git a/usr/src/lib/libc/port/threads/thr.c b/usr/src/lib/libc/port/threads/thr.c index 37310cea56..2a9f9e89e1 100644 --- a/usr/src/lib/libc/port/threads/thr.c +++ b/usr/src/lib/libc/port/threads/thr.c @@ -1490,6 +1490,9 @@ libc_init(void) if (self->ul_primarymap && __tnf_probe_notify != NULL) __tnf_probe_notify(); /* PROBE_SUPPORT end */ + + init_sigev_thread(); + init_aio(); } #pragma fini(libc_fini) @@ -1562,7 +1565,7 @@ finish_init() /* * Set up the SIGCANCEL handler for threads cancellation. 
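+ * (setup_cancelsig() also registers the SIGAIOCANCEL handler when
+ * the aio subsystem initializes itself in __uaio_init(); see
+ * setup_cancelsig() in sigaction.c.)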
*/ - init_sigcancel(); + setup_cancelsig(SIGCANCEL); /* * Arrange to do special things on exit -- @@ -1596,7 +1599,7 @@ mark_dead_and_buried(ulwp_t *ulwp) * Reset our data structures to reflect one lwp. */ void -_postfork1_child() +postfork1_child() { ulwp_t *self = curthread; uberdata_t *udp = self->ul_uberdata; @@ -1668,6 +1671,15 @@ _postfork1_child() udp->nzombies = 0; } trim_stack_cache(0); + + /* + * Do post-fork1 processing for subsystems that need it. + */ + postfork1_child_tpool(); + postfork1_child_sigev_aio(); + postfork1_child_sigev_mq(); + postfork1_child_sigev_timer(); + postfork1_child_aio(); } #pragma weak thr_setprio = _thr_setprio @@ -1761,7 +1773,7 @@ force_continue(ulwp_t *ulwp) if (ulwp->ul_stopping) { /* he is stopping himself */ ts.tv_sec = 0; /* give him a chance to run */ ts.tv_nsec = 100000; /* 100 usecs or clock tick */ - (void) ___nanosleep(&ts, NULL); + (void) __nanosleep(&ts, NULL); } if (!ulwp->ul_stopping) /* he is running now */ break; /* so we are done */ @@ -2203,10 +2215,8 @@ _ti_bind_clear(int bindflag) * Also, signals are deferred at thread startup until TLS constructors * have all been called, at which time _thr_setup() calls sigon(). * - * _sigoff() and _sigon() are external consolidation-private interfaces - * to sigoff() and sigon(), respectively, in libc. _sigdeferred() is - * a consolidation-private interface that returns the deferred signal - * number, if any. These are used in libnsl, librt, and libaio. + * _sigoff() and _sigon() are external consolidation-private interfaces to + * sigoff() and sigon(), respectively, in libc. These are used in libnsl. * Also, _sigoff() and _sigon() are called from dbx's run-time checking * (librtc.so) to defer signals during its critical sections (not to be * confused with libc critical sections [see exit_critical() above]). @@ -2223,12 +2233,6 @@ _sigon(void) sigon(curthread); } -int -_sigdeferred(void) -{ - return (curthread->ul_cursig); -} - void sigon(ulwp_t *self) { diff --git a/usr/src/lib/libc/port/tpool/thread_pool.c b/usr/src/lib/libc/port/tpool/thread_pool.c new file mode 100644 index 0000000000..5042f60301 --- /dev/null +++ b/usr/src/lib/libc/port/tpool/thread_pool.c @@ -0,0 +1,560 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include "synonyms.h"
+#include "thr_uberdata.h"
+#include <stdlib.h>
+#include <signal.h>
+#include <errno.h>
+#include "thread_pool_impl.h"
+
+static mutex_t thread_pool_lock = DEFAULTMUTEX;
+static tpool_t *thread_pools = NULL;
+
+static void
+delete_pool(tpool_t *tpool)
+{
+	tpool_job_t *job;
+
+	ASSERT(tpool->tp_current == 0 && tpool->tp_active == NULL);
+
+	/*
+	 * Unlink the pool from the global list of all pools.
+	 */
+	lmutex_lock(&thread_pool_lock);
+	if (thread_pools == tpool)
+		thread_pools = tpool->tp_forw;
+	if (thread_pools == tpool)
+		thread_pools = NULL;
+	else {
+		tpool->tp_back->tp_forw = tpool->tp_forw;
+		tpool->tp_forw->tp_back = tpool->tp_back;
+	}
+	lmutex_unlock(&thread_pool_lock);
+
+	/*
+	 * There should be no pending jobs, but just in case...
+	 */
+	for (job = tpool->tp_head; job != NULL; job = tpool->tp_head) {
+		tpool->tp_head = job->tpj_next;
+		lfree(job, sizeof (*job));
+	}
+	(void) pthread_attr_destroy(&tpool->tp_attr);
+	lfree(tpool, sizeof (*tpool));
+}
+
+/*
+ * Worker thread is terminating.
+ */
+static void
+worker_cleanup(tpool_t *tpool)
+{
+	ASSERT(MUTEX_HELD(&tpool->tp_mutex));
+
+	if (--tpool->tp_current == 0 &&
+	    (tpool->tp_flags & (TP_DESTROY | TP_ABANDON))) {
+		if (tpool->tp_flags & TP_ABANDON) {
+			sig_mutex_unlock(&tpool->tp_mutex);
+			delete_pool(tpool);
+			return;
+		}
+		if (tpool->tp_flags & TP_DESTROY)
+			(void) cond_broadcast(&tpool->tp_busycv);
+	}
+	sig_mutex_unlock(&tpool->tp_mutex);
+}
+
+static void
+notify_waiters(tpool_t *tpool)
+{
+	if (tpool->tp_head == NULL && tpool->tp_active == NULL) {
+		tpool->tp_flags &= ~TP_WAIT;
+		(void) cond_broadcast(&tpool->tp_waitcv);
+	}
+}
+
+/*
+ * Called by a worker thread on return from a tpool_dispatch()d job.
+ */
+static void
+job_cleanup(tpool_t *tpool)
+{
+	pthread_t my_tid = pthread_self();
+	tpool_active_t *activep;
+	tpool_active_t **activepp;
+
+	sig_mutex_lock(&tpool->tp_mutex);
+	/* CSTYLED */
+	for (activepp = &tpool->tp_active;; activepp = &activep->tpa_next) {
+		activep = *activepp;
+		if (activep->tpa_tid == my_tid) {
+			*activepp = activep->tpa_next;
+			break;
+		}
+	}
+	if (tpool->tp_flags & TP_WAIT)
+		notify_waiters(tpool);
+}
+
+static void *
+tpool_worker(void *arg)
+{
+	tpool_t *tpool = (tpool_t *)arg;
+	int elapsed;
+	tpool_job_t *job;
+	void (*func)(void *);
+	tpool_active_t active;
+
+	sig_mutex_lock(&tpool->tp_mutex);
+	pthread_cleanup_push(worker_cleanup, tpool);
+
+	/*
+	 * This is the worker's main loop.
+	 * It will be left only if a timeout or an error has occurred.
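+	 * A worker leaves when the pool is being destroyed or abandoned,
+	 * or when its timed wait for work lasts longer than tp_linger
+	 * seconds while the pool still holds more than tp_minimum threads.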
+ */ + active.tpa_tid = pthread_self(); + for (;;) { + elapsed = 0; + tpool->tp_idle++; + if (tpool->tp_flags & TP_WAIT) + notify_waiters(tpool); + while ((tpool->tp_head == NULL || + (tpool->tp_flags & TP_SUSPEND)) && + !(tpool->tp_flags & (TP_DESTROY | TP_ABANDON))) { + if (tpool->tp_current <= tpool->tp_minimum || + tpool->tp_linger == 0) { + (void) sig_cond_wait(&tpool->tp_workcv, + &tpool->tp_mutex); + } else { + timestruc_t timeout; + + timeout.tv_sec = tpool->tp_linger; + timeout.tv_nsec = 0; + if (sig_cond_reltimedwait(&tpool->tp_workcv, + &tpool->tp_mutex, &timeout) != 0) { + elapsed = 1; + break; + } + } + } + tpool->tp_idle--; + if (tpool->tp_flags & TP_DESTROY) + break; + if (tpool->tp_flags & TP_ABANDON) { + /* can't abandon a suspended pool */ + if (tpool->tp_flags & TP_SUSPEND) { + tpool->tp_flags &= ~TP_SUSPEND; + (void) cond_broadcast(&tpool->tp_workcv); + } + if (tpool->tp_head == NULL) + break; + } + if ((job = tpool->tp_head) != NULL && + !(tpool->tp_flags & TP_SUSPEND)) { + elapsed = 0; + func = job->tpj_func; + arg = job->tpj_arg; + tpool->tp_head = job->tpj_next; + if (job == tpool->tp_tail) + tpool->tp_tail = NULL; + tpool->tp_njobs--; + active.tpa_next = tpool->tp_active; + tpool->tp_active = &active; + sig_mutex_unlock(&tpool->tp_mutex); + pthread_cleanup_push(job_cleanup, tpool); + lfree(job, sizeof (*job)); + /* + * Call the specified function. + */ + func(arg); + /* + * We don't know what this thread has been doing, + * so we reset its signal mask and cancellation + * state back to the initial values. + */ + (void) pthread_sigmask(SIG_SETMASK, &maskset, NULL); + (void) pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, + NULL); + (void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, + NULL); + pthread_cleanup_pop(1); + } + if (elapsed && tpool->tp_current > tpool->tp_minimum) { + /* + * We timed out and there is no work to be done + * and the number of workers exceeds the minimum. + * Exit now to reduce the size of the pool. + */ + break; + } + } + pthread_cleanup_pop(1); + return (arg); +} + +/* + * Create a worker thread, with all signals blocked. + */ +static int +create_worker(tpool_t *tpool) +{ + sigset_t oset; + int error; + + (void) pthread_sigmask(SIG_SETMASK, &maskset, &oset); + error = pthread_create(NULL, &tpool->tp_attr, tpool_worker, tpool); + (void) pthread_sigmask(SIG_SETMASK, &oset, NULL); + return (error); +} + +tpool_t * +tpool_create(uint_t min_threads, uint_t max_threads, uint_t linger, + pthread_attr_t *attr) +{ + tpool_t *tpool; + void *stackaddr; + size_t stacksize; + size_t minstack; + int error; + + if (min_threads > max_threads || max_threads < 1) { + errno = EINVAL; + return (NULL); + } + if (attr != NULL) { + if (pthread_attr_getstack(attr, &stackaddr, &stacksize) != 0) { + errno = EINVAL; + return (NULL); + } + /* + * Allow only one thread in the pool with a specified stack. + * Require threads to have at least the minimum stack size. 
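+		 * For example (hypothetical caller), an attribute prepared
+		 * with pthread_attr_setstack(&attr, stk, stksize), stksize
+		 * being at least thr_min_stack(), can only be used as
+		 * tpool_create(1, 1, 0, &attr).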
+ */ + minstack = thr_min_stack(); + if (stackaddr != NULL) { + if (stacksize < minstack || max_threads != 1) { + errno = EINVAL; + return (NULL); + } + } else if (stacksize != 0 && stacksize < minstack) { + errno = EINVAL; + return (NULL); + } + } + + tpool = lmalloc(sizeof (*tpool)); + if (tpool == NULL) { + errno = ENOMEM; + return (NULL); + } + (void) mutex_init(&tpool->tp_mutex, USYNC_THREAD, NULL); + (void) cond_init(&tpool->tp_busycv, USYNC_THREAD, NULL); + (void) cond_init(&tpool->tp_workcv, USYNC_THREAD, NULL); + (void) cond_init(&tpool->tp_waitcv, USYNC_THREAD, NULL); + tpool->tp_minimum = min_threads; + tpool->tp_maximum = max_threads; + tpool->tp_linger = linger; + + /* + * We cannot just copy the attribute pointer. + * We need to initialize a new pthread_attr_t structure + * with the values from the user-supplied pthread_attr_t. + * If the attribute pointer is NULL, we need to initialize + * the new pthread_attr_t structure with default values. + */ + error = _pthread_attr_clone(&tpool->tp_attr, attr); + if (error) { + lfree(tpool, sizeof (*tpool)); + errno = error; + return (NULL); + } + + /* make all pool threads be detached daemon threads */ + (void) pthread_attr_setdetachstate(&tpool->tp_attr, + PTHREAD_CREATE_DETACHED); + (void) _pthread_attr_setdaemonstate_np(&tpool->tp_attr, + PTHREAD_CREATE_DAEMON_NP); + + /* insert into the global list of all thread pools */ + lmutex_lock(&thread_pool_lock); + if (thread_pools == NULL) { + tpool->tp_forw = tpool; + tpool->tp_back = tpool; + thread_pools = tpool; + } else { + thread_pools->tp_back->tp_forw = tpool; + tpool->tp_forw = thread_pools; + tpool->tp_back = thread_pools->tp_back; + thread_pools->tp_back = tpool; + } + lmutex_unlock(&thread_pool_lock); + + return (tpool); +} + +/* + * Dispatch a work request to the thread pool. + * If there are idle workers, awaken one. + * Else, if the maximum number of workers has + * not been reached, spawn a new worker thread. + * Else just return with the job added to the queue. + */ +int +tpool_dispatch(tpool_t *tpool, void (*func)(void *), void *arg) +{ + tpool_job_t *job; + + ASSERT(!(tpool->tp_flags & (TP_DESTROY | TP_ABANDON))); + + if ((job = lmalloc(sizeof (*job))) == NULL) + return (-1); + job->tpj_next = NULL; + job->tpj_func = func; + job->tpj_arg = arg; + + sig_mutex_lock(&tpool->tp_mutex); + + if (tpool->tp_head == NULL) + tpool->tp_head = job; + else + tpool->tp_tail->tpj_next = job; + tpool->tp_tail = job; + tpool->tp_njobs++; + + if (!(tpool->tp_flags & TP_SUSPEND)) { + if (tpool->tp_idle > 0) + (void) cond_signal(&tpool->tp_workcv); + else if (tpool->tp_current < tpool->tp_maximum && + create_worker(tpool) == 0) + tpool->tp_current++; + } + + sig_mutex_unlock(&tpool->tp_mutex); + return (0); +} + +/* + * Assumes: by the time tpool_destroy() is called no one will use this + * thread pool in any way and no one will try to dispatch entries to it. + * Calling tpool_destroy() from a job in the pool will cause deadlock. 
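+ * (The ASSERT of !tpool_member(tpool) below catches such misuse in
+ * debug builds: a worker calling tpool_destroy() would cancel itself
+ * and then wait forever for its own termination.)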
+ */
+void
+tpool_destroy(tpool_t *tpool)
+{
+	tpool_active_t *activep;
+
+	ASSERT(!tpool_member(tpool));
+	ASSERT(!(tpool->tp_flags & (TP_DESTROY | TP_ABANDON)));
+
+	sig_mutex_lock(&tpool->tp_mutex);
+	pthread_cleanup_push(sig_mutex_unlock, &tpool->tp_mutex);
+
+	/* mark the pool as being destroyed; wakeup idle workers */
+	tpool->tp_flags |= TP_DESTROY;
+	tpool->tp_flags &= ~TP_SUSPEND;
+	(void) cond_broadcast(&tpool->tp_workcv);
+
+	/* cancel all active workers */
+	for (activep = tpool->tp_active; activep; activep = activep->tpa_next)
+		(void) pthread_cancel(activep->tpa_tid);
+
+	/* wait for all active workers to finish */
+	while (tpool->tp_active != NULL) {
+		tpool->tp_flags |= TP_WAIT;
+		(void) sig_cond_wait(&tpool->tp_waitcv, &tpool->tp_mutex);
+	}
+
+	/* the last worker to terminate will wake us up */
+	while (tpool->tp_current != 0)
+		(void) sig_cond_wait(&tpool->tp_busycv, &tpool->tp_mutex);
+
+	pthread_cleanup_pop(1);	/* sig_mutex_unlock(&tpool->tp_mutex); */
+	delete_pool(tpool);
+}
+
+/*
+ * Like tpool_destroy(), but don't cancel workers or wait for them to finish.
+ * The last worker to terminate will delete the pool.
+ */
+void
+tpool_abandon(tpool_t *tpool)
+{
+	ASSERT(!(tpool->tp_flags & (TP_DESTROY | TP_ABANDON)));
+
+	sig_mutex_lock(&tpool->tp_mutex);
+	if (tpool->tp_current == 0) {
+		/* no workers, just delete the pool */
+		sig_mutex_unlock(&tpool->tp_mutex);
+		delete_pool(tpool);
+	} else {
+		/* wake up all workers, last one will delete the pool */
+		tpool->tp_flags |= TP_ABANDON;
+		tpool->tp_flags &= ~TP_SUSPEND;
+		(void) cond_broadcast(&tpool->tp_workcv);
+		sig_mutex_unlock(&tpool->tp_mutex);
+	}
+}
+
+/*
+ * Wait for all jobs to complete.
+ * Calling tpool_wait() from a job in the pool will cause deadlock.
+ */
+void
+tpool_wait(tpool_t *tpool)
+{
+	ASSERT(!tpool_member(tpool));
+	ASSERT(!(tpool->tp_flags & (TP_DESTROY | TP_ABANDON)));
+
+	sig_mutex_lock(&tpool->tp_mutex);
+	pthread_cleanup_push(sig_mutex_unlock, &tpool->tp_mutex);
+	while (tpool->tp_head != NULL || tpool->tp_active != NULL) {
+		tpool->tp_flags |= TP_WAIT;
+		(void) sig_cond_wait(&tpool->tp_waitcv, &tpool->tp_mutex);
+		ASSERT(!(tpool->tp_flags & (TP_DESTROY | TP_ABANDON)));
+	}
+	pthread_cleanup_pop(1);	/* sig_mutex_unlock(&tpool->tp_mutex); */
+}
+
+void
+tpool_suspend(tpool_t *tpool)
+{
+	ASSERT(!(tpool->tp_flags & (TP_DESTROY | TP_ABANDON)));
+
+	sig_mutex_lock(&tpool->tp_mutex);
+	tpool->tp_flags |= TP_SUSPEND;
+	sig_mutex_unlock(&tpool->tp_mutex);
+}
+
+int
+tpool_suspended(tpool_t *tpool)
+{
+	int suspended;
+
+	ASSERT(!(tpool->tp_flags & (TP_DESTROY | TP_ABANDON)));
+
+	sig_mutex_lock(&tpool->tp_mutex);
+	suspended = (tpool->tp_flags & TP_SUSPEND) != 0;
+	sig_mutex_unlock(&tpool->tp_mutex);
+
+	return (suspended);
+}
+
+void
+tpool_resume(tpool_t *tpool)
+{
+	int excess;
+
+	ASSERT(!(tpool->tp_flags & (TP_DESTROY | TP_ABANDON)));
+
+	sig_mutex_lock(&tpool->tp_mutex);
+	if (!(tpool->tp_flags & TP_SUSPEND)) {
+		sig_mutex_unlock(&tpool->tp_mutex);
+		return;
+	}
+	tpool->tp_flags &= ~TP_SUSPEND;
+	(void) cond_broadcast(&tpool->tp_workcv);
+	excess = tpool->tp_njobs - tpool->tp_idle;
+	while (excess-- > 0 && tpool->tp_current < tpool->tp_maximum) {
+		if (create_worker(tpool) != 0)
+			break;		/* pthread_create() failed */
+		tpool->tp_current++;
+	}
+	sig_mutex_unlock(&tpool->tp_mutex);
+}
+
+int
+tpool_member(tpool_t *tpool)
+{
+	pthread_t my_tid = pthread_self();
+	tpool_active_t *activep;
+
+	ASSERT(!(tpool->tp_flags & (TP_DESTROY | TP_ABANDON)));
+
+	sig_mutex_lock(&tpool->tp_mutex);
+	for (activep = tpool->tp_active; activep; activep = activep->tpa_next) {
+		if (activep->tpa_tid == my_tid) {
+			sig_mutex_unlock(&tpool->tp_mutex);
+			return (1);
+		}
+	}
+	sig_mutex_unlock(&tpool->tp_mutex);
+	return (0);
+}
+
+void
+postfork1_child_tpool(void)
+{
+	pthread_t my_tid = pthread_self();
+	tpool_t *tpool;
+	tpool_job_t *job;
+
+	/*
+	 * All of the thread pool workers are gone, except possibly
+	 * for the current thread, if it is a thread pool worker thread.
+	 * Retain the thread pools, but make them all empty.  Whatever
+	 * jobs were queued or running belong to the parent process.
+	 */
+top:
+	if ((tpool = thread_pools) == NULL)
+		return;
+
+	do {
+		tpool_active_t *activep;
+
+		(void) mutex_init(&tpool->tp_mutex, USYNC_THREAD, NULL);
+		(void) cond_init(&tpool->tp_busycv, USYNC_THREAD, NULL);
+		(void) cond_init(&tpool->tp_workcv, USYNC_THREAD, NULL);
+		(void) cond_init(&tpool->tp_waitcv, USYNC_THREAD, NULL);
+		for (job = tpool->tp_head; job; job = tpool->tp_head) {
+			tpool->tp_head = job->tpj_next;
+			lfree(job, sizeof (*job));
+		}
+		tpool->tp_tail = NULL;
+		tpool->tp_njobs = 0;
+		for (activep = tpool->tp_active; activep;
+		    activep = activep->tpa_next) {
+			if (activep->tpa_tid == my_tid) {
+				activep->tpa_next = NULL;
+				break;
+			}
+		}
+		tpool->tp_idle = 0;
+		tpool->tp_current = 0;
+		if ((tpool->tp_active = activep) != NULL)
+			tpool->tp_current = 1;
+		tpool->tp_flags &= ~TP_WAIT;
+		if (tpool->tp_flags & (TP_DESTROY | TP_ABANDON)) {
+			tpool->tp_flags &= ~TP_DESTROY;
+			tpool->tp_flags |= TP_ABANDON;
+			if (tpool->tp_current == 0) {
+				delete_pool(tpool);
+				goto top;	/* start over */
+			}
+		}
+	} while ((tpool = tpool->tp_forw) != thread_pools);
+}
diff --git a/usr/src/lib/libc/port/tpool/thread_pool_impl.h b/usr/src/lib/libc/port/tpool/thread_pool_impl.h
new file mode 100644
index 0000000000..66611778a0
--- /dev/null
+++ b/usr/src/lib/libc/port/tpool/thread_pool_impl.h
@@ -0,0 +1,99 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _THREAD_POOL_IMPL_H
+#define	_THREAD_POOL_IMPL_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <thread_pool.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * Thread pool implementation definitions.
+ * See <thread_pool.h> for interface declarations.
+ */
+
+/*
+ * FIFO queued job
+ */
+typedef struct tpool_job tpool_job_t;
+struct tpool_job {
+	tpool_job_t	*tpj_next;		/* list of jobs */
+	void		(*tpj_func)(void *);	/* function to call */
+	void		*tpj_arg;		/* its argument */
+};
+
+/*
+ * List of active threads, linked through their stacks.
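+ *
+ * That is, each worker is expected to declare its tpool_active_t as a
+ * local variable on its own stack and link it onto tp_active for the
+ * duration of one job; an illustrative sketch of that linkage:
+ *
+ *	tpool_active_t active;
+ *
+ *	active.tpa_tid = pthread_self();
+ *	active.tpa_next = tpool->tp_active;
+ *	tpool->tp_active = &active;
+ *	(run the job, then unlink the entry before going idle)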
+ */
+typedef struct tpool_active tpool_active_t;
+struct tpool_active {
+	tpool_active_t	*tpa_next;	/* list of active threads */
+	pthread_t	tpa_tid;	/* active thread id */
+};
+
+/*
+ * The thread pool.
+ */
+struct tpool {
+	tpool_t		*tp_forw;	/* circular list of all thread pools */
+	tpool_t		*tp_back;
+	mutex_t		tp_mutex;	/* protects the pool data */
+	cond_t		tp_busycv;	/* synchronization in tpool_dispatch */
+	cond_t		tp_workcv;	/* synchronization with workers */
+	cond_t		tp_waitcv;	/* synchronization in tpool_wait() */
+	tpool_active_t	*tp_active;	/* threads performing work */
+	tpool_job_t	*tp_head;	/* FIFO job queue */
+	tpool_job_t	*tp_tail;
+	pthread_attr_t	tp_attr;	/* attributes of the workers */
+	int		tp_flags;	/* see below */
+	uint_t		tp_linger;	/* seconds before idle workers exit */
+	int		tp_njobs;	/* number of jobs in job queue */
+	int		tp_minimum;	/* minimum number of worker threads */
+	int		tp_maximum;	/* maximum number of worker threads */
+	int		tp_current;	/* current number of worker threads */
+	int		tp_idle;	/* number of idle workers */
+};
+
+/* tp_flags */
+#define	TP_WAIT		0x01	/* waiting in tpool_wait() */
+#define	TP_SUSPEND	0x02	/* pool is being suspended */
+#define	TP_DESTROY	0x04	/* pool is being destroyed */
+#define	TP_ABANDON	0x08	/* pool is abandoned (auto-destroy) */
+
+extern int _pthread_attr_clone(pthread_attr_t *, const pthread_attr_t *);
+
+extern const sigset_t maskset;	/* set of all maskable signals */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _THREAD_POOL_IMPL_H */
diff --git a/usr/src/lib/libc/sparc/Makefile b/usr/src/lib/libc/sparc/Makefile
index 7ce71b3756..50fa5f8c45 100644
--- a/usr/src/lib/libc/sparc/Makefile
+++ b/usr/src/lib/libc/sparc/Makefile
@@ -777,6 +777,24 @@ PORTI18N_COND= \
 	wcstol_longlong.o \
 	wcstoul_longlong.o
 
+AIOOBJS= \
+	aio.o \
+	aio_alloc.o \
+	posix_aio.o \
+
+RTOBJS= \
+	clock_timer.o \
+	fallocate.o \
+	mqueue.o \
+	pos4obj.o \
+	sched.o \
+	sem.o \
+	shm.o \
+	sigev_thread.o
+
+TPOOLOBJS= \
+	thread_pool.o
+
 THREADSOBJS= \
 	alloc.o \
 	assfail.o \
@@ -899,6 +917,9 @@ MOSTOBJS= \
 	$(PORTSTDIO_W) \
 	$(PORTSYS) \
 	$(PORTSYS64) \
+	$(AIOOBJS) \
+	$(RTOBJS) \
+	$(TPOOLOBJS) \
 	$(THREADSOBJS) \
 	$(THREADSMACHOBJS) \
 	$(THREADSASMOBJS) \
@@ -1004,6 +1025,9 @@ SRCS= \
 	$(PORTREGEX:%.o=../port/regex/%.c) \
 	$(PORTSTDIO:%.o=../port/stdio/%.c) \
 	$(PORTSYS:%.o=../port/sys/%.c) \
+	$(AIOOBJS:%.o=../port/aio/%.c) \
+	$(RTOBJS:%.o=../port/rt/%.c) \
+	$(TPOOLOBJS:%.o=../port/tpool/%.c) \
 	$(THREADSOBJS:%.o=../port/threads/%.c) \
 	$(THREADSMACHOBJS:%.o=../$(MACH)/threads/%.c) \
 	$(UNWINDMACHOBJS:%.o=../port/unwind/%.c) \
@@ -1033,6 +1057,7 @@ $(MAPFILE):
 
 # Files which need the threads .il inline template
 TIL= \
+	aio.o \
 	alloc.o \
 	assfail.o \
 	atexit.o \
@@ -1042,7 +1067,9 @@ TIL= \
 	errno.o \
 	getctxt.o \
 	lwp.o \
+	ma.o \
 	machdep.o \
+	posix_aio.o \
 	pthr_attr.o \
 	pthr_barrier.o \
 	pthr_cond.o \
@@ -1055,6 +1082,7 @@ TIL= \
 	scalls.o \
 	sema.o \
 	sigaction.o \
+	sigev_thread.o \
 	spawn.o \
 	stack.o \
 	swapctxt.o \
@@ -1062,6 +1090,7 @@ TIL= \
 	tdb_agent.o \
 	thr.o \
 	thread_interface.o \
+	thread_pool.o \
 	tls.o \
 	tsd.o \
 	unwind.o
diff --git a/usr/src/lib/libc/sparcv9/Makefile b/usr/src/lib/libc/sparcv9/Makefile
index e5810b8bd2..3918386307 100644
--- a/usr/src/lib/libc/sparcv9/Makefile
+++ b/usr/src/lib/libc/sparcv9/Makefile
@@ -725,6 +725,24 @@ PORTI18N_COND= \
 	wcstol_longlong.o \
 	wcstoul_longlong.o
 
+AIOOBJS= \
+	aio.o \
+	aio_alloc.o \
+	posix_aio.o \
+
+RTOBJS= \
+	clock_timer.o \
+	fallocate.o \
+	mqueue.o \
+	pos4obj.o \
+	sched.o \
+	sem.o \
+	shm.o \
+	sigev_thread.o
+
+TPOOLOBJS= \
+	thread_pool.o
+
 THREADSOBJS= \
 	alloc.o \
 	assfail.o \
@@ -844,6 +862,9 @@ MOSTOBJS= \
 	$(PORTSTDIO_W) \
 	$(PORTSYS) \
 	$(PORTSYS64) \
+	$(AIOOBJS) \
+	$(RTOBJS) \
+	$(TPOOLOBJS) \
 	$(THREADSOBJS) \
 	$(THREADSMACHOBJS) \
 	$(THREADSASMOBJS) \
@@ -949,6 +970,9 @@ SRCS= \
 	$(PORTREGEX:%.o=../port/regex/%.c) \
 	$(PORTSTDIO:%.o=../port/stdio/%.c) \
 	$(PORTSYS:%.o=../port/sys/%.c) \
+	$(AIOOBJS:%.o=../port/aio/%.c) \
+	$(RTOBJS:%.o=../port/rt/%.c) \
+	$(TPOOLOBJS:%.o=../port/tpool/%.c) \
 	$(THREADSOBJS:%.o=../port/threads/%.c) \
 	$(THREADSMACHOBJS:%.o=../$(MACH)/threads/%.c) \
 	$(UNWINDMACHOBJS:%.o=../port/unwind/%.c) \
@@ -977,6 +1001,7 @@ $(MAPFILE):
 
 # Files which need the threads .il inline template
 TIL= \
+	aio.o \
 	alloc.o \
 	assfail.o \
 	atexit.o \
@@ -986,7 +1011,9 @@ TIL= \
 	errno.o \
 	getctxt.o \
 	lwp.o \
+	ma.o \
 	machdep.o \
+	posix_aio.o \
 	pthr_attr.o \
 	pthr_barrier.o \
 	pthr_cond.o \
@@ -999,6 +1026,7 @@ TIL= \
 	scalls.o \
 	sema.o \
 	sigaction.o \
+	sigev_thread.o \
 	spawn.o \
 	stack.o \
 	swapctxt.o \
@@ -1006,6 +1034,7 @@ TIL= \
 	tdb_agent.o \
 	thr.o \
 	thread_interface.o \
+	thread_pool.o \
 	tls.o \
 	tsd.o \
 	unwind.o
diff --git a/usr/src/lib/libc/spec/Makefile.targ b/usr/src/lib/libc/spec/Makefile.targ
index 7b8d73ce11..4243823247 100644
--- a/usr/src/lib/libc/spec/Makefile.targ
+++ b/usr/src/lib/libc/spec/Makefile.targ
@@ -2,9 +2,8 @@
 # CDDL HEADER START
 #
 # The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License").  You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
 #
 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 # or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
 # CDDL HEADER END
 #
 #
-# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 # ident	"%Z%%M%	%I%	%E% SMI"
@@ -30,7 +29,8 @@
 LIBRARY = libc.a
 VERS = .1
 
-OBJECTS = atomic.o \
+OBJECTS = aio.o \
+	atomic.o \
 	data.o \
 	door.o \
 	fmtmsg.o \
@@ -43,6 +43,7 @@ OBJECTS = atomic.o \
 	private.o \
 	privatedata.o \
 	regex.o \
+	rt.o \
 	stdio.o \
 	sys.o \
 	threads.o \
diff --git a/usr/src/lib/libc/spec/aio.spec b/usr/src/lib/libc/spec/aio.spec
new file mode 100644
index 0000000000..6b2612210e
--- /dev/null
+++ b/usr/src/lib/libc/spec/aio.spec
@@ -0,0 +1,83 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
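+#
+# An illustrative sketch of how the Solaris-native asynchronous I/O
+# entries below fit together ("fd" and "buf" are assumed here, not
+# taken from this file); aiowait() returns the completed aio_result_t:
+#
+#	aio_result_t res;
+#
+#	(void) aioread(fd, buf, sizeof (buf), 0, SEEK_SET, &res);
+#	while (aiowait(NULL) != &res)
+#		continue;
+#	if (res.aio_return == -1)
+#		(consult res.aio_errno)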
+# +# ident "%Z%%M% %I% %E% SMI" +# + +function aiocancel +include <sys/asynch.h>, <aio.h> +declaration int aiocancel(aio_result_t *resultp) +version SUNW_1.23 +errno EACCES EFAULT EINVAL +exception $return == -1 +end + +function aioread +include <sys/types.h>, <sys/asynch.h>, <aio.h> +declaration int aioread(int fildes, char *bufp, int bufs, \ + off_t offset, int whence, aio_result_t *resultp) +version SUNW_1.23 +errno EAGAIN EBADF EFAULT EINVAL ENOMEM +exception $return == -1 +end + +function aioread64 +declaration int aioread64(int fd, caddr_t buf, int bufsz, off64_t offset, \ + int whence, aio_result_t *resultp) +arch i386 sparc +version SUNW_1.23 +end + +function aiowait +include <sys/asynch.h>, <aio.h>, <sys/time.h> +declaration aio_result_t *aiowait(struct timeval *timeout) +version SUNW_1.23 +errno EFAULT EINTR EINVAL +exception $return == (aio_result_t *)-1 +end + +function aiowrite +include <sys/types.h>, <sys/asynch.h>, <aio.h> +declaration int aiowrite(int fildes, char *bufp, int bufs, \ + off_t offset, int whence, aio_result_t *resultp) +version SUNW_1.23 +errno EAGAIN EBADF EFAULT EINVAL ENOMEM +exception $return == -1 +end + +function aiowrite64 +include <sys/types.h>, <sys/asynch.h>, <aio.h> +declaration int aiowrite64(int fildes, char *bufp, int bufs, \ + off64_t offset, int whence, aio_result_t *resultp) +arch sparc i386 +version SUNW_1.23 +errno EAGAIN EBADF EFAULT EINVAL ENOMEM +exception $return == -1 +end + +function assfail +declaration int assfail(char *a, char *f, int l) +version SUNW_1.23 +end + diff --git a/usr/src/lib/libc/spec/gen.spec b/usr/src/lib/libc/spec/gen.spec index 9c547e1a37..2b14689e7a 100644 --- a/usr/src/lib/libc/spec/gen.spec +++ b/usr/src/lib/libc/spec/gen.spec @@ -1,7 +1,4 @@ # -# Copyright 2006 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. -# # CDDL HEADER START # # The contents of this file are subject to the terms of the @@ -21,6 +18,10 @@ # # CDDL HEADER END # +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# # ident "%Z%%M% %I% %E% SMI" # @@ -3281,6 +3282,11 @@ errno EPERM exception $return == -1 end +function _sigstack +weak sigstack +version SUNWprivate_1.1 +end + function sleep include <unistd.h> declaration unsigned sleep(unsigned seconds) @@ -4842,19 +4848,6 @@ weak port_alert version SUNWprivate_1.1 end -function port_dispatch -include <port.h> -declaration int port_dispatch(int port, int flags, int source, int events, \ - uintptr_t object, void *user) -version SUNWprivate_1.1 -errno EBADF EBADFD EINTR -end - -function _port_dispatch -weak port_dispatch -version SUNWprivate_1.1 -end - function ucred_size include <ucred.h> declaration size_t ucred_size(void) diff --git a/usr/src/lib/libc/spec/private.spec b/usr/src/lib/libc/spec/private.spec index 2e26e10c8b..9868be02cf 100644 --- a/usr/src/lib/libc/spec/private.spec +++ b/usr/src/lib/libc/spec/private.spec @@ -41,26 +41,6 @@ function __class_quadruple # used by Sun's old Fortran 77 runtime libraries version SUNWprivate_1.1 end -function __clock_getres -#Declaration /* Unknown. */ -version SUNWprivate_1.1 -end - -function __clock_gettime -#Declaration /* Unknown. */ -version SUNWprivate_1.1 -end - -function __clock_nanosleep -#Declaration /* Unknown. */ -version SUNWprivate_1.1 -end - -function __clock_settime -#Declaration /* Unknown. */ -version SUNWprivate_1.1 -end - function __collate_init #Declaration /* Unknown. 
*/ version SUNWprivate_1.1 @@ -82,11 +62,6 @@ function __eucpctowc_gen version SUNWprivate_1.1 end -function __fdsync -#Declaration /* Unknown. */ -version SUNWprivate_1.1 -end - function __fgetwc_dense #Declaration /* Unknown. */ version SUNWprivate_1.1 @@ -319,11 +294,6 @@ function __multi_innetgr version SUNWprivate_1.1 end -function __nanosleep -#Declaration /* Unknown. */ -version SUNWprivate_1.1 -end - function __nl_langinfo_std #Declaration /* Unknown. */ version SUNWprivate_1.1 @@ -364,21 +334,6 @@ function __regfree_std version SUNWprivate_1.1 end -function __signotify -#Declaration /* Unknown. */ -version SUNWprivate_1.1 -end - -function __sigqueue -#Declaration /* Unknown. */ -version SUNWprivate_1.1 -end - -function __sigtimedwait -#Declaration /* Unknown. */ -version SUNWprivate_1.1 -end - function __strcoll_C #Declaration /* Unknown. */ version SUNWprivate_1.1 @@ -436,31 +391,6 @@ function __time_init version SUNWprivate_1.1 end -function __timer_create -#Declaration /* Unknown. */ -version SUNWprivate_1.1 -end - -function __timer_delete -#Declaration /* Unknown. */ -version SUNWprivate_1.1 -end - -function __timer_getoverrun -#Declaration /* Unknown. */ -version SUNWprivate_1.1 -end - -function __timer_gettime -#Declaration /* Unknown. */ -version SUNWprivate_1.1 -end - -function __timer_settime -#Declaration /* Unknown. */ -version SUNWprivate_1.1 -end - function __towctrans_bc #Declaration /* Unknown. */ version SUNWprivate_1.1 @@ -1376,11 +1306,6 @@ weak jrand48 version SUNWprivate_1.1 end -function _kaio -#Declaration /* Unknown. */ -version SUNWprivate_1.1 -end - function _l64a # extends libc/spec/gen.spec l64a weak l64a #Declaration /* Unknown. */ @@ -1744,16 +1669,6 @@ weak pthread_atfork version SUNWprivate_1.1 end -function _pthread_attr_clone -#Declaration /* Unknown. */ -version SUNWprivate_1.1 -end - -function _pthread_attr_equal -#Declaration /* Unknown. */ -version SUNWprivate_1.1 -end - function _pthread_attr_destroy #Declaration /* Unknown. */ version SUNWprivate_1.1 @@ -1764,11 +1679,6 @@ function _pthread_attr_getdetachstate version SUNWprivate_1.1 end -function _pthread_attr_getdaemonstate_np -#Declaration /* Unknown. */ -version SUNWprivate_1.1 -end - function _pthread_attr_getinheritsched #Declaration /* Unknown. */ version SUNWprivate_1.1 @@ -1814,11 +1724,6 @@ function _pthread_attr_setdetachstate version SUNWprivate_1.1 end -function _pthread_attr_setdaemonstate_np -#Declaration /* Unknown. */ -version SUNWprivate_1.1 -end - function _pthread_attr_setinheritsched #Declaration /* Unknown. */ version SUNWprivate_1.1 @@ -2992,10 +2897,6 @@ arch sparc sparcv9 version SUNWprivate_1.1 end -function kaio -version SUNWprivate_1.1 -end - function makeut version SUNWprivate_1.1 end diff --git a/usr/src/lib/libc/spec/rt.spec b/usr/src/lib/libc/spec/rt.spec new file mode 100644 index 0000000000..52de0469bd --- /dev/null +++ b/usr/src/lib/libc/spec/rt.spec @@ -0,0 +1,641 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# ident "%Z%%M% %I% %E% SMI" +# + +function aio_cancel +include <aio.h> +declaration int aio_cancel(int fildes, struct aiocb *aiocbp) +version SUNW_1.23 +errno EBADF ENOSYS +end + +function aio_fsync +include <aio.h> +declaration int aio_fsync(int op, aiocb_t *aiocbp) +version SUNW_1.23 +errno EAGAIN EBADF EINVAL ENOSYS +end + +function aio_read +include <aio.h> +declaration int aio_read(struct aiocb *aiocbp) +version SUNW_1.23 +errno EAGAIN ENOSYS EBADF EINVAL ECANCELED EFBIG +end + +function aio_write +include <aio.h> +declaration int aio_write(struct aiocb *aiocbp) +version SUNW_1.23 +errno EAGAIN ENOSYS EBADF EINVAL ECANCELED EFBIG +end + +function aio_return +include <aio.h> +declaration ssize_t aio_return(struct aiocb * aiocbp) +version SUNW_1.23 +errno EINVAL ENOSYS +end + +function aio_error +include <aio.h> +declaration int aio_error(const struct aiocb *aiocbp) +version SUNW_1.23 +errno EINVAL ENOSYS +end + +function aio_suspend +include <aio.h> +declaration int aio_suspend(const struct aiocb *const list[], int nent, \ + const struct timespec *timeout) +version SUNW_1.23 +errno EAGAIN EINTR ENOSYS +end + +function posix_fallocate +include <fcntl.h> +declaration int posix_fallocate(int fd, off_t offset, off_t len) +version SUNW_1.23 +errno EBADF EFBIG EINTR EINVAL EIO ENODEV ENOSPC ESPIPE +end + +function fdatasync +include <unistd.h> +declaration int fdatasync(int fildes) +version SUNW_1.23 +errno EBADF EINVAL ENOSYS +end + +function lio_listio +include <aio.h> +declaration int lio_listio(int mode, struct aiocb *const list[], int nent, \ + struct sigevent *sig) +version SUNW_1.23 +errno EAGAIN EINVAL EINTR EIO ENOSYS ECANCELED \ + EINPROGRESS EOVERFLOW EFBIG +end + +function aio_waitn +include <aio.h> +declaration int aio_waitn(struct aiocb *list[], uint_t nent, \ + uint_t *nwait, const struct timespec *timeout) +version SUNW_1.23 +errno EAGAIN EINTR ETIME ENOMEM EFAULT EINVAL +end + +function aio_cancel64 extends libc/spec/rt.spec aio_cancel +declaration int aio_cancel64(int fildes, struct aiocb64 *aiocbp) +arch i386 sparc +version SUNW_1.23 +end + +function aio_error64 extends libc/spec/rt.spec aio_error +declaration int aio_error64(const struct aiocb64 *aiocbp) +arch i386 sparc +version SUNW_1.23 +end + +function aio_fsync64 extends libc/spec/rt.spec aio_fsync +declaration int aio_fsync64(int op, struct aiocb64 *aiocbp) +arch i386 sparc +version SUNW_1.23 +end + +function aio_read64 extends libc/spec/rt.spec aio_read +declaration int aio_read64(struct aiocb64 *aiocbp) +arch i386 sparc +version SUNW_1.23 +end + +function aio_return64 extends libc/spec/rt.spec aio_return +declaration ssize_t aio_return64(struct aiocb64 * aiocbp) +arch i386 sparc +version SUNW_1.23 +end + +function aio_suspend64 extends libc/spec/rt.spec aio_suspend +declaration int aio_suspend64(const struct aiocb64 *const list[], \ + int nent, const struct timespec *timeout) +arch i386 sparc +version SUNW_1.23 +end + +function aio_write64 extends libc/spec/rt.spec aio_write +declaration int aio_write64(struct aiocb64 *aiocbp) +arch i386 sparc +version SUNW_1.23 +end + +function lio_listio64 extends libc/spec/rt.spec lio_listio +declaration int lio_listio64(int mode, struct aiocb64 
*const list[], \ + int nent, struct sigevent *sig) +arch i386 sparc +version SUNW_1.23 +end + +function aio_waitn64 extends libc/spec/rt.spec aio_waitn +declaration int aio_waitn64(struct aiocb64 *list[], uint_t nent, \ + uint_t *nwait, const struct timespec *timeout) +arch i386 sparc +version SUNW_1.23 +end + +function posix_fallocate64 extends libc/spec/rt.spec posix_fallocate +declaration int posix_fallocate64(int fd, off64_t offset, off64_t len) +arch i386 sparc +version SUNW_1.23 +end + +function mq_close +include <mqueue.h> +declaration int mq_close(mqd_t mqdes) +version SUNW_1.23 +errno EBADF ENOSYS +exception $return == -1 +end + +function mq_notify +include <mqueue.h> +declaration int mq_notify(mqd_t mqdes, const struct sigevent *notification) +version SUNW_1.23 +errno EBADF EBUSY ENOSYS +exception $return == -1 +end + +function mq_open +include <mqueue.h> +declaration mqd_t mq_open(const char *name, int oflag, ...) +version SUNW_1.23 +errno EACCESS EEXIST EINTR EINVAL EMFILE ENAMETOOLONG ENFILE \ + ENOENT ENOSPC ENOSYS +exception $return == (mqd_t)(-1) +end + +function mq_receive +include <mqueue.h> +declaration ssize_t mq_receive(mqd_t mqdes, char *msg_ptr, \ + size_t msg_len, unsigned int *msg_prio) +version SUNW_1.23 +errno EAGAIN EBADF EMSGSIZE EINTR +exception $return == (ssize_t)(-1) +end + +function mq_timedreceive +include <mqueue.h>, <time.h> +declaration ssize_t mq_timedreceive(mqd_t mqdes, char *msg_ptr, \ + size_t msg_len, unsigned int *msg_prio, \ + const struct timespec *abs_timeout) +version SUNW_1.23 +errno EAGAIN EBADF EMSGSIZE EINTR ETIMEDOUT +exception $return == (ssize_t)(-1) +end + +function mq_reltimedreceive_np +include <mqueue.h>, <time.h> +declaration ssize_t mq_reltimedreceive_np(mqd_t mqdes, char *msg_ptr, \ + size_t msg_len, unsigned int *msg_prio, \ + const struct timespec *rel_timeout) +version SUNW_1.23 +errno EAGAIN EBADF EMSGSIZE EINTR ETIMEDOUT +exception $return == (ssize_t)(-1) +end + +function mq_send +include <mqueue.h> +declaration int mq_send(mqd_t mqdes, const char *msg_ptr, \ + size_t msg_len, unsigned int msg_prio) +version SUNW_1.23 +errno EAGAIN EBADF EINTR EMSGSIZE +exception $return == -1 +end + +function mq_timedsend +include <mqueue.h>, <time.h> +declaration int mq_timedsend(mqd_t mqdes, const char *msg_ptr, \ + size_t msg_len, unsigned int msg_prio, \ + const struct timespec *abs_timeout) +version SUNW_1.23 +errno EAGAIN EBADF EINTR EMSGSIZE ETIMEDOUT +exception $return == -1 +end + +function mq_reltimedsend_np +include <mqueue.h>, <time.h> +declaration int mq_reltimedsend_np(mqd_t mqdes, const char *msg_ptr, \ + size_t msg_len, unsigned int msg_prio, \ + const struct timespec *rel_timeout) +version SUNW_1.23 +errno EAGAIN EBADF EINTR EMSGSIZE ETIMEDOUT +exception $return == -1 +end + +function mq_setattr +include <mqueue.h> +declaration int mq_setattr(mqd_t mqdes, \ + const struct mq_attr *_RESTRICT_KYWD mqstat, \ + struct mq_attr *_RESTRICT_KYWD omqstat) +version SUNW_1.23 +errno EBADF ENOSYS +exception $return == -1 +end + +function mq_getattr +include <mqueue.h> +declaration int mq_getattr(mqd_t mqdes, struct mq_attr *mqstat) +version SUNW_1.23 +errno EBADF ENOSYS +exception $return == -1 +end + +function mq_unlink +include <mqueue.h> +declaration int mq_unlink(const char *name) +version SUNW_1.23 +errno EACCESS ENAMETOOLONG ENOENT ENOSYS +exception $return == -1 +end + +function nanosleep +include <time.h> +declaration int nanosleep(const struct timespec *rqtp, \ + struct timespec *rmtp) +version SUNW_1.23 +errno EINTR EINVAL 
+end + +function clock_nanosleep +include <time.h> +declaration int clock_nanosleep(clockid_t clock_id, int flags, \ + const struct timespec *rqtp, struct timespec *rmtp) +version SUNW_1.23 +errno EINTR EINVAL +end + +function sched_get_priority_max +include <sched.h> +declaration int sched_get_priority_max(int policy) +version SUNW_1.23 +errno EINVAL ENOSYS ESRCH +end + +function sched_get_priority_min +include <sched.h> +declaration int sched_get_priority_min(int policy) +version SUNW_1.23 +errno EINVAL ENOSYS ESRCH +end + +function sched_rr_get_interval +include <sched.h> +declaration int sched_rr_get_interval(pid_t pid, struct timespec *interval) +version SUNW_1.23 +errno EINVAL ENOSYS ESRCH +end + +function sched_setparam +include <sched.h> +declaration int sched_setparam(pid_t pid, const struct sched_param *param) +version SUNW_1.23 +errno EINVAL ENOSYS EPERM ESRCH +end + +function sched_getparam +include <sched.h> +declaration int sched_getparam(pid_t pid, struct sched_param *param) +version SUNW_1.23 +errno EINVAL ENOSYS EPERM ESRCH +end + +function sched_setscheduler +include <sched.h> +declaration int sched_setscheduler(pid_t pid, int policy, \ + const struct sched_param *param) +version SUNW_1.23 +errno EINVAL ENOSYS EPERM ESRCH +end + +function sched_getscheduler +include <sched.h> +declaration int sched_getscheduler(pid_t pid) +version SUNW_1.23 +errno EINVAL ENOSYS EPERM ESRCH +end + +function sched_yield +include <sched.h> +declaration int sched_yield(void) +version SUNW_1.23 +errno ENOSYS +end + +function sem_close +include <semaphore.h> +declaration int sem_close(sem_t *sem) +version SUNW_1.23 +errno EINVAL ENOSYS +end + +function sem_destroy +include <semaphore.h> +declaration int sem_destroy(sem_t *sem) +version SUNW_1.23 +errno EINVAL ENOSYS EBUSY +end + +function sem_getvalue +include <semaphore.h> +declaration int sem_getvalue(sem_t *sem, int *sval) +version SUNW_1.23 +errno EINVAL ENOSYS +end + +function sem_init +include <semaphore.h>, <unistd.h> +declaration int sem_init(sem_t *sem, int pshared, unsigned int value) +version SUNW_1.23 +errno EINVAL ENOSPC ENOSYS EPERM +end + +function sem_open +include <semaphore.h>, <unistd.h>, <sys/stat.h> +declaration sem_t *sem_open(const char *name, int oflag, ...) 
+version SUNW_1.23 +errno EACCES EEXIST EINTR EINVAL EMFILE ENAMETOOLONG ENFILE \ + ENOENT ENOSPC ENOSYS +end + +function sem_post +include <semaphore.h> +declaration int sem_post(sem_t *sem) +version SUNW_1.23 +errno EINVAL ENOSYS +end + +function sem_unlink +include <semaphore.h> +declaration int sem_unlink(const char *name) +version SUNW_1.23 +errno EACCES ENAMETOOLONG ENOENT ENOSYS +end + +function sem_wait +include <semaphore.h> +declaration int sem_wait(sem_t *sem) +version SUNW_1.23 +errno EAGAIN EINVAL EINTR ENOSYS EDEADLK +end + +function sem_timedwait +include <semaphore.h> <time.h> +declaration int sem_timedwait(sem_t *sem, const timespec_t *abstime) +version SUNW_1.23 +errno EAGAIN EINVAL EINTR ETIMEDOUT EDEADLK +end + +function sem_reltimedwait_np +include <semaphore.h> <time.h> +declaration int sem_reltimedwait_np(sem_t *sem, const timespec_t *reltime) +version SUNW_1.23 +errno EAGAIN EINVAL EINTR ETIMEDOUT EDEADLK +end + +function sem_trywait +include <semaphore.h> +declaration int sem_trywait(sem_t *sem) +version SUNW_1.23 +errno EAGAIN EINVAL EINTR ENOSYS EDEADLK +end + +function shm_open +include <sys/mman.h>, <sys/types.h>, <sys/stat.h>, <fcntl.h> +declaration int shm_open(const char *name, int oflag, mode_t mode) +version SUNW_1.23 +errno EACCES EEXIST EINTR EINVAL EMFILE ENAMETOOLONG ENFILE \ + ENOENT ENOSPC ENOSYS +end + +function shm_unlink +declaration int shm_unlink(const char *name) +version SUNW_1.23 +errno EACCES ENAMETOOLONG ENOENT ENOSYS +end + +function sigqueue +include <signal.h> +declaration int sigqueue(pid_t pid, int signo, const union sigval value) +version SUNW_1.23 +errno EAGAIN EINVAL ENOSYS EPERM ESRCH +end + +function sigwaitinfo +include <signal.h> +declaration int sigwaitinfo(const sigset_t *_RESTRICT_KYWD set, \ + siginfo_t *_RESTRICT_KYWD info) +version SUNW_1.23 +errno EINTR ENOSYS EAGAIN EINVAL +end + +function sigtimedwait +include <signal.h> +declaration int sigtimedwait(const sigset_t *_RESTRICT_KYWD set, \ + siginfo_t *_RESTRICT_KYWD info, \ + const struct timespec *_RESTRICT_KYWD timeout) +version SUNW_1.23 +errno EINTR ENOSYS EAGAIN EINVAL +end + +function timer_create +include <signal.h>, <time.h> +declaration int timer_create(clockid_t clock_id, struct sigevent *evp, \ + timer_t *timerid) +version SUNW_1.23 +errno EAGAIN EINVAL ENOSYS +end + +function timer_delete +include <time.h> +declaration int timer_delete(timer_t timerid) +version SUNW_1.23 +errno EINVAL ENOSYS +end + +function timer_settime +include <time.h> +declaration int timer_settime(timer_t timerid, int flags, \ + const struct itimerspec *value, \ + struct itimerspec *ovalue) +version SUNW_1.23 +errno EINVAL ENOSYS +end + +function timer_gettime +include <time.h> +declaration int timer_gettime(timer_t timerid, struct itimerspec *value) +version SUNW_1.23 +errno EINVAL ENOSYS +end + +function timer_getoverrun +include <time.h> +declaration int timer_getoverrun(timer_t timerid) +version SUNW_1.23 +errno EINVAL ENOSYS +end + +function clock_settime +include <time.h> +declaration int clock_settime(clockid_t clock_id, const struct timespec *tp) +version SUNW_1.23 +errno EINVAL ENOSYS EPERM +end + +function clock_gettime +include <time.h> +declaration int clock_gettime(clockid_t clock_id, struct timespec *tp) +version SUNW_1.23 +errno EINVAL ENOSYS EPERM +end + +function clock_getres +include <time.h> +declaration int clock_getres(clockid_t clock_id, struct timespec *res) +version SUNW_1.23 +errno EINVAL ENOSYS EPERM +end + +function _clock_getres +version SUNWprivate_1.1 +end 
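+#
+# An illustrative sketch of the message-queue entries above
+# ("/example_q" is an assumed name, not taken from this file):
+#
+#	mqd_t q = mq_open("/example_q", O_CREAT | O_RDWR, 0600, NULL);
+#
+#	(void) mq_send(q, "ping", 4, 0);
+#	(a cooperating process calls mq_receive(q, buf, len, NULL))
+#	(void) mq_close(q);
+#	(void) mq_unlink("/example_q");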
+ +function _clock_gettime +version SUNWprivate_1.1 +end + +function _clock_settime +version SUNWprivate_1.1 +end + +function _nanosleep +version SUNWprivate_1.1 +end + +function _clock_nanosleep +version SUNWprivate_1.1 +end + +function _timer_create +version SUNWprivate_1.1 +end + +function _timer_delete +version SUNWprivate_1.1 +end + +function _timer_getoverrun +version SUNWprivate_1.1 +end + +function _timer_gettime +version SUNWprivate_1.1 +end + +function _timer_settime +version SUNWprivate_1.1 +end + +# +# Weak Specs +# +function _sem_open +weak sem_open +version SUNWprivate_1.1 +end + +function _sem_close +weak sem_close +version SUNWprivate_1.1 +end + +function _sem_unlink +weak sem_unlink +version SUNWprivate_1.1 +end + +function _sem_init +weak sem_init +version SUNWprivate_1.1 +end + +function _sem_destroy +weak sem_destroy +version SUNWprivate_1.1 +end + +function _sem_wait +weak sem_wait +version SUNWprivate_1.1 +end + +function _sem_timedwait +weak sem_timedwait +version SUNWprivate_1.1 +end + +function _sem_reltimedwait_np +weak sem_reltimedwait_np +version SUNWprivate_1.1 +end + +function _sem_trywait +weak sem_trywait +version SUNWprivate_1.1 +end + +function _sem_post +weak sem_post +version SUNWprivate_1.1 +end + +function _sem_getvalue +weak sem_getvalue +version SUNWprivate_1.1 +end + +function _sigwaitinfo +weak sigwaitinfo +version SUNWprivate_1.1 +end + +function _sigtimedwait +weak sigtimedwait +version SUNWprivate_1.1 +end + +function _sigqueue +weak sigqueue +version SUNWprivate_1.1 +end + diff --git a/usr/src/lib/libc/spec/sys.spec b/usr/src/lib/libc/spec/sys.spec index e780453a1d..89aa86beb9 100644 --- a/usr/src/lib/libc/spec/sys.spec +++ b/usr/src/lib/libc/spec/sys.spec @@ -1,3 +1,4 @@ +# # CDDL HEADER START # # The contents of this file are subject to the terms of the @@ -17,6 +18,7 @@ # # CDDL HEADER END # +# # Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # @@ -2377,11 +2379,6 @@ version sparc=SYSVABI_1.3 i386=SYSVABI_1.3 sparcv9=SUNW_0.7 \ binding nodirect end -function _libc_sigaction -weak sigaction -version SUNWprivate_1.1 -end - function sigaltstack include <signal.h> declaration int sigaltstack(const stack_t *_RESTRICT_KYWD ss, \ diff --git a/usr/src/lib/libc/spec/threads.spec b/usr/src/lib/libc/spec/threads.spec index 1bd84cfbeb..21e22d308a 100644 --- a/usr/src/lib/libc/spec/threads.spec +++ b/usr/src/lib/libc/spec/threads.spec @@ -1,4 +1,6 @@ # +# CDDL HEADER START +# # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. @@ -951,14 +953,6 @@ arch i386 version i386=SUNWprivate_1.1 end -function _cancel_prologue -version SUNWprivate_1.1 -end - -function _cancel_epilogue -version SUNWprivate_1.1 -end - function _sigoff version SUNWprivate_1.1 end @@ -967,10 +961,6 @@ function _sigon version SUNWprivate_1.1 end -function _sigdeferred -version SUNWprivate_1.1 -end - function _thr_detach version SUNWprivate_1.1 end |
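
-----

With this change the POSIX realtime interfaces listed in rt.spec and the
asynchronous I/O interfaces listed in aio.spec are exported directly from
libc, so a program that uses them needs no library beyond libc.  A minimal
sketch (illustrative; error checking elided) exercising two of the
relocated interfaces:

	#include <stdio.h>
	#include <time.h>

	int
	main(void)
	{
		struct timespec beg, end;
		struct timespec nap = { 0, 50000000 };	/* 50 milliseconds */

		(void) clock_gettime(CLOCK_REALTIME, &beg);
		(void) nanosleep(&nap, NULL);
		(void) clock_gettime(CLOCK_REALTIME, &end);
		(void) printf("slept about %ld ms\n",
		    (long)((end.tv_sec - beg.tv_sec) * 1000 +
		    (end.tv_nsec - beg.tv_nsec) / 1000000));
		return (0);
	}

Such a program should now build with cc alone; linking with -lrt or -laio
should no longer be necessary, although it remains harmless for
compatibility with existing makefiles.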