author	stevel@tonic-gate <none@none>	2005-06-14 00:00:00 -0700
committer	stevel@tonic-gate <none@none>	2005-06-14 00:00:00 -0700
commit	7c478bd95313f5f23a4c958a745db2134aa03244 (patch)
tree	c871e58545497667cbb4b0a4f2daf204743e1fe7 /usr/src/lib/libaio
OpenSolaris Launch
Diffstat (limited to 'usr/src/lib/libaio')
-rw-r--r--	usr/src/lib/libaio/Makefile	79
-rw-r--r--	usr/src/lib/libaio/Makefile.com	67
-rw-r--r--	usr/src/lib/libaio/amd64/Makefile	32
-rw-r--r--	usr/src/lib/libaio/asynch.h	79
-rw-r--r--	usr/src/lib/libaio/common/Makefile	49
-rw-r--r--	usr/src/lib/libaio/common/aio.c	2252
-rw-r--r--	usr/src/lib/libaio/common/libaio.h	339
-rw-r--r--	usr/src/lib/libaio/common/llib-laio	84
-rw-r--r--	usr/src/lib/libaio/common/ma.c	60
-rw-r--r--	usr/src/lib/libaio/common/posix_aio.c	1720
-rw-r--r--	usr/src/lib/libaio/common/scalls.c	74
-rw-r--r--	usr/src/lib/libaio/common/sig.c	301
-rw-r--r--	usr/src/lib/libaio/common/subr.c	61
-rw-r--r--	usr/src/lib/libaio/i386/Makefile	31
-rw-r--r--	usr/src/lib/libaio/sparc/Makefile	31
-rw-r--r--	usr/src/lib/libaio/sparcv9/Makefile	32
-rw-r--r--	usr/src/lib/libaio/spec/Makefile	30
-rw-r--r--	usr/src/lib/libaio/spec/Makefile.targ	37
-rw-r--r--	usr/src/lib/libaio/spec/aio.spec	208
-rw-r--r--	usr/src/lib/libaio/spec/amd64/Makefile	44
-rw-r--r--	usr/src/lib/libaio/spec/i386/Makefile	44
-rw-r--r--	usr/src/lib/libaio/spec/sparc/Makefile	44
-rw-r--r--	usr/src/lib/libaio/spec/sparcv9/Makefile	45
-rw-r--r--	usr/src/lib/libaio/spec/versions	55
24 files changed, 5798 insertions(+), 0 deletions(-)
diff --git a/usr/src/lib/libaio/Makefile b/usr/src/lib/libaio/Makefile
new file mode 100644
index 0000000000..cb0baf37ea
--- /dev/null
+++ b/usr/src/lib/libaio/Makefile
@@ -0,0 +1,79 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License"). You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 1997-2003 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# ident "%Z%%M% %I% %E% SMI"
+#
+
+include ../Makefile.lib
+
+SUBDIRS = $(MACH)
+$(BUILD64)SUBDIRS += $(MACH64)
+
+all := TARGET= all
+clean := TARGET= clean
+clobber := TARGET= clobber
+install := TARGET= install
+lint := TARGET= lint
+
+.KEEP_STATE:
+
+all clean clobber install: spec .WAIT $(SUBDIRS)
+
+lint: $(SUBDIRS)
+
+LIBRARY= libaio.a
+XGETFLAGS= -a
+POFILE= $(LIBRARY:.a=.po)
+POFILES= generic.po
+
+# definitions for install_h target
+HDRS= asynch.h
+HDRDIR= .
+ROOTHDRDIR= $(ROOT)/usr/include/sys
+
+install_h: $(ROOTHDRS)
+
+check: $(CHECKHDRS)
+
+spec $(SUBDIRS): FRC
+ @cd $@; pwd; $(MAKE) $(TARGET)
+
+_msg: $(MSGDOMAIN) $(POFILE)
+ $(RM) $(MSGDOMAIN)/$(POFILE)
+ $(CP) $(POFILE) $(MSGDOMAIN)
+
+$(POFILE): $(POFILES)
+ $(RM) $@
+ $(CAT) $(POFILES) > $@
+
+$(POFILES):
+ $(RM) messages.po
+ $(XGETTEXT) $(XGETFLAGS) *.[ch]* */*.[ch]*
+ $(SED) -e '/^# msg/d' -e '/^domain/d' messages.po > $@
+ $(RM) messages.po
+
+FRC:
+
+include ../Makefile.targ
diff --git a/usr/src/lib/libaio/Makefile.com b/usr/src/lib/libaio/Makefile.com
new file mode 100644
index 0000000000..21f50e208b
--- /dev/null
+++ b/usr/src/lib/libaio/Makefile.com
@@ -0,0 +1,67 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License"). You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# ident "%Z%%M% %I% %E% SMI"
+#
+
+LIBRARY= libaio.a
+VERS= .1
+
+COBJS= aio.o \
+ posix_aio.o \
+ scalls.o \
+ sig.o \
+ subr.o \
+ ma.o
+
+OBJECTS= $(COBJS) $(MOBJS)
+
+include ../../Makefile.lib
+include ../../Makefile.rootfs
+
+SRCS= $(COBJS:%.o=../common/%.c)
+
+LIBS = $(DYNLIB) $(LINTLIB)
+LDLIBS += -lc
+$(LINTLIB) := SRCS = $(SRCDIR)/$(LINTSRC)
+
+SRCDIR = ../common
+MAPDIR = ../spec/$(TRANSMACH)
+SPECMAPFILE = $(MAPDIR)/mapfile
+
+CFLAGS += $(CCVERBOSE)
+CPPFLAGS += -I. -Iinc -I.. -I$(SRCDIR)
+
+.KEEP_STATE:
+
+all: $(LIBS)
+
+lint: lintcheck
+
+include ../../Makefile.targ
+
+pics/%.o: $(MDIR)/%.s
+ $(BUILD.s)
+ $(POST_PROCESS_O)
diff --git a/usr/src/lib/libaio/amd64/Makefile b/usr/src/lib/libaio/amd64/Makefile
new file mode 100644
index 0000000000..cb39a2beff
--- /dev/null
+++ b/usr/src/lib/libaio/amd64/Makefile
@@ -0,0 +1,32 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License"). You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# ident "%Z%%M% %I% %E% SMI"
+#
+
+include ../Makefile.com
+include ../../Makefile.lib.64
+
+install: all $(ROOTLIBS64) $(ROOTLINKS64)
diff --git a/usr/src/lib/libaio/asynch.h b/usr/src/lib/libaio/asynch.h
new file mode 100644
index 0000000000..48dc3dc622
--- /dev/null
+++ b/usr/src/lib/libaio/asynch.h
@@ -0,0 +1,79 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 1991 by Sun Microsystems, Inc.
+ */
+
+#ifndef _SYS_ASYNCH_H
+#define _SYS_ASYNCH_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/feature_tests.h>
+#include <sys/types.h>
+#include <sys/aio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define AIO_INPROGRESS -2 /* values not set by the system */
+
+/* large file compilation environment setup */
+#if !defined(_LP64) && _FILE_OFFSET_BITS == 64
+#ifdef __PRAGMA_REDEFINE_EXTNAME
+#pragma redefine_extname aioread aioread64
+#pragma redefine_extname aiowrite aiowrite64
+#else
+#define aioread aioread64
+#define aiowrite aiowrite64
+#endif
+#endif /* _FILE_OFFSET_BITS */
+
+#if defined(_LP64) && defined(_LARGEFILE64_SOURCE)
+#ifdef __PRAGMA_REDEFINE_EXTNAME
+#pragma redefine_extname aioread64 aioread
+#pragma redefine_extname aiowrite64 aiowrite
+#else
+#define aioread64 aioread
+#define aiowrite64 aiowrite
+#endif
+#endif /* _LP64 && _LARGEFILE64_SOURCE */
+extern int aioread(int, caddr_t, int, off_t, int, aio_result_t *);
+extern int aiowrite(int, caddr_t, int, off_t, int, aio_result_t *);
+extern int aiocancel(aio_result_t *);
+extern aio_result_t *aiowait(struct timeval *);
+
+/* transitional large file interfaces */
+#if defined(_LARGEFILE64_SOURCE) && !((_FILE_OFFSET_BITS == 64) && \
+ !defined(__PRAGMA_REDEFINE_EXTNAME))
+extern int aioread64(int, caddr_t, int, off64_t, int, aio_result_t *);
+extern int aiowrite64(int, caddr_t, int, off64_t, int, aio_result_t *);
+#endif /* _LARGEFILE64_SOURCE... */
+
+#define	MAXASYNCHIO 200		/* max. number of outstanding i/o's */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ASYNCH_H */
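
The declarations above are the classic SunOS asynchronous I/O interface: the caller owns an aio_result_t, submits with aioread()/aiowrite(), and reaps completions with aiowait(). A minimal usage sketch follows; the input path /tmp/data is hypothetical, and the aio_result_t fields (aio_return, aio_errno) are those from <sys/aio.h> as used throughout aio.c below.

	#include <sys/types.h>
	#include <sys/asynch.h>
	#include <sys/time.h>
	#include <fcntl.h>
	#include <stdio.h>

	int
	main(void)
	{
		char buf[512];
		aio_result_t res;
		aio_result_t *donep;
		struct timeval tv = { 5, 0 };		/* wait at most 5 seconds */
		int fd = open("/tmp/data", O_RDONLY);	/* hypothetical input file */

		if (fd == -1)
			return (1);
		res.aio_return = AIO_INPROGRESS;	/* mark as not yet complete */
		if (aioread(fd, buf, sizeof (buf), 0, SEEK_SET, &res) == -1)
			return (1);
		donep = aiowait(&tv);			/* reap one completion */
		if (donep == &res && res.aio_errno == 0)
			(void) printf("read %ld bytes\n", (long)res.aio_return);
		else
			(void) aiocancel(&res);		/* timed out or failed */
		return (0);
	}
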
diff --git a/usr/src/lib/libaio/common/Makefile b/usr/src/lib/libaio/common/Makefile
new file mode 100644
index 0000000000..5a58f96bc0
--- /dev/null
+++ b/usr/src/lib/libaio/common/Makefile
@@ -0,0 +1,49 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License"). You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# ident "%Z%%M% %I% %E% SMI"
+#
+# lib/libaio/common/Makefile
+
+LINTSRC32= lintsrc32
+LINTOUT32= lint32.out
+LINTLIB32= $(LIBNAME)32
+$(LINTSRC32):= LINTFLAGS +=
+
+LINTSRC64= lintsrc64
+LINTOUT64= lint64.out
+LINTLIB64= $(LIBNAME)64
+$(LINTSRC64):= LINTFLAGS64 += -fd -Xtransition=yes
+
+lints : $(LINTSRC32) $(LINTSRC64)
+
+$(LINTSRC32): $$(SRCS)
+ $(LINT.c) -o $(LINTLIB32) $(SRCS) > $(LINTOUT32) 2>&1
+
+$(LINTSRC64): $$(SRCS)
+ $(LINT64.c) -o $(LINTLIB64) $(SRCS) > $(LINTOUT64) 2>&1
+
+include ../Makefile.com
+
diff --git a/usr/src/lib/libaio/common/aio.c b/usr/src/lib/libaio/common/aio.c
new file mode 100644
index 0000000000..34f66f8824
--- /dev/null
+++ b/usr/src/lib/libaio/common/aio.c
@@ -0,0 +1,2252 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include "libaio.h"
+#include <sys/param.h>
+#include <sys/file.h>
+#include <sys/port.h>
+
+static int _aio_hash_insert(aio_result_t *, aio_req_t *);
+static aio_req_t *_aio_req_alloc(void);
+static aio_req_t *_aio_req_get(aio_worker_t *);
+static void _aio_req_add(aio_req_t *, aio_worker_t **, int);
+static void _aio_req_del(aio_worker_t *, aio_req_t *, int);
+static aio_result_t *_aio_req_done(void);
+static void _aio_work_done(aio_worker_t *);
+aio_req_t *_aio_req_remove(aio_req_t *reqp);
+static void _aio_enq_doneq(aio_req_t *reqp);
+int _aio_get_timedelta(struct timespec *end, struct timespec *wait);
+
+aio_req_t *_aio_hash_find(aio_result_t *);
+void _aio_req_free(aio_req_t *);
+void _aio_lock(void);
+void _aio_unlock(void);
+
+extern int __fdsync(int fd, int mode);
+extern int _sigprocmask(int, const sigset_t *, sigset_t *);
+extern int _port_dispatch(int, int, int, int, uintptr_t, void *);
+
+static int _aio_fsync_del(aio_req_t *, aio_lio_t *);
+static int _aiodone(aio_req_t *, aio_lio_t *, int, ssize_t, int);
+static void _aio_cancel_work(aio_worker_t *, int, int *, int *);
+
+#ifdef DEBUG
+void _aio_stats(void);
+#endif
+
+int _pagesize;
+
+#define AIOREQSZ (sizeof (struct aio_req))
+#define AIOCLICKS ((_pagesize)/AIOREQSZ)
+#define HASHSZ 8192L /* power of 2 */
+#define AIOHASH(resultp) ((((uintptr_t)(resultp) >> 13) ^ \
+ ((uintptr_t)(resultp))) & (HASHSZ-1))
+#define POSIX_AIO(x) ((x)->req_type == AIO_POSIX_REQ)
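+
The AIOHASH() macro above xor-folds the high bits of the result-pointer address over its low bits before masking with HASHSZ-1, so result structures at similar alignments do not pile into the same bucket. A self-contained sketch of the same computation, with purely illustrative addresses:

	#include <stdio.h>
	#include <stdint.h>

	#define	HASHSZ		8192L	/* power of 2, as in aio.c */
	#define	AIOHASH(p)	((((uintptr_t)(p) >> 13) ^ ((uintptr_t)(p))) & (HASHSZ-1))

	int
	main(void)
	{
		/*
		 * Two addresses 8192 bytes apart collide on the low-bits
		 * mask alone (both end in 13 zero bits); the shifted xor
		 * separates them into distinct buckets (0 and 1 here).
		 */
		uintptr_t a = 0x10000000;
		uintptr_t b = 0x10002000;

		(void) printf("%lu %lu\n",
		    (unsigned long)AIOHASH(a), (unsigned long)AIOHASH(b));
		return (0);
	}
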
+
+/*
+ * switch for kernel async I/O
+ */
+int _kaio_ok = 0; /* 0 = disabled, 1 = on, -1 = error */
+
+/*
+ * Key for thread-specific data
+ */
+thread_key_t _aio_key = 0;
+
+/*
+ * Array for determining whether or not a file supports kaio
+ */
+uint32_t _kaio_supported[MAX_KAIO_FDARRAY_SIZE];
+
+int _aioreqsize = AIOREQSZ;
+
+#ifdef DEBUG
+int *_donecnt; /* per worker AIO done count */
+int *_idlecnt; /* per worker idle count */
+int *_qfullcnt; /* per worker full q count */
+int *_firstqcnt; /* num times queue one is used */
+int *_newworker; /* num times new worker is created */
+int _clogged = 0; /* num times all queues are locked */
+int _qlocked = 0; /* num times submitter finds q locked */
+int _aio_submitcnt = 0;
+int _aio_submitcnt2 = 0;
+int _submitcnt = 0;
+int _avesubmitcnt = 0;
+int _aiowaitcnt = 0;
+int _startaiowaitcnt = 1;
+int _avedone = 0;
+int _new_workers = 0;
+#endif
+
+/*
+ * workers for read requests.
+ * (__aio_mutex lock protects circular linked list of workers.)
+ */
+aio_worker_t *__workers_rd; /* circular list of AIO workers */
+aio_worker_t *__nextworker_rd; /* next worker in list of workers */
+int __rd_workerscnt; /* number of read workers */
+
+/*
+ * workers for write requests.
+ * (__aio_mutex lock protects circular linked list of workers.)
+ */
+aio_worker_t *__workers_wr; /* circular list of AIO workers */
+aio_worker_t *__nextworker_wr; /* next worker in list of workers */
+int __wr_workerscnt; /* number of write workers */
+
+/*
+ * worker for sigevent requests.
+ */
+aio_worker_t *__workers_si; /* circular list of AIO workers */
+aio_worker_t *__nextworker_si; /* next worker in list of workers */
+int __si_workerscnt;		/* number of sigevent workers */
+
+struct aio_req *_aio_done_tail; /* list of done requests */
+struct aio_req *_aio_done_head;
+
+mutex_t __aio_initlock = DEFAULTMUTEX; /* makes aio initialization atomic */
+mutex_t __aio_mutex = DEFAULTMUTEX; /* protects counts, and linked lists */
+mutex_t __aio_cachefillock = DEFAULTMUTEX; /* single-thread aio cache filling */
+cond_t _aio_iowait_cv = DEFAULTCV; /* wait for userland I/Os */
+cond_t __aio_cachefillcv = DEFAULTCV; /* sleep cv for cache filling */
+
+mutex_t __lio_mutex = DEFAULTMUTEX; /* protects lio lists */
+
+int __aiostksz; /* aio worker's stack size */
+int __aio_cachefilling = 0; /* set when aio cache is filling */
+int __sigio_masked = 0; /* bit mask for SIGIO signal */
+int __sigio_maskedcnt = 0; /* mask count for SIGIO signal */
+pid_t __pid = (pid_t)-1; /* initialize as invalid pid */
+static struct aio_req **_aio_hash;
+static struct aio_req *_aio_freelist;
+static struct aio_req *_aio_doneq; /* double linked done queue list */
+static int _aio_freelist_cnt;
+
+static struct sigaction act;
+
+cond_t _aio_done_cv = DEFAULTCV;
+
+/*
+ * Input queue of requests which is serviced by the aux. threads.
+ */
+cond_t _aio_idle_cv = DEFAULTCV;
+
+int _aio_cnt = 0;
+int _aio_donecnt = 0;
+int _aio_waitncnt = 0;			/* # of requests for aio_waitn */
+int _aio_doneq_cnt = 0;
+int _aio_outstand_cnt = 0; /* number of outstanding requests */
+int _aio_outstand_waitn = 0; /* # of queued requests for aio_waitn */
+int _aio_req_done_cnt = 0; /* req. done but not in "done queue" */
+int _aio_kernel_suspend = 0; /* active kernel kaio calls */
+int _aio_suscv_cnt = 0; /* aio_suspend calls waiting on cv's */
+
+int _max_workers = 256; /* max number of workers permitted */
+int _min_workers = 8; /* min number of workers */
+int _maxworkload = 32; /* max length of worker's request q */
+int _minworkload = 2; /* min number of request in q */
+int _aio_worker_cnt = 0; /* number of workers to do requests */
+int _idle_workers = 0; /* number of idle workers */
+int __uaio_ok = 0; /* AIO has been enabled */
+sigset_t _worker_set; /* worker's signal mask */
+
+int _aiowait_flag = 0;		/* when set, aiowait() is in progress */
+int _aio_flags = 0; /* see libaio.h defines for */
+
+struct aio_worker *_kaiowp; /* points to kaio cleanup thread */
+
+/*
+ * called in the child process after a fork. the child's state is
+ * cleaned up so that it can use libaio.
+ */
+void
+_aio_forkinit(void)
+{
+ __uaio_ok = 0;
+ __workers_rd = NULL;
+ __nextworker_rd = NULL;
+ __workers_wr = NULL;
+ __nextworker_wr = NULL;
+ _aio_done_tail = NULL;
+ _aio_done_head = NULL;
+ _aio_hash = NULL;
+ _aio_freelist = NULL;
+ _aio_freelist_cnt = 0;
+ _aio_doneq = NULL;
+ _aio_doneq_cnt = 0;
+ _aio_waitncnt = 0;
+ _aio_outstand_cnt = 0;
+ _aio_outstand_waitn = 0;
+ _aio_req_done_cnt = 0;
+ _aio_kernel_suspend = 0;
+ _aio_suscv_cnt = 0;
+ _aio_flags = 0;
+ _aio_worker_cnt = 0;
+ _idle_workers = 0;
+ _kaio_ok = 0;
+#ifdef DEBUG
+ _clogged = 0;
+ _qlocked = 0;
+#endif
+}
+
+#ifdef DEBUG
+/*
+ * print out a bunch of interesting statistics when the process
+ * exits.
+ */
+void
+_aio_stats()
+{
+ int i;
+ char *fmt;
+ int cnt;
+ FILE *fp;
+
+ fp = fopen("/tmp/libaio.log", "w+a");
+ if (fp == NULL)
+ return;
+ fprintf(fp, "size of AIO request struct = %d bytes\n", _aioreqsize);
+ fprintf(fp, "number of AIO workers = %d\n", _aio_worker_cnt);
+ cnt = _aio_worker_cnt + 1;
+ for (i = 2; i <= cnt; i++) {
+ fmt = "%d done %d, idle = %d, qfull = %d, newworker = %d\n";
+ fprintf(fp, fmt, i, _donecnt[i], _idlecnt[i], _qfullcnt[i],
+ _newworker[i]);
+ }
+ fprintf(fp, "num times submitter found next work queue locked = %d\n",
+ _qlocked);
+ fprintf(fp, "num times submitter found all work queues locked = %d\n",
+ _clogged);
+ fprintf(fp, "average submit request = %d\n", _avesubmitcnt);
+ fprintf(fp, "average number of submit requests per new worker = %d\n",
+ _avedone);
+}
+#endif
+
+/*
+ * libaio is initialized when the first AIO request is made. important
+ * constants are initialized here, such as the maximum number of workers
+ * that libaio can create and the minimum number of workers permitted
+ * before restrictions are imposed. some initial workers are also created.
+ */
+int
+__uaio_init(void)
+{
+ int i;
+ size_t size;
+ extern sigset_t __sigiomask;
+ struct sigaction oact;
+
+ (void) mutex_lock(&__aio_initlock);
+ if (_aio_key == 0 &&
+ thr_keycreate(&_aio_key, _aio_free_worker) != 0)
+ _aiopanic("__uaio_init, thr_keycreate()\n");
+ if (!__uaio_ok) {
+ __pid = getpid();
+
+ if (_sigaction(SIGAIOCANCEL, NULL, &oact) == -1) {
+ (void) mutex_unlock(&__aio_initlock);
+ return (-1);
+ }
+
+ if (oact.sa_handler != aiosigcancelhndlr) {
+ act.sa_handler = aiosigcancelhndlr;
+ act.sa_flags = SA_SIGINFO;
+ if (_sigaction(SIGAIOCANCEL, &act, &sigcanact) == -1) {
+ (void) mutex_unlock(&__aio_initlock);
+ return (-1);
+ }
+ }
+
+ /*
+ * Constant sigiomask, used by _aiosendsig()
+ */
+ (void) sigaddset(&__sigiomask, SIGIO);
+#ifdef DEBUG
+ size = _max_workers * (sizeof (int) * 5 +
+ sizeof (int));
+ _donecnt = malloc(size);
+ (void) memset((caddr_t)_donecnt, 0, size);
+ _idlecnt = _donecnt + _max_workers;
+ _qfullcnt = _idlecnt + _max_workers;
+ _firstqcnt = _qfullcnt + _max_workers;
+ _newworker = _firstqcnt + _max_workers;
+ atexit(_aio_stats);
+#endif
+ size = HASHSZ * sizeof (struct aio_req *);
+ _aio_hash = malloc(size);
+ if (_aio_hash == NULL) {
+ (void) mutex_unlock(&__aio_initlock);
+ return (-1);
+ }
+ (void) memset((caddr_t)_aio_hash, 0, size);
+
+ /* initialize worker's signal mask to only catch SIGAIOCANCEL */
+ (void) sigfillset(&_worker_set);
+ (void) sigdelset(&_worker_set, SIGAIOCANCEL);
+
+ /*
+ * Create equal number of READ and WRITE workers.
+ */
+ i = 0;
+ while (i++ < (_min_workers/2))
+ (void) _aio_create_worker(NULL, AIOREAD);
+ i = 0;
+ while (i++ < (_min_workers/2))
+ (void) _aio_create_worker(NULL, AIOWRITE);
+
+ /* create one worker to send completion signals. */
+ (void) _aio_create_worker(NULL, AIOSIGEV);
+ (void) mutex_unlock(&__aio_initlock);
+ __uaio_ok = 1;
+ return (0);
+ }
+
+ (void) mutex_unlock(&__aio_initlock);
+ return (0);
+}
+
+/*
+ * special kaio cleanup thread sits in a loop in the
+ * kernel waiting for pending kaio requests to complete.
+ */
+void *
+_kaio_cleanup_thread(void *arg)
+{
+ if (thr_setspecific(_aio_key, arg) != 0)
+ _aiopanic("_kaio_cleanup_thread, thr_setspecific()\n");
+ (void) _kaio(AIOSTART);
+ return (arg);
+}
+
+/*
+ * initialize kaio.
+ */
+void
+_kaio_init()
+{
+ int error;
+ sigset_t set, oset;
+
+ (void) mutex_lock(&__aio_initlock);
+ if (_aio_key == 0 &&
+ thr_keycreate(&_aio_key, _aio_free_worker) != 0)
+ _aiopanic("_kaio_init, thr_keycreate()\n");
+ if (!_kaio_ok) {
+ _pagesize = (int)PAGESIZE;
+ __aiostksz = 8 * _pagesize;
+ if ((_kaiowp = _aio_alloc_worker()) == NULL) {
+ error = ENOMEM;
+ } else {
+ if ((error = (int)_kaio(AIOINIT)) == 0) {
+ (void) sigfillset(&set);
+ (void) _sigprocmask(SIG_SETMASK, &set, &oset);
+ error = thr_create(NULL, __aiostksz,
+ _kaio_cleanup_thread, _kaiowp,
+ THR_BOUND | THR_DAEMON, &_kaiowp->work_tid);
+ (void) _sigprocmask(SIG_SETMASK, &oset, NULL);
+ }
+ if (error) {
+ _aio_free_worker(_kaiowp);
+ _kaiowp = NULL;
+ }
+ }
+ if (error)
+ _kaio_ok = -1;
+ else
+ _kaio_ok = 1;
+ }
+ (void) mutex_unlock(&__aio_initlock);
+}
+
+int
+aioread(int fd, caddr_t buf, int bufsz, off_t offset, int whence,
+ aio_result_t *resultp)
+{
+ return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOREAD));
+}
+
+int
+aiowrite(int fd, caddr_t buf, int bufsz, off_t offset, int whence,
+ aio_result_t *resultp)
+{
+ return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOWRITE));
+}
+
+#if defined(_LARGEFILE64_SOURCE) && !defined(_LP64)
+int
+aioread64(int fd, caddr_t buf, int bufsz, off64_t offset, int whence,
+ aio_result_t *resultp)
+{
+ return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOAREAD64));
+}
+
+int
+aiowrite64(int fd, caddr_t buf, int bufsz, off64_t offset, int whence,
+ aio_result_t *resultp)
+{
+ return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOAWRITE64));
+}
+#endif /* (_LARGEFILE64_SOURCE) && !defined(_LP64) */
+
+int
+_aiorw(int fd, caddr_t buf, int bufsz, offset_t offset, int whence,
+ aio_result_t *resultp, int mode)
+{
+ aio_worker_t **nextworker;
+ aio_req_t *aiorp = NULL;
+ aio_args_t *ap = NULL;
+ offset_t loffset = 0;
+ struct stat stat;
+ int err = 0;
+ int kerr;
+ int umode;
+
+ switch (whence) {
+
+ case SEEK_SET:
+ loffset = offset;
+ break;
+ case SEEK_CUR:
+ if ((loffset = llseek(fd, 0, SEEK_CUR)) == -1)
+ err = -1;
+ else
+ loffset += offset;
+ break;
+ case SEEK_END:
+ if (fstat(fd, &stat) == -1)
+ err = -1;
+ else
+ loffset = offset + stat.st_size;
+ break;
+ default:
+ errno = EINVAL;
+ err = -1;
+ }
+
+ if (err)
+ return (err);
+
+ /* initialize kaio */
+ if (!_kaio_ok)
+ _kaio_init();
+
+ /*
+ * _aio_do_request() needs the original request code (mode) to be able
+	 * to choose the appropriate 32/64 bit function. All other functions
+ * only require the difference between READ and WRITE (umode).
+ */
+ if (mode == AIOAREAD64 || mode == AIOAWRITE64)
+ umode = mode - AIOAREAD64;
+ else
+ umode = mode;
+
+ /*
+ * Try kernel aio first.
+ * If errno is ENOTSUP/EBADFD, fall back to the thread implementation.
+ */
+ if ((_kaio_ok > 0) && (KAIO_SUPPORTED(fd))) {
+ resultp->aio_errno = 0;
+ kerr = (int)_kaio(((resultp->aio_return == AIO_INPROGRESS) ?
+ (umode | AIO_POLL_BIT) : umode),
+ fd, buf, bufsz, loffset, resultp);
+ if (kerr == 0)
+ return (0);
+ else if ((errno != ENOTSUP) && (errno != EBADFD))
+ return (-1);
+ if (errno == EBADFD)
+ SET_KAIO_NOT_SUPPORTED(fd);
+ }
+ if (!__uaio_ok) {
+ if (__uaio_init() == -1)
+ return (-1);
+ }
+
+ aiorp = _aio_req_alloc();
+ if (aiorp == (aio_req_t *)-1) {
+ errno = EAGAIN;
+ return (-1);
+ }
+
+ /*
+ * _aio_do_request() checks aiorp->req_op to differentiate
+ * between 32 and 64 bit access.
+ */
+ aiorp->req_op = mode;
+ aiorp->req_resultp = resultp;
+ ap = &(aiorp->req_args);
+ ap->fd = fd;
+ ap->buf = buf;
+ ap->bufsz = bufsz;
+ ap->offset = loffset;
+
+ nextworker = ((umode == AIOWRITE) ? &__nextworker_wr :
+ &__nextworker_rd);
+ _aio_lock();
+ if (_aio_hash_insert(resultp, aiorp)) {
+ _aio_req_free(aiorp);
+ _aio_unlock();
+ errno = EINVAL;
+ return (-1);
+ } else {
+ _aio_unlock();
+
+ /*
+ * _aio_req_add() only needs the difference between READ and
+ * WRITE to choose the right worker queue.
+ */
+ _aio_req_add(aiorp, nextworker, umode);
+ return (0);
+ }
+}
+
+int
+aiocancel(aio_result_t *resultp)
+{
+ aio_req_t *aiorp;
+ struct aio_worker *aiowp;
+ int done = 0, canceled = 0;
+
+ if (!__uaio_ok) {
+ errno = EINVAL;
+ return (-1);
+ }
+
+ _aio_lock();
+ aiorp = _aio_hash_find(resultp);
+ if (aiorp == NULL) {
+ if (_aio_outstand_cnt == _aio_req_done_cnt)
+ errno = EINVAL;
+ else
+ errno = EACCES;
+
+ _aio_unlock();
+ return (-1);
+ } else {
+ aiowp = aiorp->req_worker;
+ (void) mutex_lock(&aiowp->work_qlock1);
+ (void) _aio_cancel_req(aiowp, aiorp, &canceled, &done);
+ (void) mutex_unlock(&aiowp->work_qlock1);
+
+ if (canceled) {
+ _aio_unlock();
+ return (0);
+ }
+
+ if (_aio_outstand_cnt == 0) {
+ _aio_unlock();
+ errno = EINVAL;
+ return (-1);
+ }
+
+ if (_aio_outstand_cnt == _aio_req_done_cnt) {
+ errno = EINVAL;
+ } else {
+ errno = EACCES;
+ }
+
+ _aio_unlock();
+ return (-1);
+
+ }
+}
+
+/*
+ * This must be asynch safe
+ */
+aio_result_t *
+aiowait(struct timeval *uwait)
+{
+ aio_result_t *uresultp, *kresultp, *resultp;
+ int dontblock;
+ int timedwait = 0;
+ int kaio_errno = 0;
+ struct timeval twait, *wait = NULL;
+ hrtime_t hrtend;
+ hrtime_t hres;
+
+ if (uwait) {
+ /*
+		 * Check that the specified wait time is valid. If it is
+		 * invalid, fail the call right away.
+ */
+ if (uwait->tv_sec < 0 || uwait->tv_usec < 0 ||
+ uwait->tv_usec >= MICROSEC) {
+ errno = EINVAL;
+ return ((aio_result_t *)-1);
+ }
+
+ if ((uwait->tv_sec > 0) || (uwait->tv_usec > 0)) {
+ hrtend = gethrtime() +
+ (hrtime_t)uwait->tv_sec * NANOSEC +
+ (hrtime_t)uwait->tv_usec * (NANOSEC / MICROSEC);
+ twait = *uwait;
+ wait = &twait;
+ timedwait++;
+ } else {
+ /* polling */
+ kresultp = (aio_result_t *)_kaio(AIOWAIT,
+ (struct timeval *)-1, 1);
+ if (kresultp != (aio_result_t *)-1 &&
+ kresultp != NULL && kresultp != (aio_result_t *)1)
+ return (kresultp);
+ _aio_lock();
+ uresultp = _aio_req_done();
+ if (uresultp != NULL && uresultp !=
+ (aio_result_t *)-1) {
+ _aio_unlock();
+ return (uresultp);
+ }
+ _aio_unlock();
+ if (uresultp == (aio_result_t *)-1 &&
+ kresultp == (aio_result_t *)-1) {
+ errno = EINVAL;
+ return ((aio_result_t *)-1);
+ } else
+ return (NULL);
+ }
+ }
+
+ for (;;) {
+ _aio_lock();
+ uresultp = _aio_req_done();
+ if (uresultp != NULL && uresultp != (aio_result_t *)-1) {
+ _aio_unlock();
+ resultp = uresultp;
+ break;
+ }
+ _aiowait_flag++;
+ _aio_unlock();
+ dontblock = (uresultp == (aio_result_t *)-1);
+ kresultp = (aio_result_t *)_kaio(AIOWAIT, wait, dontblock);
+ kaio_errno = errno;
+ _aio_lock();
+ _aiowait_flag--;
+ _aio_unlock();
+ if (kresultp == (aio_result_t *)1) {
+ /* aiowait() awakened by an aionotify() */
+ continue;
+ } else if (kresultp != NULL && kresultp != (aio_result_t *)-1) {
+ resultp = kresultp;
+ break;
+ } else if (kresultp == (aio_result_t *)-1 && kaio_errno ==
+ EINVAL && uresultp == (aio_result_t *)-1) {
+ errno = kaio_errno;
+ resultp = (aio_result_t *)-1;
+ break;
+ } else if (kresultp == (aio_result_t *)-1 &&
+ kaio_errno == EINTR) {
+ errno = kaio_errno;
+ resultp = (aio_result_t *)-1;
+ break;
+ } else if (timedwait) {
+ hres = hrtend - gethrtime();
+ if (hres <= 0) {
+ /* time is up. Return */
+ resultp = NULL;
+ break;
+ } else {
+ /*
+ * some time left. Round up the remaining time
+ * in nanoseconds to microsec. Retry the call.
+ */
+ hres += (NANOSEC / MICROSEC)-1;
+ wait->tv_sec = hres / NANOSEC;
+ wait->tv_usec =
+ (hres % NANOSEC) / (NANOSEC / MICROSEC);
+ }
+ } else {
+ ASSERT((kresultp == NULL && uresultp == NULL));
+ resultp = NULL;
+ continue;
+ }
+ }
+ return (resultp);
+}
+
+/*
+ * _aio_get_timedelta calculates the remaining time and stores the result
+ * into struct timespec *wait.
+ */
+
+int
+_aio_get_timedelta(struct timespec *end, struct timespec *wait)
+{
+
+ int ret = 0;
+ struct timeval cur;
+ struct timespec curtime;
+
+ (void) gettimeofday(&cur, NULL);
+ curtime.tv_sec = cur.tv_sec;
+ curtime.tv_nsec = cur.tv_usec * 1000; /* convert us to ns */
+
+ if (end->tv_sec >= curtime.tv_sec) {
+ wait->tv_sec = end->tv_sec - curtime.tv_sec;
+ if (end->tv_nsec >= curtime.tv_nsec) {
+ wait->tv_nsec = end->tv_nsec - curtime.tv_nsec;
+ if (wait->tv_sec == 0 && wait->tv_nsec == 0)
+ ret = -1; /* timer expired */
+ } else {
+ if (end->tv_sec > curtime.tv_sec) {
+ wait->tv_sec -= 1;
+ wait->tv_nsec = NANOSEC -
+ (curtime.tv_nsec - end->tv_nsec);
+ } else {
+ ret = -1; /* timer expired */
+ }
+ }
+ } else {
+ ret = -1;
+ }
+ return (ret);
+}
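+
A concrete walk through the borrow case above: with a deadline of {tv_sec = 10, tv_nsec = 100000000} and a current time of {9, 900000000}, the nanosecond field borrows one second and the remaining wait is {0, 200000000}. A standalone sketch of that arithmetic, with fixed inputs in place of gettimeofday():

	#include <stdio.h>
	#include <time.h>

	int
	main(void)
	{
		struct timespec end = { 10, 100000000 };	/* deadline */
		struct timespec cur = { 9, 900000000 };		/* "current" time */
		struct timespec wait;

		wait.tv_sec = end.tv_sec - cur.tv_sec;		/* 1 */
		if (end.tv_nsec >= cur.tv_nsec) {
			wait.tv_nsec = end.tv_nsec - cur.tv_nsec;
		} else {					/* borrow a second */
			wait.tv_sec -= 1;
			wait.tv_nsec = 1000000000L - (cur.tv_nsec - end.tv_nsec);
		}
		(void) printf("%ld.%09ld\n", (long)wait.tv_sec, (long)wait.tv_nsec);
		return (0);	/* prints 0.200000000 */
	}
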
+
+/*
+ * If closing by file descriptor: we will simply cancel all the outstanding
+ * aio`s and return. Those aio's in question will have either noticed the
+ * cancellation notice before, during, or after initiating io.
+ */
+int
+aiocancel_all(int fd)
+{
+ aio_req_t *aiorp;
+ aio_req_t **aiorpp;
+ struct aio_worker *first, *next;
+ int canceled = 0;
+ int done = 0;
+ int cancelall = 0;
+
+ if (_aio_outstand_cnt == 0)
+ return (AIO_ALLDONE);
+
+ _aio_lock();
+ /*
+ * cancel read requests from the read worker's queue.
+ */
+ first = __nextworker_rd;
+ next = first;
+ do {
+ _aio_cancel_work(next, fd, &canceled, &done);
+ } while ((next = next->work_forw) != first);
+
+ /*
+ * cancel write requests from the write workers queue.
+ */
+
+ first = __nextworker_wr;
+ next = first;
+ do {
+ _aio_cancel_work(next, fd, &canceled, &done);
+ } while ((next = next->work_forw) != first);
+
+ /*
+ * finally, check if there are requests on the done queue that
+ * should be canceled.
+ */
+ if (fd < 0)
+ cancelall = 1;
+ aiorpp = &_aio_done_tail;
+ while ((aiorp = *aiorpp) != NULL) {
+ if (cancelall || aiorp->req_args.fd == fd) {
+ *aiorpp = aiorp->req_next;
+ _aio_donecnt--;
+ (void) _aio_hash_del(aiorp->req_resultp);
+ _aio_req_free(aiorp);
+ } else
+ aiorpp = &aiorp->req_next;
+ }
+ if (cancelall) {
+ ASSERT(_aio_donecnt == 0);
+ _aio_done_head = NULL;
+ }
+ _aio_unlock();
+
+ if (canceled && done == 0)
+ return (AIO_CANCELED);
+ else if (done && canceled == 0)
+ return (AIO_ALLDONE);
+ else if ((canceled + done == 0) && KAIO_SUPPORTED(fd))
+ return ((int)_kaio(AIOCANCEL, fd, NULL));
+ return (AIO_NOTCANCELED);
+}
+
+/*
+ * cancel requests from a given work queue. if the file descriptor
+ * parameter, fd, is non-negative, then cancel only those requests in
+ * this queue that refer to this file descriptor. if the "fd"
+ * parameter is -1, then cancel all requests.
+ */
+static void
+_aio_cancel_work(aio_worker_t *aiowp, int fd, int *canceled, int *done)
+{
+ aio_req_t *aiorp;
+
+ (void) mutex_lock(&aiowp->work_qlock1);
+ /*
+ * cancel queued requests first.
+ */
+ aiorp = aiowp->work_tail1;
+ while (aiorp != NULL) {
+ if (fd < 0 || aiorp->req_args.fd == fd) {
+ if (_aio_cancel_req(aiowp, aiorp, canceled, done)) {
+ /*
+				 * caller's locks were dropped, so aiorp is
+				 * invalid; start traversing the list from
+ * the beginning.
+ */
+ aiorp = aiowp->work_tail1;
+ continue;
+ }
+ }
+ aiorp = aiorp->req_next;
+ }
+ /*
+ * since the queued requests have been canceled, there can
+	 * only be one in-progress request that should be canceled.
+ */
+ if ((aiorp = aiowp->work_req) != NULL) {
+ if (fd < 0 || aiorp->req_args.fd == fd) {
+ (void) _aio_cancel_req(aiowp, aiorp, canceled, done);
+ aiowp->work_req = NULL;
+ }
+ }
+ (void) mutex_unlock(&aiowp->work_qlock1);
+}
+
+/*
+ * cancel a request. return 1 if the caller's locks were temporarily
+ * dropped, otherwise return 0.
+ */
+int
+_aio_cancel_req(aio_worker_t *aiowp, aio_req_t *aiorp, int *canceled, int *done)
+{
+ int ostate;
+ int rwflg = 1;
+ int siqueued;
+ int canned;
+
+ ASSERT(MUTEX_HELD(&__aio_mutex));
+ ASSERT(MUTEX_HELD(&aiowp->work_qlock1));
+ ostate = aiorp->req_state;
+ if (ostate == AIO_REQ_CANCELED) {
+ return (0);
+ }
+ if (ostate == AIO_REQ_DONE || ostate == AIO_REQ_DONEQ) {
+ (*done)++;
+ return (0);
+ }
+ if (ostate == AIO_REQ_FREE)
+ return (0);
+ if (aiorp->req_op == AIOFSYNC) {
+ canned = aiorp->lio_head->lio_canned;
+ aiorp->lio_head->lio_canned = 1;
+ rwflg = 0;
+ if (canned)
+ return (0);
+ }
+ aiorp->req_state = AIO_REQ_CANCELED;
+ _aio_req_del(aiowp, aiorp, ostate);
+ if (ostate == AIO_REQ_INPROGRESS)
+ (void) thr_kill(aiowp->work_tid, SIGAIOCANCEL);
+ (void) mutex_unlock(&aiowp->work_qlock1);
+ (void) _aio_hash_del(aiorp->req_resultp);
+ (void) mutex_unlock(&__aio_mutex);
+ siqueued = _aiodone(aiorp, aiorp->lio_head, rwflg, -1, ECANCELED);
+ (void) mutex_lock(&__aio_mutex);
+ (void) mutex_lock(&aiowp->work_qlock1);
+ _lio_remove(aiorp->lio_head);
+ if (!siqueued)
+ _aio_req_free(aiorp);
+ (*canceled)++;
+ return (1);
+}
+
+/*
+ * This is the worker's main routine.
+ * The task of this function is to execute all queued requests;
+ * once the last pending request is executed this function will block
+ * in _aio_idle(). A new incoming request must wakeup this thread to
+ * restart the work.
+ * Every worker has its own work queue. The queue lock is required
+ * to synchronize the addition of new requests for this worker or
+ * cancellation of pending/running requests.
+ *
+ * Cancellation scenarios:
+ * The cancellation of a request is being done asynchronously using
+ * _aio_cancel_req() from another thread context.
+ * A queued request can be cancelled in different ways:
+ * a) request is queued but not "in progress" or "done" (AIO_REQ_QUEUED):
+ * - lock the queue -> remove the request -> unlock the queue
+ * - this function/thread does not detect this cancellation process
+ * b) request is in progress (AIO_REQ_INPROGRESS) :
+ * - this function first allows the cancellation of the running
+ * request with the flag "work_cancel_flg=1"
+ * see _aio_req_get() -> _aio_cancel_on()
+ * During this phase, it is allowed to interrupt the worker
+ * thread running the request (this thread) using the SIGAIOCANCEL
+ * signal.
+ * Once this thread returns from the kernel (because the request
+ * is just done), then it must disable a possible cancellation
+ * and proceed to finish the request. To disable the cancellation
+ * this thread must use _aio_cancel_off() to set "work_cancel_flg=0".
+ * c) request is already done (AIO_REQ_DONE || AIO_REQ_DONEQ):
+ * same procedure as in a)
+ *
+ * To b)
+ * This thread uses sigsetjmp() to define the position in the code where
+ * it wishes to continue working in the case that a SIGAIOCANCEL signal
+ * is detected.
+ * Normally this thread should get the cancellation signal during the
+ * kernel phase (reading or writing). In that case the signal handler
+ * aiosigcancelhndlr() is activated using the worker thread context,
+ * which again will use the siglongjmp() function to break the standard
+ * code flow and jump to the "sigsetjmp" position, provided that
+ * "work_cancel_flg" is set to "1".
+ * Because the "work_cancel_flg" is only manipulated by this worker
+ * thread and it can only run on one CPU at a given time, it is not
+ * necessary to protect that flag with the queue lock.
+ * Returning from the kernel (read or write system call) we must
+ * first disable the use of the SIGAIOCANCEL signal and accordingly
+ * the use of the siglongjmp() function to prevent a possible deadlock:
+ * - It can happen that this worker thread returns from the kernel and
+ * blocks in "work_qlock1",
+ * - then a second thread cancels the apparently "in progress" request
+ * and sends the SIGAIOCANCEL signal to the worker thread,
+ * - the worker thread gets assigned the "work_qlock1" and returns
+ * from the kernel,
+ * - the kernel detects the pending signal and activates the signal
+ * handler instead,
+ * - if the "work_cancel_flg" is still set then the signal handler
+ * should use siglongjmp() to cancel the "in progress" request and
+ * it would try to acquire the same work_qlock1 in _aio_req_get()
+ * for a second time => deadlock.
+ * To avoid that situation we disable the cancellation of the request
+ * in progress BEFORE we try to acquire the work_qlock1.
+ * In that case the signal handler will not call siglongjmp() and the
+ * worker thread will continue running the standard code flow.
+ * Then this thread must check for the AIO_REQ_CANCELED state to emulate
+ * the siglongjmp() that might otherwise have been required, freeing the
+ * work_qlock1 and avoiding a deadlock.
+ */
+void *
+_aio_do_request(void *arglist)
+{
+ aio_worker_t *aiowp = (aio_worker_t *)arglist;
+ struct aio_args *arg;
+ aio_req_t *aiorp; /* current AIO request */
+ int ostate;
+ ssize_t retval;
+ int rwflg;
+
+ aiowp->work_tid = thr_self();
+ if (thr_setspecific(_aio_key, aiowp) != 0)
+ _aiopanic("_aio_do_request, thr_setspecific()\n");
+
+cancelit:
+ if (sigsetjmp(aiowp->work_jmp_buf, 0)) {
+ _sigprocmask(SIG_SETMASK, &_worker_set, NULL);
+ goto cancelit;
+ }
+
+ for (;;) {
+ int err = 0;
+
+ /*
+ * Put completed requests on aio_done_list. This has
+ * to be done as part of the main loop to ensure that
+ * we don't artificially starve any aiowait'ers.
+ */
+ if (aiowp->work_done1)
+ _aio_work_done(aiowp);
+
+ while ((aiorp = _aio_req_get(aiowp)) == NULL) {
+ _aio_idle(aiowp);
+ }
+#ifdef DEBUG
+ _donecnt[aiowp->work_tid]++;
+#endif
+ arg = &aiorp->req_args;
+
+ err = 0;
+ rwflg = 1;
+ switch (aiorp->req_op) {
+ case AIOREAD:
+ retval = pread(arg->fd, arg->buf,
+ arg->bufsz, arg->offset);
+ if (retval == -1) {
+ if (errno == ESPIPE) {
+ retval = read(arg->fd,
+ arg->buf, arg->bufsz);
+ if (retval == -1)
+ err = errno;
+ } else {
+ err = errno;
+ }
+ }
+ break;
+ case AIOWRITE:
+ retval = pwrite(arg->fd, arg->buf,
+ arg->bufsz, arg->offset);
+ if (retval == -1) {
+ if (errno == ESPIPE) {
+ retval = write(arg->fd,
+ arg->buf, arg->bufsz);
+ if (retval == -1)
+ err = errno;
+ } else {
+ err = errno;
+ }
+ }
+ break;
+#if defined(_LARGEFILE64_SOURCE) && !defined(_LP64)
+ case AIOAREAD64:
+ retval = pread64(arg->fd, arg->buf,
+ arg->bufsz, arg->offset);
+ if (retval == -1) {
+ if (errno == ESPIPE) {
+ retval = read(arg->fd,
+ arg->buf, arg->bufsz);
+ if (retval == -1)
+ err = errno;
+ } else {
+ err = errno;
+ }
+ }
+ break;
+ case AIOAWRITE64:
+ retval = pwrite64(arg->fd, arg->buf,
+ arg->bufsz, arg->offset);
+ if (retval == -1) {
+ if (errno == ESPIPE) {
+ retval = write(arg->fd,
+ arg->buf, arg->bufsz);
+ if (retval == -1)
+ err = errno;
+ } else {
+ err = errno;
+ }
+ }
+ break;
+#endif /* (_LARGEFILE64_SOURCE) && !defined(_LP64) */
+ case AIOFSYNC:
+ if (_aio_fsync_del(aiorp, aiorp->lio_head))
+ continue;
+ (void) mutex_lock(&aiowp->work_qlock1);
+ ostate = aiorp->req_state;
+ (void) mutex_unlock(&aiowp->work_qlock1);
+ if (ostate == AIO_REQ_CANCELED) {
+ (void) mutex_lock(&aiorp->req_lock);
+ aiorp->req_canned = 1;
+ (void) cond_broadcast(
+ &aiorp->req_cancv);
+ (void) mutex_unlock(&aiorp->req_lock);
+ continue;
+ }
+ rwflg = 0;
+ /*
+ * all writes for this fsync request are
+ * now acknowledged. now, make these writes
+ * visible.
+ */
+ if (arg->offset == O_SYNC)
+ retval = __fdsync(arg->fd, FSYNC);
+ else
+ retval = __fdsync(arg->fd, FDSYNC);
+ if (retval == -1)
+ err = errno;
+ break;
+ default:
+ rwflg = 0;
+ _aiopanic("_aio_do_request, bad op\n");
+ }
+
+ /*
+ * Disable the cancellation of the "in progress"
+ * request before trying to acquire the lock of the queue.
+ *
+ * It is not necessary to protect "work_cancel_flg" with
+ * work_qlock1, because this thread can only run on one
+ * CPU at a time.
+ */
+
+ _aio_cancel_off(aiowp);
+ (void) mutex_lock(&aiowp->work_qlock1);
+
+ /*
+ * if we return here either
+ * - we got the lock and can close the transaction
+ * as usual or
+ * - the current transaction was cancelled, but siglongjmp
+ * was not executed
+ */
+
+ if (aiorp->req_state == AIO_REQ_CANCELED) {
+ (void) mutex_unlock(&aiowp->work_qlock1);
+ continue;
+ }
+
+ aiorp->req_state = AIO_REQ_DONE;
+ _aio_req_done_cnt++;
+ (void) mutex_unlock(&aiowp->work_qlock1);
+ (void) _aiodone(aiorp, aiorp->lio_head, rwflg, retval, err);
+ }
+ /* NOTREACHED */
+ return (NULL);
+}
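+
The protocol in the block comment above _aio_do_request() reduces to a reusable pattern: a signal handler that siglongjmp()s back to a restart point, but only while a cancel-enable flag is raised. A minimal sketch, using SIGUSR1 in place of SIGAIOCANCEL and a plain flag in place of work_cancel_flg (both substitutions are illustrative):

	#include <setjmp.h>
	#include <signal.h>
	#include <stdio.h>
	#include <unistd.h>

	static sigjmp_buf jmpbuf;
	static volatile sig_atomic_t cancel_ok;	/* stands in for work_cancel_flg */

	static void
	cancel_handler(int sig)
	{
		if (cancel_ok)		/* jump only while cancellation is enabled */
			siglongjmp(jmpbuf, 1);
		/* otherwise return: the request is past its cancel window */
	}

	int
	main(void)
	{
		struct sigaction act;

		act.sa_handler = cancel_handler;
		act.sa_flags = 0;
		(void) sigemptyset(&act.sa_mask);
		(void) sigaction(SIGUSR1, &act, NULL);

		if (sigsetjmp(jmpbuf, 1)) {
			(void) printf("canceled; resumed at restart point\n");
			return (0);
		}
		cancel_ok = 1;			/* like _aio_cancel_on() */
		(void) kill(getpid(), SIGUSR1);	/* canceller interrupts the "I/O" */
		cancel_ok = 0;			/* like _aio_cancel_off(); not reached here */
		return (1);
	}
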
+
+/*
+ * posix supports signal notification for completed aio requests.
+ * when _aio_do_request() notices that an aio request should send
+ * a signal, the aio request is moved to the signal notification
+ * queue. this routine drains this queue, and guarantees that the
+ * signal notification is sent.
+ */
+void *
+_aio_send_sigev(void *arg)
+{
+ aio_req_t *rp;
+ aio_worker_t *aiowp = (aio_worker_t *)arg;
+
+ aiowp->work_tid = thr_self();
+ if (thr_setspecific(_aio_key, aiowp) != 0)
+ _aiopanic("_aio_send_sigev, thr_setspecific()\n");
+
+ for (;;) {
+ while ((rp = _aio_req_get(aiowp)) == NULL) {
+ _aio_idle(aiowp);
+ }
+ if (rp->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
+ while (__sigqueue(__pid, rp->aio_sigevent.sigev_signo,
+ rp->aio_sigevent.sigev_value.sival_ptr,
+ SI_ASYNCIO) == -1)
+ thr_yield();
+ }
+ if (rp->lio_signo) {
+ while (__sigqueue(__pid, rp->lio_signo,
+ rp->lio_sigval.sival_ptr, SI_ASYNCIO) == -1)
+ thr_yield();
+ }
+ _aio_lock();
+ _lio_remove(rp->lio_head);
+ _aio_req_free(rp);
+ _aio_unlock();
+ }
+ /* NOTREACHED */
+ return (NULL);
+}
+
+/*
+ * do the completion semantic for a request that was either canceled
+ * by _aio_cancel_req(), or was completed by _aio_do_request(). return
+ * the value 1 when a sigevent was queued, otherwise return 0.
+ */
+
+static int
+_aiodone(aio_req_t *rp, aio_lio_t *head, int rwflg, ssize_t retval, int err)
+{
+ volatile aio_result_t *resultp;
+#if defined(_LARGEFILE64_SOURCE) && !defined(_LP64)
+ aiocb64_t *aiop64;
+#endif
+ int sigev;
+
+ _aio_lock();
+
+ if (POSIX_AIO(rp)) {
+ void *user;
+ int port;
+ int error;
+
+ if (rp->aio_sigevent.sigev_notify == SIGEV_PORT) {
+ resultp = rp->req_resultp;
+ resultp->aio_return = retval;
+ resultp->aio_errno = err;
+
+ if (err == ECANCELED || rwflg)
+ _aio_outstand_cnt--;
+
+#if defined(_LARGEFILE64_SOURCE) && !defined(_LP64)
+ if (rp->req_op == AIOAREAD64 ||
+ rp->req_op == AIOAWRITE64) {
+ aiop64 = (void *)rp->req_iocb;
+ aiop64->aio_state = USERAIO_DONE;
+ } else
+#endif
+ rp->req_iocb->aio_state = USERAIO_DONE;
+
+ port = rp->aio_sigevent.sigev_signo;
+ user = rp->aio_sigevent.sigev_value.sival_ptr;
+ error = _port_dispatch(port, 0, PORT_SOURCE_AIO, 0,
+ (uintptr_t)rp->req_iocb, user);
+ if (error == 0) {
+ (void) _aio_hash_del(rp->req_resultp);
+ _aio_req_free(rp);
+ _aio_unlock();
+ return (1);
+ }
+ /*
+ * Can not submit the I/O completion to the port,
+ * set status of transaction to NONE
+ */
+ rp->aio_sigevent.sigev_notify = SIGEV_NONE;
+ if (err == ECANCELED || rwflg)
+ _aio_outstand_cnt++;
+ }
+
+ sigev = (rp->aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
+ (head && head->lio_signo));
+ if (sigev)
+ (void) _aio_hash_del(rp->req_resultp);
+
+ resultp = rp->req_resultp;
+ /*
+		 * resultp is declared "volatile" (above) to keep the
+		 * compiler from reordering these stores; otherwise
+		 * aio_error() could observe a final aio_errno value
+		 * while aio_return had not yet been set.
+ */
+ resultp->aio_return = retval;
+ resultp->aio_errno = err;
+
+ if (err == ECANCELED) {
+ _aio_outstand_cnt--;
+ } else {
+ if (rwflg) {
+ if (!sigev)
+ _aio_enq_doneq(rp);
+ _aio_outstand_cnt--;
+ }
+
+ }
+
+ /*
+ * __aio_waitn() sets AIO_IO_WAITING to notify _aiodone() that
+ * it is waiting for completed I/Os. The number of required
+ * completed I/Os is stored into "_aio_waitncnt".
+ * aio_waitn() is woken up when
+ * - there are no further outstanding I/Os
+ * (_aio_outstand_cnt == 0) or
+ * - the expected number of I/Os has completed.
+ * Only one __aio_waitn() function waits for completed I/Os at
+ * a time.
+ *
+ * __aio_suspend() increments "_aio_suscv_cnt" to notify
+ * _aiodone() that at least one __aio_suspend() call is
+ * waiting for completed I/Os.
+ * There could be more than one __aio_suspend() function
+ * waiting for completed I/Os. Because every function should
+ * be waiting for different I/Os, _aiodone() has to wake up all
+ * __aio_suspend() functions each time.
+ * Every __aio_suspend() function will compare the recently
+ * completed I/O with its own list.
+ */
+ if (_aio_flags & AIO_IO_WAITING) {
+ if (_aio_waitncnt > 0)
+ _aio_waitncnt--;
+ if (_aio_outstand_cnt == 0 || _aio_waitncnt == 0 ||
+ _aio_suscv_cnt > 0)
+ (void) cond_broadcast(&_aio_iowait_cv);
+ } else {
+ /* Wake up waiting aio_suspend calls */
+ if (_aio_suscv_cnt > 0)
+ (void) cond_broadcast(&_aio_iowait_cv);
+ }
+
+ _aio_unlock();
+
+ /*
+ * __aio_waitn() sets AIO_WAIT_INPROGRESS and
+ * __aio_suspend() increments "_aio_kernel_suspend"
+ * when they are waiting in the kernel for completed I/Os.
+ *
+ * _kaio(AIONOTIFY) awakes the corresponding function
+ * in the kernel; then the corresponding __aio_waitn() or
+ * __aio_suspend() function could reap the recently
+ * completed I/Os (_aiodone()).
+ */
+ if (err != ECANCELED) {
+ if (_aio_flags & AIO_WAIT_INPROGRESS ||
+ _aio_kernel_suspend > 0) {
+ (void) _kaio(AIONOTIFY);
+ }
+ }
+
+ rp->lio_signo = 0;
+ rp->lio_sigval.sival_int = 0;
+ if (head) {
+ /*
+ * If all the lio requests have completed,
+ * signal the waiting process
+ */
+ (void) mutex_lock(&head->lio_mutex);
+ if (--head->lio_refcnt == 0) {
+ if (head->lio_mode == LIO_WAIT)
+ (void) cond_signal(&head->lio_cond_cv);
+ else {
+ rp->lio_signo = head->lio_signo;
+ rp->lio_sigval = head->lio_sigval;
+ }
+ }
+ (void) mutex_unlock(&head->lio_mutex);
+ }
+ if (sigev) {
+ _aio_req_add(rp, &__workers_si, AIOSIGEV);
+ return (1);
+ }
+ } else {
+ /* Solaris I/O */
+ if (err == ECANCELED)
+ _aio_outstand_cnt--;
+
+ _aio_unlock();
+
+ resultp = rp->req_resultp;
+ resultp->aio_return = retval;
+ resultp->aio_errno = err;
+ }
+ return (0);
+}
+
+/*
+ * delete fsync requests from list head until there is
+ * only one left. return 0 when there is only one, otherwise
+ * return a non-zero value.
+ */
+static int
+_aio_fsync_del(aio_req_t *rp, aio_lio_t *head)
+{
+ int refcnt;
+
+ (void) mutex_lock(&head->lio_mutex);
+ if (head->lio_refcnt > 1 || head->lio_mode == LIO_DESTROY ||
+ head->lio_canned) {
+ refcnt = --head->lio_refcnt;
+ if (refcnt || head->lio_canned) {
+ head->lio_nent--;
+ (void) mutex_unlock(&head->lio_mutex);
+ (void) mutex_lock(&__aio_mutex);
+ _aio_req_free(rp);
+ (void) mutex_unlock(&__aio_mutex);
+ if (head->lio_canned) {
+ ASSERT(refcnt >= 0);
+ return (0);
+ }
+ return (1);
+ }
+ ASSERT(head->lio_mode == LIO_DESTROY);
+ ASSERT(head->lio_nent == 1 && head->lio_refcnt == 0);
+ (void) mutex_unlock(&head->lio_mutex);
+ _aio_remove(rp);
+ return (0);
+ }
+ ASSERT(head->lio_refcnt == head->lio_nent);
+ (void) mutex_unlock(&head->lio_mutex);
+ return (0);
+}
+
+/*
+ * worker is set idle when its work queue is empty.
+ * The worker checks again that it has no more work and then
+ * goes to sleep waiting for more work.
+ */
+void
+_aio_idle(aio_worker_t *aiowp)
+{
+ (void) mutex_lock(&aiowp->work_lock);
+ if (aiowp->work_cnt1 == 0) {
+#ifdef DEBUG
+ _idlecnt[aiowp->work_tid]++;
+#endif
+ aiowp->work_idleflg = 1;
+ (void) cond_wait(&aiowp->work_idle_cv, &aiowp->work_lock);
+ /*
+ * idle flag is cleared before worker is awakened
+		 * by _aio_req_add().
+ */
+ }
+ (void) mutex_unlock(&aiowp->work_lock);
+}
+
+/*
+ * A worker's completed AIO requests are placed onto a global
+ * done queue. The application is only sent a SIGIO signal if
+ * the process has a handler enabled and it is not waiting via
+ * aiowait().
+ */
+static void
+_aio_work_done(struct aio_worker *aiowp)
+{
+ struct aio_req *done_req = NULL;
+
+ (void) mutex_lock(&aiowp->work_qlock1);
+ done_req = aiowp->work_prev1;
+ done_req->req_next = NULL;
+ aiowp->work_done1 = 0;
+ aiowp->work_tail1 = aiowp->work_next1;
+ if (aiowp->work_tail1 == NULL)
+ aiowp->work_head1 = NULL;
+ aiowp->work_prev1 = NULL;
+ (void) mutex_unlock(&aiowp->work_qlock1);
+ (void) mutex_lock(&__aio_mutex);
+ _aio_donecnt++;
+ _aio_outstand_cnt--;
+ _aio_req_done_cnt--;
+ ASSERT(_aio_donecnt > 0 && _aio_outstand_cnt >= 0);
+ ASSERT(done_req != NULL);
+
+ if (_aio_done_tail == NULL) {
+ _aio_done_head = _aio_done_tail = done_req;
+ } else {
+ _aio_done_head->req_next = done_req;
+ _aio_done_head = done_req;
+ }
+
+ if (_aiowait_flag) {
+ (void) mutex_unlock(&__aio_mutex);
+ (void) _kaio(AIONOTIFY);
+ } else {
+ (void) mutex_unlock(&__aio_mutex);
+ if (_sigio_enabled) {
+ (void) kill(__pid, SIGIO);
+ }
+ }
+}
+
+/*
+ * the done queue consists of AIO requests that are in either the
+ * AIO_REQ_DONE or AIO_REQ_CANCELED state. requests that were cancelled
+ * are discarded. if the done queue is empty then NULL is returned.
+ * otherwise the address of a done aio_result_t is returned.
+ */
+struct aio_result_t *
+_aio_req_done(void)
+{
+ struct aio_req *next;
+ aio_result_t *resultp;
+
+ ASSERT(MUTEX_HELD(&__aio_mutex));
+
+ if ((next = _aio_done_tail) != NULL) {
+ _aio_done_tail = next->req_next;
+ ASSERT(_aio_donecnt > 0);
+ _aio_donecnt--;
+ (void) _aio_hash_del(next->req_resultp);
+ resultp = next->req_resultp;
+ ASSERT(next->req_state == AIO_REQ_DONE);
+ _aio_req_free(next);
+ return (resultp);
+ }
+ /* is queue empty? */
+ if (next == NULL && _aio_outstand_cnt == 0) {
+ return ((aio_result_t *)-1);
+ }
+ return (NULL);
+}
+
+/*
+ * add an AIO request onto the next work queue. a circular list of
+ * workers is used to choose the next worker. each worker has two
+ * work queues. if the lock for the first queue is busy then the
+ * request is placed on the second queue. the request is always
+ * placed on one of the two queues depending on which one is locked.
+ */
+void
+_aio_req_add(aio_req_t *aiorp, aio_worker_t **nextworker, int mode)
+{
+ struct aio_worker *aiowp;
+ struct aio_worker *first;
+ int clogged = 0;
+ int found = 0;
+ int load_bal_flg;
+ int idleflg;
+ int qactive;
+
+ aiorp->req_next = NULL;
+ ASSERT(*nextworker != NULL);
+ aiowp = *nextworker;
+ /*
+ * try to acquire the next worker's work queue. if it is locked,
+ * then search the list of workers until a queue is found unlocked,
+ * or until the list is completely traversed at which point another
+ * worker will be created.
+ */
+ first = aiowp;
+ _aio_lock();
+ __sigio_maskedcnt++; /* disable SIGIO */
+ if (mode == AIOREAD || mode == AIOWRITE) {
+ _aio_outstand_cnt++;
+ load_bal_flg = 1;
+ }
+ _aio_unlock();
+ switch (mode) {
+ case AIOREAD:
+ /* try to find an idle worker. */
+ do {
+ if (mutex_trylock(&aiowp->work_qlock1) == 0) {
+ if (aiowp->work_idleflg) {
+ found = 1;
+ break;
+ }
+ (void) mutex_unlock(
+ &aiowp->work_qlock1);
+ }
+ } while ((aiowp = aiowp->work_forw) != first);
+ if (found)
+ break;
+ /*FALLTHROUGH*/
+ case AIOWRITE:
+ while (mutex_trylock(&aiowp->work_qlock1)) {
+#ifdef DEBUG
+ _qlocked++;
+#endif
+ if (((aiowp = aiowp->work_forw)) == first) {
+ clogged = 1;
+ break;
+ }
+ }
+ /*
+			 * create more workers when the workers appear
+			 * overloaded: either all the workers are busy
+			 * draining their queues and no worker's queue lock
+			 * could be acquired, or the selected worker has
+			 * exceeded its minimum work load. in either case,
+			 * more workers are created only while the max
+			 * number of workers has not been reached.
+ */
+ if (clogged) {
+#ifdef DEBUG
+ _new_workers++;
+ _clogged++;
+#endif
+ if (_aio_worker_cnt < _max_workers) {
+ if (_aio_create_worker(aiorp, mode))
+ _aiopanic(
+ "_aio_req_add: clogged");
+ _aio_lock();
+ __sigio_maskedcnt--;
+ _aio_unlock();
+ return;
+ }
+
+ /*
+ * No worker available and we have created
+ * _max_workers, keep going through the
+ * list until we get a lock
+ */
+ while (mutex_trylock(&aiowp->work_qlock1)) {
+ /*
+ * give someone else a chance
+ */
+ thr_yield();
+ aiowp = aiowp->work_forw;
+ }
+
+ }
+ ASSERT(MUTEX_HELD(&aiowp->work_qlock1));
+ aiowp->work_minload1++;
+ if (_aio_worker_cnt < _max_workers &&
+ aiowp->work_minload1 > _minworkload) {
+ aiowp->work_minload1 = 0;
+ (void) mutex_unlock(&aiowp->work_qlock1);
+#ifdef DEBUG
+ _qfullcnt[aiowp->work_tid]++;
+ _new_workers++;
+ _newworker[aiowp->work_tid]++;
+ _avedone = _aio_submitcnt2/_new_workers;
+#endif
+ (void) mutex_lock(&__aio_mutex);
+ *nextworker = aiowp->work_forw;
+ (void) mutex_unlock(&__aio_mutex);
+ if (_aio_create_worker(aiorp, mode))
+ _aiopanic("aio_req_add: add worker");
+ _aio_lock();
+ __sigio_maskedcnt--; /* enable signals again */
+ _aio_unlock(); /* send evt. SIGIO signal */
+ return;
+ }
+ break;
+ case AIOFSYNC:
+ aiorp->req_op = mode;
+ /*FALLTHROUGH*/
+ case AIOSIGEV:
+ load_bal_flg = 0;
+ (void) mutex_lock(&aiowp->work_qlock1);
+ break;
+ }
+ /*
+ * Put request onto worker's work queue.
+ */
+ if (aiowp->work_tail1 == NULL) {
+ ASSERT(aiowp->work_cnt1 == 0);
+ aiowp->work_tail1 = aiorp;
+ aiowp->work_next1 = aiorp;
+ } else {
+ aiowp->work_head1->req_next = aiorp;
+ if (aiowp->work_next1 == NULL)
+ aiowp->work_next1 = aiorp;
+ }
+ aiorp->req_state = AIO_REQ_QUEUED;
+ aiorp->req_worker = aiowp;
+ aiowp->work_head1 = aiorp;
+ qactive = aiowp->work_cnt1++;
+ (void) mutex_unlock(&aiowp->work_qlock1);
+ if (load_bal_flg) {
+ _aio_lock();
+ *nextworker = aiowp->work_forw;
+ _aio_unlock();
+ }
+ /*
+ * Awaken worker if it is not currently active.
+ */
+ if (!qactive) {
+ (void) mutex_lock(&aiowp->work_lock);
+ idleflg = aiowp->work_idleflg;
+ aiowp->work_idleflg = 0;
+ (void) mutex_unlock(&aiowp->work_lock);
+ if (idleflg)
+ (void) cond_signal(&aiowp->work_idle_cv);
+ }
+ _aio_lock();
+ __sigio_maskedcnt--; /* enable signals again */
+ _aio_unlock(); /* send SIGIO signal if pending */
+}
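+
The submission path above never blocks on a busy worker: it walks the circular worker list with mutex_trylock() and only falls back to creating a worker, or yielding and retrying, when every queue is locked. A reduced sketch of that selection loop, written against POSIX threads rather than the libthread interfaces used here; the worker structure is illustrative:

	#include <pthread.h>
	#include <stddef.h>

	typedef struct worker {
		pthread_mutex_t	w_qlock;	/* protects this worker's queue */
		struct worker	*w_forw;	/* circular list, like work_forw */
	} worker_t;

	/*
	 * Starting at *nextp, claim the first worker whose queue lock is
	 * free. Returns with that worker's lock held and *nextp advanced
	 * round-robin, or NULL if every queue in the ring is busy (the
	 * "clogged" case, where _aio_req_add() creates a new worker or
	 * yields and retries).
	 */
	static worker_t *
	pick_worker(worker_t **nextp)
	{
		worker_t *first = *nextp;
		worker_t *w = first;

		do {
			if (pthread_mutex_trylock(&w->w_qlock) == 0) {
				*nextp = w->w_forw;
				return (w);
			}
		} while ((w = w->w_forw) != first);
		return (NULL);
	}
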
+
+/*
+ * get an AIO request for a specified worker. each worker has
+ * two work queues. find the first one that is not empty and
+ * remove this request from the queue and return it back to the
+ * caller. if both queues are empty, then return a NULL.
+ */
+aio_req_t *
+_aio_req_get(aio_worker_t *aiowp)
+{
+ aio_req_t *next;
+ int mode;
+
+ (void) mutex_lock(&aiowp->work_qlock1);
+ if ((next = aiowp->work_next1) != NULL) {
+ /*
+ * remove a POSIX request from the queue; the
+		 * request queue is a singly linked list
+ * with a previous pointer. The request is removed
+ * by updating the previous pointer.
+ *
+ * non-posix requests are left on the queue to
+ * eventually be placed on the done queue.
+ */
+
+ if (next->req_type == AIO_POSIX_REQ) {
+ if (aiowp->work_prev1 == NULL) {
+ aiowp->work_tail1 = next->req_next;
+ if (aiowp->work_tail1 == NULL)
+ aiowp->work_head1 = NULL;
+ } else {
+ aiowp->work_prev1->req_next = next->req_next;
+ if (aiowp->work_head1 == next)
+ aiowp->work_head1 = next->req_next;
+ }
+
+ } else {
+ aiowp->work_prev1 = next;
+ ASSERT(aiowp->work_done1 >= 0);
+ aiowp->work_done1++;
+ }
+ ASSERT(next != next->req_next);
+ aiowp->work_next1 = next->req_next;
+ ASSERT(aiowp->work_cnt1 >= 1);
+ aiowp->work_cnt1--;
+ mode = next->req_op;
+ if (mode == AIOWRITE || mode == AIOREAD || mode == AIOAREAD64 ||
+ mode == AIOAWRITE64)
+ aiowp->work_minload1--;
+#ifdef DEBUG
+ _firstqcnt[aiowp->work_tid]++;
+#endif
+ next->req_state = AIO_REQ_INPROGRESS;
+ _aio_cancel_on(aiowp);
+ }
+ aiowp->work_req = next;
+ ASSERT(next != NULL || (next == NULL && aiowp->work_cnt1 == 0));
+ (void) mutex_unlock(&aiowp->work_qlock1);
+ return (next);
+}
+
+static void
+_aio_req_del(aio_worker_t *aiowp, aio_req_t *rp, int ostate)
+{
+ aio_req_t **last, *lastrp, *next;
+
+ ASSERT(aiowp != NULL);
+ ASSERT(MUTEX_HELD(&aiowp->work_qlock1));
+ if (POSIX_AIO(rp)) {
+ if (ostate != AIO_REQ_QUEUED)
+ return;
+ }
+ last = &aiowp->work_tail1;
+ lastrp = aiowp->work_tail1;
+ ASSERT(ostate == AIO_REQ_QUEUED || ostate == AIO_REQ_INPROGRESS);
+ while ((next = *last) != NULL) {
+ if (next == rp) {
+ *last = next->req_next;
+ if (aiowp->work_next1 == next)
+ aiowp->work_next1 = next->req_next;
+
+ if ((next->req_next != NULL) ||
+ (aiowp->work_done1 == 0)) {
+ if (aiowp->work_head1 == next)
+ aiowp->work_head1 = next->req_next;
+ if (aiowp->work_prev1 == next)
+ aiowp->work_prev1 = next->req_next;
+ } else {
+ if (aiowp->work_head1 == next)
+ aiowp->work_head1 = lastrp;
+ if (aiowp->work_prev1 == next)
+ aiowp->work_prev1 = lastrp;
+ }
+
+ if (ostate == AIO_REQ_QUEUED) {
+ ASSERT(aiowp->work_cnt1 >= 1);
+ aiowp->work_cnt1--;
+ } else {
+ ASSERT(ostate == AIO_REQ_INPROGRESS &&
+ !POSIX_AIO(rp));
+ aiowp->work_done1--;
+ }
+ return;
+ }
+ last = &next->req_next;
+ lastrp = next;
+ }
+ /* NOTREACHED */
+}
+
+
+static void
+_aio_enq_doneq(aio_req_t *reqp)
+{
+ if (_aio_doneq == NULL) {
+ _aio_doneq = reqp;
+ reqp->req_next = reqp;
+ reqp->req_prev = reqp;
+ } else {
+ reqp->req_next = _aio_doneq;
+ reqp->req_prev = _aio_doneq->req_prev;
+ reqp->req_prev->req_next = reqp;
+ _aio_doneq->req_prev = reqp;
+ }
+ reqp->req_state = AIO_REQ_DONEQ;
+ _aio_doneq_cnt++;
+}
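+
+/*
+ * Example: the done queue is a circular, doubly linked list. With
+ * requests A and B already queued (_aio_doneq points at A, the
+ * oldest), enqueueing C links it in just before the head:
+ *
+ *	before:  A <-> B <-> (back to A)
+ *	after:   A <-> B <-> C <-> (back to A)
+ */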
+
+/*
+ * caller owns the _aio_mutex
+ */
+
+aio_req_t *
+_aio_req_remove(aio_req_t *reqp)
+{
+ aio_req_t *head;
+
+ if (reqp && reqp->req_state != AIO_REQ_DONEQ)
+ return (NULL);
+
+ if (reqp) {
+ /* request in done queue */
+ if (reqp->req_next == reqp) {
+ /* only one request on queue */
+ _aio_doneq = NULL;
+ } else {
+ reqp->req_next->req_prev = reqp->req_prev;
+ reqp->req_prev->req_next = reqp->req_next;
+ if (reqp == _aio_doneq)
+ _aio_doneq = reqp->req_next;
+ }
+ _aio_doneq_cnt--;
+ return (reqp);
+ }
+
+ if (_aio_doneq) {
+ head = _aio_doneq;
+ if (head == head->req_next) {
+ /* only one request on queue */
+ _aio_doneq = NULL;
+ } else {
+ head->req_prev->req_next = head->req_next;
+ head->req_next->req_prev = head->req_prev;
+ _aio_doneq = head->req_next;
+ }
+ _aio_doneq_cnt--;
+ return (head);
+ }
+ return (NULL);
+
+}
+
+/*
+ * An AIO request is identified by an aio_result_t pointer. The AIO
+ * library maps this aio_result_t pointer to its internal representation
+ * via a hash table. This function adds an aio_result_t pointer to
+ * the hash table.
+ */
+static int
+_aio_hash_insert(aio_result_t *resultp, aio_req_t *aiorp)
+{
+ uintptr_t i;
+ aio_req_t *next, **last;
+
+ ASSERT(MUTEX_HELD(&__aio_mutex));
+ i = AIOHASH(resultp);
+ last = (_aio_hash + i);
+ while ((next = *last) != NULL) {
+ if (resultp == next->req_resultp)
+ return (-1);
+ last = &next->req_link;
+ }
+ *last = aiorp;
+ ASSERT(aiorp->req_link == NULL);
+ return (0);
+}
+
+/*
+ * remove an entry from the hash table.
+ */
+struct aio_req *
+_aio_hash_del(aio_result_t *resultp)
+{
+ struct aio_req *next, **prev;
+ uintptr_t i;
+
+ ASSERT(MUTEX_HELD(&__aio_mutex));
+ i = AIOHASH(resultp);
+ prev = (_aio_hash + i);
+ while ((next = *prev) != NULL) {
+ if (resultp == next->req_resultp) {
+ *prev = next->req_link;
+ return (next);
+ }
+ prev = &next->req_link;
+ }
+ ASSERT(next == NULL);
+ return ((struct aio_req *)NULL);
+}
+
+/*
+ * find an entry in the hash table
+ */
+struct aio_req *
+_aio_hash_find(aio_result_t *resultp)
+{
+ struct aio_req *next, **prev;
+ uintptr_t i;
+
+ /*
+ * no user AIO
+ */
+ if (_aio_hash == NULL)
+ return (NULL);
+
+ i = AIOHASH(resultp);
+ prev = (_aio_hash + i);
+ while ((next = *prev) != NULL) {
+ if (resultp == next->req_resultp) {
+ return (next);
+ }
+ prev = &next->req_link;
+ }
+ return (NULL);
+}
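+
+/*
+ * Typical lifetime of a hash entry, mirroring the callers in this
+ * library (sketch):
+ *
+ *	_aio_lock();
+ *	if (_aio_hash_insert(&cb->aio_resultp, aiorp) != 0)
+ *		... duplicate resultp, reject the request ...
+ *	_aio_unlock();
+ *	...
+ *	_aio_lock();
+ *	if ((reqp = _aio_hash_find(&cb->aio_resultp)) != NULL)
+ *		(void) _aio_hash_del(reqp->req_resultp);
+ *	_aio_unlock();
+ */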
+
+/*
+ * Allocate and free aios. They are cached.
+ */
+aio_req_t *
+_aio_req_alloc(void)
+{
+ aio_req_t *aiorp;
+ int err;
+
+ _aio_lock();
+ while (_aio_freelist == NULL) {
+ _aio_unlock();
+ err = 0;
+ (void) mutex_lock(&__aio_cachefillock);
+ if (__aio_cachefilling)
+ (void) cond_wait(&__aio_cachefillcv,
+ &__aio_cachefillock);
+ else
+ err = _fill_aiocache(HASHSZ);
+ (void) mutex_unlock(&__aio_cachefillock);
+ if (err)
+ return ((aio_req_t *)-1);
+ _aio_lock();
+ }
+ aiorp = _aio_freelist;
+ _aio_freelist = _aio_freelist->req_link;
+ aiorp->req_type = 0;
+ aiorp->req_link = NULL;
+ aiorp->req_next = NULL;
+ aiorp->lio_head = NULL;
+ aiorp->aio_sigevent.sigev_notify = SIGEV_NONE;
+ _aio_freelist_cnt--;
+ _aio_unlock();
+ return (aiorp);
+}
+
+/*
+ * fill the aio request cache with empty aio request structures.
+ */
+int
+_fill_aiocache(int n)
+{
+ aio_req_t *next, *aiorp, *first;
+ int cnt;
+ uintptr_t ptr;
+ int i;
+
+ __aio_cachefilling = 1;
+ if ((ptr = (uintptr_t)malloc(sizeof (struct aio_req) * n)) == NULL) {
+ __aio_cachefilling = 0;
+ (void) cond_broadcast(&__aio_cachefillcv);
+ return (-1);
+ }
+ if (ptr & 0x7)
+ _aiopanic("_fill_aiocache");
+ first = (struct aio_req *)ptr;
+ next = first;
+ cnt = n - 1;
+ for (i = 0; i < cnt; i++) {
+ aiorp = next++;
+ aiorp->req_state = AIO_REQ_FREE;
+ aiorp->req_link = next;
+ (void) mutex_init(&aiorp->req_lock, USYNC_THREAD, NULL);
+ (void) cond_init(&aiorp->req_cancv, USYNC_THREAD, NULL);
+ }
+ __aio_cachefilling = 0;
+ (void) cond_broadcast(&__aio_cachefillcv);
+ next->req_state = AIO_REQ_FREE;
+ next->req_link = NULL;
+ (void) mutex_init(&next->req_lock, USYNC_THREAD, NULL);
+ (void) cond_init(&next->req_cancv, USYNC_THREAD, NULL);
+ _aio_lock();
+ _aio_freelist_cnt = n;
+ _aio_freelist = first;
+ _aio_unlock();
+ return (0);
+}
+
+/*
+ * put an aio request back onto the freelist.
+ */
+void
+_aio_req_free(aio_req_t *aiorp)
+{
+ ASSERT(MUTEX_HELD(&__aio_mutex));
+ aiorp->req_state = AIO_REQ_FREE;
+ aiorp->req_link = _aio_freelist;
+ _aio_freelist = aiorp;
+ _aio_freelist_cnt++;
+}
+
+/*
+ * global aio lock that masks SIGIO signals.
+ */
+void
+_aio_lock(void)
+{
+ __sigio_masked = 1;
+ (void) mutex_lock(&__aio_mutex);
+ __sigio_maskedcnt++;
+}
+
+/*
+ * release global aio lock. send SIGIO signal if one
+ * is pending.
+ */
+void
+_aio_unlock(void)
+{
+ if (--__sigio_maskedcnt == 0)
+ __sigio_masked = 0;
+ (void) mutex_unlock(&__aio_mutex);
+ if (__sigio_pending)
+ __aiosendsig();
+}
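+
+/*
+ * Usage pattern, as seen throughout this library:
+ *
+ *	_aio_lock();	(takes __aio_mutex and defers SIGIO delivery)
+ *	... manipulate shared aio state ...
+ *	_aio_unlock();	(drops the lock; re-sends a deferred SIGIO)
+ */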
+
+/*
+ * AIO interface for POSIX
+ */
+int
+_aio_rw(aiocb_t *cb, aio_lio_t *lio_head, aio_worker_t **nextworker,
+ int mode, int flg, struct sigevent *sigp)
+{
+ aio_req_t *aiorp = NULL;
+ aio_args_t *ap = NULL;
+ int kerr;
+ int umode;
+
+ if (cb == NULL) {
+ errno = EINVAL;
+ return (-1);
+ }
+
+ /* initialize kaio */
+ if (!_kaio_ok)
+ _kaio_init();
+
+ cb->aio_state = NOCHECK;
+
+ /*
+ * If _aio_rw() is called because a list I/O
+ * kaio() failed, we don't want to repeat the
+ * system call.
+ */
+
+ if (flg & AIO_KAIO) {
+ /*
+ * Try kernel aio first.
+ * If errno is ENOTSUP/EBADFD,
+ * fall back to the thread implementation.
+ */
+ if ((_kaio_ok > 0) && (KAIO_SUPPORTED(cb->aio_fildes))) {
+ cb->aio_resultp.aio_errno = EINPROGRESS;
+ cb->aio_state = CHECK;
+ kerr = (int)_kaio(mode, cb);
+ if (kerr == 0)
+ return (0);
+ else if ((errno != ENOTSUP) && (errno != EBADFD)) {
+ cb->aio_resultp.aio_errno = errno;
+ cb->aio_resultp.aio_return = -1;
+ cb->aio_state = NOCHECK;
+ return (-1);
+ }
+ if (errno == EBADFD)
+ SET_KAIO_NOT_SUPPORTED(cb->aio_fildes);
+ }
+ }
+
+ cb->aio_resultp.aio_errno = EINPROGRESS;
+ cb->aio_state = USERAIO;
+
+ if (!__uaio_ok) {
+ if (__uaio_init() == -1)
+ return (-1);
+ }
+
+ aiorp = _aio_req_alloc();
+ if (aiorp == (aio_req_t *)-1) {
+ errno = EAGAIN;
+ return (-1);
+ }
+
+ /*
+ * If an LIO request, add the list head to the
+ * aio request
+ */
+ aiorp->lio_head = lio_head;
+ aiorp->req_type = AIO_POSIX_REQ;
+ umode = ((mode == AIOFSYNC) ? mode : mode - AIOAREAD);
+ aiorp->req_op = umode;
+
+ if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
+ aiorp->aio_sigevent.sigev_notify = SIGEV_SIGNAL;
+ aiorp->aio_sigevent.sigev_signo =
+ cb->aio_sigevent.sigev_signo;
+ aiorp->aio_sigevent.sigev_value.sival_ptr =
+ cb->aio_sigevent.sigev_value.sival_ptr;
+ }
+
+ if (sigp) {
+ /* SIGEV_PORT */
+ port_notify_t *pn = sigp->sigev_value.sival_ptr;
+ aiorp->aio_sigevent.sigev_notify = SIGEV_PORT;
+ aiorp->aio_sigevent.sigev_signo = pn->portnfy_port;
+ aiorp->aio_sigevent.sigev_value.sival_ptr = pn->portnfy_user;
+ } else if (cb->aio_sigevent.sigev_notify == SIGEV_PORT) {
+ port_notify_t *pn;
+ pn = cb->aio_sigevent.sigev_value.sival_ptr;
+ aiorp->aio_sigevent.sigev_notify = SIGEV_PORT;
+ aiorp->aio_sigevent.sigev_signo = pn->portnfy_port;
+ aiorp->aio_sigevent.sigev_value.sival_ptr = pn->portnfy_user;
+ }
+
+ aiorp->req_resultp = &cb->aio_resultp;
+ aiorp->req_iocb = cb;
+ ap = &(aiorp->req_args);
+ ap->fd = cb->aio_fildes;
+ ap->buf = (caddr_t)cb->aio_buf;
+ ap->bufsz = cb->aio_nbytes;
+ ap->offset = cb->aio_offset;
+
+ _aio_lock();
+ if ((flg & AIO_NO_DUPS) && _aio_hash_insert(&cb->aio_resultp, aiorp)) {
+ _aio_req_free(aiorp);
+ _aio_unlock();
+ errno = EINVAL;
+ return (-1);
+ } else {
+ _aio_unlock();
+ _aio_req_add(aiorp, nextworker, umode);
+ return (0);
+ }
+}
+
+#if defined(_LARGEFILE64_SOURCE) && !defined(_LP64)
+/*
+ * 64-bit AIO interface for POSIX
+ */
+int
+_aio_rw64(aiocb64_t *cb, aio_lio_t *lio_head, aio_worker_t **nextworker,
+ int mode, int flg, struct sigevent *sigp)
+{
+ aio_req_t *aiorp = NULL;
+ aio_args_t *ap = NULL;
+ int kerr;
+ int umode;
+
+ if (cb == NULL) {
+ errno = EINVAL;
+ return (-1);
+ }
+
+ /* initialize kaio */
+ if (!_kaio_ok)
+ _kaio_init();
+
+ cb->aio_state = NOCHECK;
+
+ /*
+ * If _aio_rw() is called because a list I/O
+ * kaio() failed, we don't want to repeat the
+ * system call.
+ */
+
+ if (flg & AIO_KAIO) {
+ /*
+ * Try kernel aio first.
+ * If errno is ENOTSUP/EBADFD,
+ * fall back to the thread implementation.
+ */
+ if ((_kaio_ok > 0) && (KAIO_SUPPORTED(cb->aio_fildes))) {
+ cb->aio_resultp.aio_errno = EINPROGRESS;
+ cb->aio_state = CHECK;
+ kerr = (int)_kaio(mode, cb);
+ if (kerr == 0)
+ return (0);
+ else if ((errno != ENOTSUP) && (errno != EBADFD)) {
+ cb->aio_resultp.aio_errno = errno;
+ cb->aio_resultp.aio_return = -1;
+ cb->aio_state = NOCHECK;
+ return (-1);
+ }
+ if (errno == EBADFD)
+ SET_KAIO_NOT_SUPPORTED(cb->aio_fildes);
+ }
+ }
+
+ cb->aio_resultp.aio_errno = EINPROGRESS;
+ cb->aio_state = USERAIO;
+
+ if (!__uaio_ok) {
+ if (__uaio_init() == -1)
+ return (-1);
+ }
+
+
+ aiorp = _aio_req_alloc();
+ if (aiorp == (aio_req_t *)-1) {
+ errno = EAGAIN;
+ return (-1);
+ }
+
+ /*
+ * If an LIO request, add the list head to the
+ * aio request
+ */
+ aiorp->lio_head = lio_head;
+ aiorp->req_type = AIO_POSIX_REQ;
+
+ /*
+ * _aio_do_request() needs the original request code to be able
+ * to choose the appropriate 32/64 bit function.
+ */
+ aiorp->req_op = mode;
+
+ if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
+ aiorp->aio_sigevent.sigev_notify = SIGEV_SIGNAL;
+ aiorp->aio_sigevent.sigev_signo =
+ cb->aio_sigevent.sigev_signo;
+ aiorp->aio_sigevent.sigev_value.sival_ptr =
+ cb->aio_sigevent.sigev_value.sival_ptr;
+ }
+
+ if (sigp) {
+ /* SIGEV_PORT */
+ port_notify_t *pn = sigp->sigev_value.sival_ptr;
+ aiorp->aio_sigevent.sigev_notify = SIGEV_PORT;
+ aiorp->aio_sigevent.sigev_signo = pn->portnfy_port;
+ aiorp->aio_sigevent.sigev_value.sival_ptr = pn->portnfy_user;
+ } else if (cb->aio_sigevent.sigev_notify == SIGEV_PORT) {
+ port_notify_t *pn;
+ pn = cb->aio_sigevent.sigev_value.sival_ptr;
+ aiorp->aio_sigevent.sigev_notify = SIGEV_PORT;
+ aiorp->aio_sigevent.sigev_signo = pn->portnfy_port;
+ aiorp->aio_sigevent.sigev_value.sival_ptr = pn->portnfy_user;
+ }
+
+ aiorp->req_resultp = &cb->aio_resultp;
+ aiorp->req_iocb = (aiocb_t *)cb;
+ ap = &(aiorp->req_args);
+ ap->fd = cb->aio_fildes;
+ ap->buf = (caddr_t)cb->aio_buf;
+ ap->bufsz = cb->aio_nbytes;
+ ap->offset = cb->aio_offset;
+
+ _aio_lock();
+ if ((flg & AIO_NO_DUPS) && _aio_hash_insert(&cb->aio_resultp, aiorp)) {
+ _aio_req_free(aiorp);
+ _aio_unlock();
+ errno = EINVAL;
+ return (-1);
+ } else {
+ _aio_unlock();
+
+ /*
+ * _aio_req_add() only needs the difference between READ,
+ * WRITE and other to choose the right worker queue.
+ * AIOAREAD64 is mapped to AIOREAD and
+ * AIOAWRITE64 is mapped to AIOWRITE.
+ * mode is AIOAREAD64, AIOAWRITE64 or AIOFSYNC.
+ */
+ umode = ((mode == AIOFSYNC) ? mode : mode - AIOAREAD64);
+ _aio_req_add(aiorp, nextworker, umode);
+ return (0);
+ }
+}
+#endif /* defined(_LARGEFILE64_SOURCE) && !defined(_LP64) */
diff --git a/usr/src/lib/libaio/common/libaio.h b/usr/src/lib/libaio/common/libaio.h
new file mode 100644
index 0000000000..4c3d7a2af1
--- /dev/null
+++ b/usr/src/lib/libaio/common/libaio.h
@@ -0,0 +1,339 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _LIBAIO_H
+#define _LIBAIO_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <thread.h>
+#include <asynch.h>
+#include <setjmp.h>
+#include <signal.h>
+#include <siginfo.h>
+#include <aio.h>
+#include <limits.h>
+#include <ucontext.h>
+
+#ifndef _REENTRANT
+#define _REENTRANT
+#endif
+
+#ifdef DEBUG
+extern int assfail(char *, char *, int);
+#define ASSERT(EX) ((void)((EX) || assfail(#EX, __FILE__, __LINE__)))
+#else
+#define ASSERT(EX)
+#endif
+
+#define SIGAIOCANCEL SIGPROF /* special aio cancelation signal */
+#define AIO_WAITN_MAXIOCBS 32768 /* max. iocbs per system call */
+
+typedef struct aio_args {
+ int fd;
+ caddr_t buf;
+ size_t bufsz;
+ offset_t offset;
+} aio_args_t;
+
+/*
+ * list head for UFS list I/O
+ */
+typedef struct aio_lio {
+ char lio_mode; /* LIO_WAIT/LIO_NOWAIT */
+ int lio_nent; /* Number of list I/O's */
+ int lio_refcnt; /* outstanding I/O's */
+ cond_t lio_cond_cv; /* list notification for I/O done */
+ mutex_t lio_mutex; /* list mutex */
+ struct aio_lio *lio_next; /* pointer to next on freelist */
+ int lio_signo; /* Signal for LIO_NOWAIT */
+ union sigval lio_sigval; /* Signal parameter */
+ char lio_canned; /* lio was canceled */
+} aio_lio_t;
+
+/*
+ * the size of aio_req should be a power of 2; this improves the
+ * effectiveness of the hashing function.
+ */
+typedef struct aio_req {
+ /*
+ * fields protected by _aio_mutex lock.
+ */
+ struct aio_req *req_link; /* hash chain link */
+ /*
+ * when req is on the doneq, then req_next is protected by
+ * the _aio_mutex lock. when the req is on a work q, then
+ * req_next is protected by a worker's work_qlock1 lock.
+ */
+ struct aio_req *req_next; /* request/done queue link */
+ struct aio_req *req_prev; /* double linked list */
+ /*
+ * condition variable that waits for a request to be
+ * canceled.
+ */
+ mutex_t req_lock; /* protects the following 2 fields */
+ cond_t req_cancv; /* cancel req condition variable */
+ char req_canned; /* set when canceled */
+ /*
+ * fields protected by a worker's work_qlock1 lock.
+ */
+ int req_state; /* AIO_REQ_QUEUED, ... */
+ /*
+ * fields require no locking.
+ */
+ int req_type; /* AIO_POSIX_REQ ? */
+ struct aio_worker *req_worker; /* associate req. with worker */
+ aio_result_t *req_resultp; /* address of result buffer */
+ int req_op; /* read or write */
+ aio_args_t req_args; /* arglist */
+ aio_lio_t *lio_head; /* list head for LIO */
+ int req_retval; /* resultp's retval */
+ int req_errno; /* resultp's errno */
+ char req_canwait; /* waiting for req to be canceled */
+ struct sigevent aio_sigevent;
+ int lio_signo; /* Signal for LIO_NOWAIT */
+ union sigval lio_sigval; /* Signal parameter */
+ aiocb_t *req_iocb; /* ptr to aiocb */
+} aio_req_t;
+
+/* special request type for handling sigevent notification */
+#define AIOSIGEV (AIOFSYNC + 1)
+
+/* special lio type that destroys itself when lio refcnt becomes zero */
+#define LIO_FSYNC (LIO_WAIT + 1)
+#define LIO_DESTROY (LIO_FSYNC + 1)
+
+/* lio flags */
+#define LIO_FSYNC_CANCELED 0x1
+
+/* values for req_state */
+
+#define AIO_REQ_QUEUED 1
+#define AIO_REQ_INPROGRESS 2
+#define AIO_REQ_CANCELED 3
+#define AIO_REQ_DONE 4
+#define AIO_REQ_FREE 5
+#define AIO_LIO_DONE 6
+#define AIO_REQ_DONEQ 7
+
+/* use KAIO in _aio_rw() */
+#define AIO_NO_KAIO 0x0
+#define AIO_KAIO 0x1
+#define AIO_NO_DUPS 0x2
+
+#define AIO_POSIX_REQ 0x1
+
+#define CHECK 1
+#define NOCHECK 2
+#define CHECKED 3
+#define USERAIO 4
+#define USERAIO_DONE 5
+
+/* values for _aio_flags */
+
+/*
+ * if set, _aiodone() notifies aio_waitn about done requests
+ * from the threads
+ */
+#define AIO_WAIT_INPROGRESS 0x1
+
+/*
+ * if set, _aiodone() wakes up functions waiting for completed I/Os
+ */
+#define AIO_IO_WAITING 0x2
+
+#define AIO_LIB_WAITN 0x4 /* aio_waitn in progress */
+#define AIO_LIB_WAITN_PENDING 0x8 /* aio_waitn requests pending */
+
+/*
+ * Before a kaio() system call, the fd will be checked
+ * to ensure that kernel async. I/O is supported for this file.
+ * The only way to find out is if a kaio() call returns ENOTSUP,
+ * so the default will always be to try the kaio() call. Only in
+ * the specific instance of a kaio() call returning ENOTSUP
+ * will we stop submitting kaio() calls for that fd.
+ * If the fd is outside the array bounds, we will allow the kaio()
+ * call.
+ *
+ * The only way that an fd entry can go from ENOTSUP to supported
+ * is if that fd is freed up by a close(), and close will clear
+ * the entry for that fd.
+ *
+ * Each fd gets a bit in the array _kaio_supported[].
+ *
+ * uint32_t _kaio_supported[MAX_KAIO_FDARRAY_SIZE];
+ *
+ * The array holds MAX_KAIO_FDARRAY_SIZE 32-bit elements, for 4KB.
+ * If more than (MAX_KAIO_FDARRAY_SIZE * KAIO_FDARRAY_ELEM_SIZE)
+ * files are open, this can be expanded.
+ */
+
+#define MAX_KAIO_FDARRAY_SIZE 1024
+#define KAIO_FDARRAY_ELEM_SIZE WORD_BIT /* uint32_t */
+
+#define MAX_KAIO_FDS (MAX_KAIO_FDARRAY_SIZE * KAIO_FDARRAY_ELEM_SIZE)
+
+#define VALID_FD(fdes) (((fdes) >= 0) && ((fdes) < MAX_KAIO_FDS))
+
+#define KAIO_SUPPORTED(fdes) \
+ ((!VALID_FD(fdes)) || \
+ ((_kaio_supported[(fdes) / KAIO_FDARRAY_ELEM_SIZE] & \
+ (uint32_t)(1 << ((fdes) % KAIO_FDARRAY_ELEM_SIZE))) == 0))
+
+#define SET_KAIO_NOT_SUPPORTED(fdes) \
+ if (VALID_FD((fdes))) \
+ _kaio_supported[(fdes) / KAIO_FDARRAY_ELEM_SIZE] |= \
+ (uint32_t)(1 << ((fdes) % KAIO_FDARRAY_ELEM_SIZE))
+
+#define CLEAR_KAIO_SUPPORTED(fdes) \
+ if (VALID_FD((fdes))) \
+ _kaio_supported[(fdes) / KAIO_FDARRAY_ELEM_SIZE] &= \
+ ~(uint32_t)(1 << ((fdes) % KAIO_FDARRAY_ELEM_SIZE))
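+
+/*
+ * Worked example (assuming WORD_BIT == 32): for fdes == 100 the
+ * bit lives in word 100 / 32 == 3 at position 100 % 32 == 4, so
+ * SET_KAIO_NOT_SUPPORTED(100) executes
+ *
+ *	_kaio_supported[3] |= (uint32_t)(1 << 4);
+ */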
+
+typedef struct aio_worker {
+ /*
+ * fields protected by _aio_mutex lock
+ */
+ struct aio_worker *work_forw; /* forward link in list of workers */
+ struct aio_worker *work_backw; /* backwards link in list of workers */
+ /*
+ * fields require no locking.
+ */
+ thread_t work_tid; /* worker's thread-id */
+ mutex_t work_qlock1; /* lock for work queue 1 */
+ struct aio_req *work_head1; /* head of work request queue 1 */
+ struct aio_req *work_tail1; /* tail of work request queue 1 */
+ struct aio_req *work_next1; /* work queue one's next pointer */
+ struct aio_req *work_prev1; /* last request done from queue 1 */
+ int work_cnt1; /* length of work queue one */
+ int work_done1; /* number of requests done */
+ int work_minload1; /* min length of queue */
+ struct aio_req *work_req; /* active work request */
+ int work_idleflg; /* when set, worker is idle */
+ cond_t work_idle_cv; /* place to sleep when idle */
+ mutex_t work_lock; /* protects work flags */
+ sigjmp_buf work_jmp_buf; /* cancellation point */
+ char work_cancel_flg; /* flag set when at cancellation pt */
+} aio_worker_t;
+
+extern void _kaio_init(void);
+extern intptr_t _kaio(int, ...);
+extern int _aiorw(int, caddr_t, int, offset_t, int, aio_result_t *, int);
+extern int _aio_rw(aiocb_t *, aio_lio_t *, aio_worker_t **, int, int,
+ struct sigevent *);
+extern int __aio_fsync(int, aiocb_t *);
+#if defined(_LARGEFILE64_SOURCE) && !defined(_LP64)
+extern int _aio_rw64(aiocb64_t *, aio_lio_t *, aio_worker_t **, int, int,
+ struct sigevent *);
+extern int __aio_fsync64(int, aiocb64_t *);
+#endif
+extern int aiocancel_all(int);
+extern int _aio_create_worker(aio_req_t *, int);
+extern void *_aio_send_sigev(void *);
+
+extern void _aio_cancel_on(aio_worker_t *);
+extern void _aio_cancel_off(aio_worker_t *);
+extern int _aio_cancel_req(aio_worker_t *, aio_req_t *, int *, int *);
+
+extern void _aio_forkinit(void);
+extern void _aiopanic(char *);
+extern void _aio_lock(void);
+extern void _aio_unlock(void);
+extern void _aio_req_free(aio_req_t *);
+extern aio_req_t *_aio_hash_del(aio_result_t *);
+extern int _fill_aiocache(int);
+
+extern aio_worker_t *_aio_alloc_worker(void);
+extern void _aio_free_worker(void *);
+
+extern void _aio_idle(struct aio_worker *);
+extern void __aiosendsig(void);
+extern void *_aio_do_request(void *);
+extern void _aio_remove(aio_req_t *);
+extern void _lio_remove(aio_lio_t *);
+extern aio_req_t *_aio_req_remove(aio_req_t *);
+extern int _aio_get_timedelta(struct timespec *, struct timespec *);
+
+extern int _close(int);
+extern int __sigqueue(pid_t pid, int signo,
+ /* const union sigval */ void *value, int si_code);
+extern pid_t _fork(void);
+extern int _sigaction(int sig, const struct sigaction *act,
+ struct sigaction *oact);
+extern int _sigemptyset(sigset_t *set);
+extern int _sigaddset(sigset_t *set, int signo);
+extern int _sigismember(sigset_t *set, int signo);
+extern int _sigprocmask(int how, const sigset_t *set, sigset_t *oset);
+extern void aiosigcancelhndlr(int, siginfo_t *, void *);
+
+extern aio_worker_t *__nextworker_rd; /* worker chosen for next rd request */
+extern aio_worker_t *__workers_rd; /* list of all rd workers */
+extern int __rd_workerscnt; /* number of rd workers */
+extern aio_worker_t *__nextworker_wr; /* worker chosen for next wr request */
+extern aio_worker_t *__workers_wr; /* list of all wr workers */
+extern int __wr_workerscnt; /* number of wr workers */
+extern aio_worker_t *__nextworker_si; /* worker chosen for next si request */
+extern aio_worker_t *__workers_si; /* list of all si workers */
+extern int __si_workerscnt; /* number of si workers */
+extern int __aiostksz; /* stack size for workers */
+extern mutex_t __aio_mutex; /* global aio lock that's SIGIO-safe */
+extern mutex_t __lio_mutex; /* global lio lock */
+extern int _max_workers; /* max number of workers permitted */
+extern int _min_workers; /* min number of workers */
+extern sigset_t _worker_set; /* worker's signal mask */
+extern int _aio_worker_cnt; /* number of AIO workers */
+extern int _sigio_enabled; /* when set, send SIGIO signal */
+extern int __sigio_pending; /* count of pending SIGIO signals */
+extern int __sigio_masked; /* when set, SIGIO is masked */
+extern int __sigio_maskedcnt; /* count number times bit mask is set */
+extern pid_t __pid; /* process's PID */
+extern int _kaio_ok; /* indicates if kaio is initialized */
+extern thread_key_t _aio_key; /* for thread-specific data */
+extern struct sigaction sigcanact; /* action for SIGAIOCANCEL */
+extern int _pagesize;
+
+/*
+ * Array for determining whether or not a file supports kaio
+ *
+ */
+extern uint32_t _kaio_supported[];
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LIBAIO_H */
diff --git a/usr/src/lib/libaio/common/llib-laio b/usr/src/lib/libaio/common/llib-laio
new file mode 100644
index 0000000000..9bf215ab3a
--- /dev/null
+++ b/usr/src/lib/libaio/common/llib-laio
@@ -0,0 +1,84 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/* LINTLIBRARY */
+/* PROTOLIB1 */
+
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <signal.h>
+#include <libaio.h>
+
+/*
+ * usr/src/lib/libaio/common
+ */
+
+/* aio.c */
+int aioread(int fd, caddr_t buf, int bufsz, off_t offset, int whence,
+ aio_result_t *resultp);
+int aiowrite(int fd, caddr_t buf, int bufsz, off_t offset, int whence,
+ aio_result_t *resultp);
+int aioread64(int fd, caddr_t buf, int bufsz, off64_t offset, int whence,
+ aio_result_t *resultp);
+int aiowrite64(int fd, caddr_t buf, int bufsz, off64_t offset, int whence,
+ aio_result_t *resultp);
+int aiocancel(aio_result_t *resultp);
+aio_result_t *aiowait(struct timeval *uwait);
+
+/* scalls.c */
+int _libaio_close(int fd);
+pid_t _libaio_fork(void);
+
+/* ma.c */
+
+/* posix_aio.c */
+int __aio_read(aiocb_t *cb);
+int __aio_write(aiocb_t *cb);
+int __lio_listio(int mode, aiocb_t * const list[],
+ int nent, struct sigevent *sig);
+int __aio_suspend(void **list, int nent, const timespec_t *timo, int lf);
+int __aio_error(aiocb_t *cb);
+ssize_t __aio_return(aiocb_t *cb);
+int __aio_fsync(int op, aiocb_t *aiocbp);
+int __aio_cancel(int fd, aiocb_t *aiocbp);
+int __aio_waitn(void **list, uint_t nent, uint_t *nwait,
+ const struct timespec *timeout, int mode);
+int __aio_read64(aiocb64_t *cb);
+int __aio_write64(aiocb64_t *cb);
+int __lio_listio64(int mode, aiocb64_t *const list[],
+ int nent, struct sigevent *sig);
+int __aio_error64(aiocb64_t *cb);
+ssize_t __aio_return64(aiocb64_t *cb);
+int __aio_fsync64(int op, aiocb64_t *aiocbp);
+int __aio_cancel64(int fd, aiocb64_t *aiocbp);
+
+/* sig.c */
+
+/* subr.c */
+int assfail(char *a, char *f, int l);
diff --git a/usr/src/lib/libaio/common/ma.c b/usr/src/lib/libaio/common/ma.c
new file mode 100644
index 0000000000..07f540f9c1
--- /dev/null
+++ b/usr/src/lib/libaio/common/ma.c
@@ -0,0 +1,60 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 1992-2003 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+
+#include "libaio.h"
+
+/*
+ * Allocate a worker control block.
+ * We just use malloc(), like everywhere else in libaio.
+ * A more sophisticated allocator could be used, but oh well...
+ */
+aio_worker_t *
+_aio_alloc_worker()
+{
+ aio_worker_t *aiowp;
+
+ aiowp = malloc(sizeof (aio_worker_t));
+ if (aiowp != NULL) {
+ (void) memset(aiowp, 0, sizeof (aio_worker_t));
+ (void) mutex_init(&aiowp->work_qlock1, USYNC_THREAD, NULL);
+ (void) mutex_init(&aiowp->work_lock, USYNC_THREAD, NULL);
+ (void) cond_init(&aiowp->work_idle_cv, USYNC_THREAD, NULL);
+ }
+ return (aiowp);
+}
+
+/*
+ * Free a worker control block.
+ * Declared with void *arg so it can be a thr_keycreate() destructor.
+ */
+void
+_aio_free_worker(void *arg)
+{
+ free(arg);
+}
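+
+/*
+ * Illustrative pairing with the thread-specific-data key declared
+ * in libaio.h (sketch; the actual registration happens elsewhere
+ * in the library):
+ *
+ *	(void) thr_keycreate(&_aio_key, _aio_free_worker);
+ *	...
+ *	aiowp = _aio_alloc_worker();
+ *	(void) thr_setspecific(_aio_key, aiowp);
+ */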
diff --git a/usr/src/lib/libaio/common/posix_aio.c b/usr/src/lib/libaio/common/posix_aio.c
new file mode 100644
index 0000000000..15155fceeb
--- /dev/null
+++ b/usr/src/lib/libaio/common/posix_aio.c
@@ -0,0 +1,1720 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * posix_aio.c implements the POSIX async. I/O
+ * functions for librt
+ *
+ * aio_read
+ * aio_write
+ * aio_error
+ * aio_return
+ * aio_suspend
+ * lio_listio
+ * aio_fsync
+ * aio_cancel
+ */
+
+#include "libaio.h"
+#include <sys/file.h>
+
+extern int __fdsync(int, int);
+extern aio_req_t *_aio_hash_find(aio_result_t *);
+
+/* __aio_suspend stuff */
+
+extern int _aio_kernel_suspend;
+extern int _aio_suscv_cnt;
+
+/* __aio_waitn stuff */
+
+static mutex_t __aio_waitn_mutex = DEFAULTMUTEX; /* 1 aio_waitn per process */
+static cond_t _aio_waitn_cv = DEFAULTCV; /* wait for end of aio_waitn */
+extern int _aio_flags;
+extern cond_t _aio_iowait_cv;
+extern int _aio_doneq_cnt;
+extern int _aio_outstand_cnt;
+extern int _aio_waitncnt;
+
+static int _aio_check_timeout(const struct timespec *, struct timespec *,
+ int *);
+
+/* defines for timedwait in __aio_waitn() and __aio_suspend() */
+#define AIO_TIMEOUT_INDEF -1
+#define AIO_TIMEOUT_POLL 0
+#define AIO_TIMEOUT_WAIT 1
+#define AIO_TIMEOUT_UNDEF 2
+
+/*
+ * List I/O list head stuff
+ */
+static aio_lio_t *_lio_head_freelist = NULL;
+static int _aio_lio_alloc(aio_lio_t **);
+static void _aio_lio_free(aio_lio_t *);
+static void _lio_list_decr(aio_lio_t *);
+
+int
+__aio_read(aiocb_t *cb)
+{
+ aio_lio_t *head = NULL;
+
+ if ((cb == NULL) || cb->aio_reqprio < 0) {
+ errno = EINVAL;
+ return (-1);
+ }
+
+ cb->aio_lio_opcode = LIO_READ;
+ return (_aio_rw(cb, head, &__nextworker_rd, AIOAREAD,
+ (AIO_KAIO | AIO_NO_DUPS), NULL));
+}
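+
+/*
+ * Caller's view of the exported aio_read(3RT) entry that lands here
+ * (illustrative; fd and buf are hypothetical):
+ *
+ *	struct aiocb cb;
+ *	(void) memset(&cb, 0, sizeof (cb));
+ *	cb.aio_fildes = fd;
+ *	cb.aio_buf = buf;
+ *	cb.aio_nbytes = sizeof (buf);
+ *	cb.aio_offset = 0;
+ *	if (aio_read(&cb) != 0)
+ *		... submission failed, inspect errno ...
+ */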
+
+int
+__aio_write(aiocb_t *cb)
+{
+ aio_lio_t *head = NULL;
+
+ if ((cb == NULL) || cb->aio_reqprio < 0) {
+ errno = EINVAL;
+ return (-1);
+ }
+
+ cb->aio_lio_opcode = LIO_WRITE;
+ return (_aio_rw(cb, head, &__nextworker_wr, AIOAWRITE,
+ (AIO_KAIO | AIO_NO_DUPS), NULL));
+}
+
+
+int
+__lio_listio(int mode, aiocb_t * const list[],
+ int nent, struct sigevent *sig)
+{
+ int i, err;
+ int aio_ufs = 0;
+ int oerrno = 0;
+ aio_lio_t *head = NULL;
+ int state = 0;
+ static long aio_list_max = 0;
+ aio_worker_t **nextworker;
+ int EIOflg = 0;
+ int rw;
+ int do_kaio = 0;
+
+ if (!_kaio_ok)
+ _kaio_init();
+
+ if (aio_list_max == 0)
+ aio_list_max = sysconf(_SC_AIO_LISTIO_MAX);
+
+ if (nent < 0 || (long)nent > aio_list_max) {
+ errno = EINVAL;
+ return (-1);
+ }
+
+ switch (mode) {
+ case LIO_WAIT:
+ state = NOCHECK;
+ break;
+ case LIO_NOWAIT:
+ state = CHECK;
+ break;
+ default:
+ errno = EINVAL;
+ return (-1);
+ }
+
+ for (i = 0; i < nent; i++) {
+ if (list[i]) {
+ if (list[i]->aio_lio_opcode != LIO_NOP) {
+ list[i]->aio_state = state;
+ if (KAIO_SUPPORTED(list[i]->aio_fildes))
+ do_kaio++;
+ else
+ list[i]->aio_resultp.aio_errno =
+ ENOTSUP;
+ } else
+ list[i]->aio_state = NOCHECK;
+ }
+ }
+
+ if (do_kaio) {
+ if ((err = (int)_kaio(AIOLIO, mode, list, nent, sig)) == 0)
+ return (0);
+ oerrno = errno;
+ } else {
+ oerrno = errno = ENOTSUP;
+ err = -1;
+ }
+ if ((err == -1) && (errno == ENOTSUP)) {
+ err = errno = 0;
+ /*
+ * If LIO_WAIT, or signal required, allocate a list head.
+ */
+ if ((mode == LIO_WAIT) || ((sig) &&
+ (sig->sigev_notify == SIGEV_SIGNAL)))
+ (void) _aio_lio_alloc(&head);
+ if (head) {
+ (void) mutex_lock(&head->lio_mutex);
+ head->lio_mode = (char)mode;
+ if ((mode == LIO_NOWAIT) && (sig) &&
+ (sig->sigev_notify != SIGEV_NONE) &&
+ (sig->sigev_signo > 0)) {
+ head->lio_signo = sig->sigev_signo;
+ head->lio_sigval.sival_ptr =
+ sig->sigev_value.sival_ptr;
+ } else
+ head->lio_signo = 0;
+ head->lio_nent = head->lio_refcnt = nent;
+ (void) mutex_unlock(&head->lio_mutex);
+ }
+ /*
+ * find UFS requests (errno == ENOTSUP/EBADFD)
+ */
+ for (i = 0; i < nent; i++) {
+ if (list[i] &&
+ ((list[i]->aio_resultp.aio_errno == ENOTSUP) ||
+ (list[i]->aio_resultp.aio_errno == EBADFD))) {
+ if (list[i]->aio_lio_opcode == LIO_NOP) {
+ if (head)
+ _lio_list_decr(head);
+ continue;
+ }
+ if (list[i]->aio_resultp.aio_errno == EBADFD)
+ SET_KAIO_NOT_SUPPORTED(
+ list[i]->aio_fildes);
+ if (list[i]->aio_reqprio < 0) {
+ list[i]->aio_resultp.aio_errno =
+ EINVAL;
+ list[i]->aio_resultp.aio_return = -1;
+ EIOflg = 1;
+ if (head)
+ _lio_list_decr(head);
+ continue;
+ }
+ /*
+ * submit an AIO request with flags AIO_NO_KAIO
+ * to avoid the kaio() syscall in _aio_rw()
+ */
+ switch (list[i]->aio_lio_opcode) {
+ case LIO_READ:
+ rw = AIOAREAD;
+ nextworker = &__nextworker_rd;
+ break;
+ case LIO_WRITE:
+ rw = AIOAWRITE;
+ nextworker = &__nextworker_wr;
+ break;
+ }
+ if (sig && sig->sigev_notify == SIGEV_PORT)
+ err = _aio_rw(list[i], head, nextworker,
+ rw, (AIO_NO_KAIO | AIO_NO_DUPS),
+ sig);
+ else
+ err = _aio_rw(list[i], head, nextworker,
+ rw, (AIO_NO_KAIO | AIO_NO_DUPS),
+ NULL);
+ if (err != 0) {
+ if (head)
+ _lio_list_decr(head);
+ list[i]->aio_resultp.aio_errno = err;
+ EIOflg = 1;
+ } else
+ aio_ufs++;
+
+ } else {
+ if (head)
+ _lio_list_decr(head);
+ continue;
+ }
+ }
+ }
+ if (EIOflg) {
+ errno = EIO;
+ return (-1);
+ }
+ if ((mode == LIO_WAIT) && (oerrno == ENOTSUP)) {
+ /*
+ * call kaio(AIOLIOWAIT) to get all outstanding
+ * kernel AIO requests
+ */
+ if ((nent - aio_ufs) > 0) {
+ (void) _kaio(AIOLIOWAIT, mode, list, nent, sig);
+ }
+ if (head && head->lio_nent > 0) {
+ (void) mutex_lock(&head->lio_mutex);
+ while (head->lio_refcnt > 0) {
+ errno = cond_wait(&head->lio_cond_cv,
+ &head->lio_mutex);
+ if (errno) {
+ (void) mutex_unlock(&head->lio_mutex);
+ return (-1);
+ }
+ }
+ (void) mutex_unlock(&head->lio_mutex);
+ for (i = 0; i < nent; i++) {
+ if (list[i] &&
+ list[i]->aio_resultp.aio_errno) {
+ errno = EIO;
+ return (-1);
+ }
+ }
+ }
+ return (0);
+ }
+ return (err);
+}
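+
+/*
+ * Caller's view of lio_listio(3RT) (illustrative):
+ *
+ *	aiocb_t *list[2] = { &rd_cb, &wr_cb };
+ *	rd_cb.aio_lio_opcode = LIO_READ;
+ *	wr_cb.aio_lio_opcode = LIO_WRITE;
+ *	if (lio_listio(LIO_WAIT, list, 2, NULL) != 0 && errno == EIO)
+ *		... one or more requests failed; check each element
+ *		    with aio_error()/aio_return() ...
+ */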
+
+static void
+_lio_list_decr(aio_lio_t *head)
+{
+ (void) mutex_lock(&head->lio_mutex);
+ head->lio_nent--;
+ head->lio_refcnt--;
+ (void) mutex_unlock(&head->lio_mutex);
+}
+
+extern void _cancelon(void);
+extern void _canceloff(void);
+
+int
+__aio_suspend(void **list, int nent, const timespec_t *timo, int largefile)
+{
+ int cv_err; /* error code from cond_xxx() */
+ int kerr; /* error code from _kaio(AIOSUSPEND) */
+ int i;
+ struct timespec twait; /* copy of timo for internal calculations */
+ struct timespec *wait = NULL;
+ int timedwait;
+ int req_outstanding;
+ aiocb_t **listp;
+ aiocb64_t **listp64;
+ hrtime_t hrtstart;
+ hrtime_t hrtend;
+ hrtime_t hrtres;
+
+ if (nent <= 0) {
+ errno = EINVAL;
+ return (-1);
+ }
+
+ if (timo) {
+ if (timo->tv_sec < 0 || timo->tv_nsec < 0 ||
+ timo->tv_nsec >= NANOSEC) {
+ errno = EINVAL;
+ return (-1);
+ }
+ /* Initialize start time if time monitoring desired */
+ if (timo->tv_sec > 0 || timo->tv_nsec > 0) {
+ timedwait = AIO_TIMEOUT_WAIT;
+ hrtstart = gethrtime();
+ } else {
+ /* content of timeout = 0 : polling */
+ timedwait = AIO_TIMEOUT_POLL;
+ }
+ } else {
+ /* timeout pointer = NULL : wait indefinitely */
+ timedwait = AIO_TIMEOUT_INDEF;
+ }
+
+ if (largefile) {
+ /* _LARGEFILE64_SOURCE && !_LP64 */
+ listp64 = (aiocb64_t **)list;
+ for (i = 0; i < nent; i++) {
+ if (listp64[i] && listp64[i]->aio_state == CHECK)
+ listp64[i]->aio_state = CHECKED;
+ }
+ } else {
+ listp = (aiocb_t **)list;
+ for (i = 0; i < nent; i++) {
+ if (listp[i] && listp[i]->aio_state == CHECK)
+ listp[i]->aio_state = CHECKED;
+ }
+ }
+
+ /*
+ * The next "if -case" is required to accelerate the
+ * access to completed RAW-IO requests.
+ */
+
+ if ((_aio_doneq_cnt + _aio_outstand_cnt) == 0) {
+ /* Only kernel requests pending */
+
+ _cancelon();
+
+ /*
+ * _aio_kernel_suspend is used to detect completed non RAW-IO
+ * requests.
+ * As long as this thread resides in the kernel (_kaio) further
+ * asynchronous non RAW-IO requests could be submitted.
+ */
+ _aio_lock();
+ _aio_kernel_suspend++;
+ _aio_unlock();
+
+ /*
+ * Always do the kaio() call without using the KAIO_SUPPORTED()
+ * checks because it is not mandatory to have a valid fd
+ * set in the list entries, only the resultp must be set.
+ *
+ * _kaio(AIOSUSPEND ...) return values :
+ * 0: everything ok, completed request found
+ * -1: error
+ * 1: no error : _aiodone awakened the _kaio(AIOSUSPEND,,)
+ *    system call using _kaio(AIONOTIFY). It means that some
+ *    non RAW-IOs completed in the meantime.
+ */
+
+ if (largefile)
+ kerr = (int)_kaio(AIOSUSPEND64, list, nent, timo, -1);
+ else
+ kerr = (int)_kaio(AIOSUSPEND, list, nent, timo, -1);
+
+ _aio_lock();
+ _aio_kernel_suspend--;
+ _aio_unlock();
+
+ _canceloff();
+ if (!kerr)
+ return (0);
+ } else {
+ kerr = 1; /* simulation: _kaio detected AIONOTIFY */
+ }
+
+ /* Return kernel error code, if no other IOs are outstanding */
+
+ _aio_lock();
+ req_outstanding = _aio_doneq_cnt + _aio_outstand_cnt;
+ _aio_unlock();
+
+ if (req_outstanding == 0) {
+ /* no IOs outstanding in the thread pool */
+ if (kerr == 1)
+ /* return "no IOs completed" */
+ errno = EAGAIN;
+ return (-1);
+ }
+
+ /* IOs using the thread pool are outstanding */
+
+ if (timedwait == AIO_TIMEOUT_WAIT) {
+ /* time monitoring */
+ hrtend = hrtstart + (hrtime_t)timo->tv_sec * (hrtime_t)NANOSEC +
+ (hrtime_t)timo->tv_nsec;
+ hrtres = hrtend - gethrtime();
+ if (hrtres <= 0)
+ hrtres = 1;
+ twait.tv_sec = hrtres / (hrtime_t)NANOSEC;
+ twait.tv_nsec = hrtres % (hrtime_t)NANOSEC;
+ wait = &twait;
+ } else {
+ if (timedwait == AIO_TIMEOUT_POLL) {
+ twait = *timo; /* content of timo = 0 : polling */
+ wait = &twait;
+ }
+ }
+
+ for (;;) {
+ int aio_errno;
+ int aio_inprogress;
+
+ /* first scan file system requests */
+ aio_inprogress = 0;
+ if (largefile) {
+ for (i = 0; i < nent; i++) {
+ if (listp64[i] == NULL)
+ continue;
+ aio_errno = listp64[i]->aio_resultp.aio_errno;
+ if (aio_errno == EINPROGRESS) {
+ aio_inprogress = 1;
+ } else {
+ if (aio_errno != ECANCELED) {
+ errno = 0;
+ return (0);
+ }
+ }
+ }
+ } else {
+ for (i = 0; i < nent; i++) {
+ if (listp[i] == NULL)
+ continue;
+ aio_errno = listp[i]->aio_resultp.aio_errno;
+ if (aio_errno == EINPROGRESS) {
+ aio_inprogress = 1;
+ } else {
+ if (aio_errno != ECANCELED) {
+ errno = 0;
+ return (0);
+ }
+ }
+ }
+ }
+
+ /*
+ * If there are no outstanding I/Os in the thread pool, then
+ * we have to return here, provided that all kernel RAW-IOs
+ * also completed.
+ * If the kernel was notified to return, then we have to check
+ * possible pending RAW-IOs.
+ */
+ if (_aio_outstand_cnt == 0 && aio_inprogress == 0 &&
+ kerr != 1) {
+ errno = EAGAIN;
+ break;
+ }
+
+ /*
+ * There are outstanding IOs in the thread pool or the kernel
+ * was notified to return.
+ * Check pending RAW-IOs first.
+ */
+ if (kerr == 1) {
+ /*
+ * _aiodone just notified the kernel about
+ * completed non RAW-IOs (AIONOTIFY was detected).
+ */
+ if (timedwait == AIO_TIMEOUT_WAIT) {
+ /* Update remaining timeout for the kernel */
+ hrtres = hrtend - gethrtime();
+ if (hrtres <= 0) {
+ /* timer expired */
+ errno = EAGAIN;
+ break;
+ }
+ wait->tv_sec = hrtres / (hrtime_t)NANOSEC;
+ wait->tv_nsec = hrtres % (hrtime_t)NANOSEC;
+ }
+ _aio_lock();
+ _aio_kernel_suspend++;
+ _aio_unlock();
+
+ _cancelon();
+ if (largefile)
+ kerr = (int)_kaio(AIOSUSPEND64, list, nent,
+ wait, -1);
+ else
+ kerr = (int)_kaio(AIOSUSPEND, list, nent,
+ wait, -1);
+ _canceloff();
+
+ _aio_lock();
+ _aio_kernel_suspend--;
+ _aio_unlock();
+
+ if (!kerr) {
+ return (0);
+ }
+ }
+
+ if (timedwait == AIO_TIMEOUT_POLL) {
+ errno = EAGAIN;
+ break;
+ }
+
+ if (timedwait == AIO_TIMEOUT_WAIT) {
+ /* Update remaining timeout */
+ hrtres = hrtend - gethrtime();
+ if (hrtres <= 0) {
+ /* timer expired */
+ errno = EAGAIN;
+ break;
+ }
+ wait->tv_sec = hrtres / (hrtime_t)NANOSEC;
+ wait->tv_nsec = hrtres % (hrtime_t)NANOSEC;
+ }
+
+ _aio_lock();
+ if (_aio_outstand_cnt == 0) {
+ _aio_unlock();
+ continue;
+ }
+
+ _aio_suscv_cnt++; /* ID for _aiodone (wake up) */
+
+ if (timedwait == AIO_TIMEOUT_WAIT) {
+ cv_err = cond_reltimedwait(&_aio_iowait_cv,
+ &__aio_mutex, wait);
+
+ if (cv_err == ETIME)
+ cv_err = EAGAIN;
+ } else {
+ /* wait indefinitely */
+ cv_err = cond_wait(&_aio_iowait_cv, &__aio_mutex);
+ }
+
+ _aio_suscv_cnt--;
+ _aio_unlock();
+
+ if (cv_err) {
+ errno = cv_err;
+ break;
+ }
+ }
+ return (-1);
+}
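+
+/*
+ * Caller's view (illustrative): wait up to one second for any of
+ * the listed requests to complete.
+ *
+ *	const aiocb_t *list[1] = { &cb };
+ *	struct timespec ts = { 1, 0 };
+ *	if (aio_suspend(list, 1, &ts) != 0 && errno == EAGAIN)
+ *		... timed out, requests still in progress ...
+ */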
+
+int
+__aio_error(aiocb_t *cb)
+{
+ aio_req_t *reqp;
+ int aio_errno = cb->aio_resultp.aio_errno;
+
+ if (aio_errno == EINPROGRESS) {
+ if (cb->aio_state == CHECK) {
+ /*
+ * Always do the kaio() call without using
+ * the KAIO_SUPPORTED()
+ * checks because it is not mandatory to
+ * have a valid fd
+ * set in the aiocb, only the resultp must be set.
+ */
+ if (((int)_kaio(AIOERROR, cb)) == EINVAL) {
+ errno = EINVAL;
+ return (-1);
+ }
+ } else if (cb->aio_state == CHECKED)
+ cb->aio_state = CHECK;
+ } else if (cb->aio_state == USERAIO) {
+ _aio_lock();
+ if (reqp = _aio_hash_find(&cb->aio_resultp)) {
+ cb->aio_state = NOCHECK;
+ _lio_remove(reqp->lio_head);
+ (void) _aio_hash_del(reqp->req_resultp);
+ (void) _aio_req_remove(reqp);
+ _aio_req_free(reqp);
+ }
+ _aio_unlock();
+ }
+ return (aio_errno);
+}
+
+ssize_t
+__aio_return(aiocb_t *cb)
+{
+ ssize_t ret;
+ aio_req_t *reqp;
+
+ /*
+ * graceful detection of an invalid cb is not possible. a
+ * SIGSEGV will be generated if it is invalid.
+ */
+ if (cb == NULL) {
+ errno = EINVAL;
+ exit(-1);
+ }
+
+ /*
+ * we use this condition to indicate that
+ * aio_return has been called before
+ */
+ if (cb->aio_resultp.aio_return == -1 &&
+ cb->aio_resultp.aio_errno == EINVAL) {
+ errno = EINVAL;
+ return (-1);
+ }
+
+ /*
+ * Before we return mark the result as being returned so that later
+ * calls to aio_return() will return the fact that the result has
+ * already been returned
+ */
+ ret = cb->aio_resultp.aio_return;
+ cb->aio_resultp.aio_return = -1;
+ cb->aio_resultp.aio_errno = EINVAL;
+ if (cb->aio_state == USERAIO) {
+ _aio_lock();
+ if (reqp = _aio_hash_find(&cb->aio_resultp)) {
+ cb->aio_state = NOCHECK;
+ _lio_remove(reqp->lio_head);
+ (void) _aio_hash_del(reqp->req_resultp);
+ (void) _aio_req_remove(reqp);
+ _aio_req_free(reqp);
+ }
+ _aio_unlock();
+ }
+ return (ret);
+
+}
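+
+/*
+ * Canonical completion check built on these two functions
+ * (illustrative); note that aio_return() is valid exactly once
+ * per request:
+ *
+ *	while ((err = aio_error(&cb)) == EINPROGRESS)
+ *		... wait, or do other work ...
+ *	ret = aio_return(&cb);
+ *	if (ret == -1)
+ *		... the I/O failed with error "err" ...
+ */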
+
+void
+_lio_remove(aio_lio_t *head)
+{
+ int refcnt;
+
+ if (head) {
+ (void) mutex_lock(&head->lio_mutex);
+ refcnt = --head->lio_nent;
+ (void) mutex_unlock(&head->lio_mutex);
+ if (!refcnt)
+ _aio_lio_free(head);
+ }
+}
+
+void
+_aio_remove(aio_req_t *reqp)
+{
+ _lio_remove(reqp->lio_head);
+ _aio_lock();
+ (void) _aio_hash_del(reqp->req_resultp);
+ (void) _aio_req_remove(reqp);
+ _aio_req_free(reqp);
+ _aio_unlock();
+}
+
+int
+_aio_lio_alloc(aio_lio_t **head)
+{
+ aio_lio_t *lio_head;
+
+ (void) mutex_lock(&__lio_mutex);
+ if (_lio_head_freelist == NULL) {
+ lio_head = (aio_lio_t *)malloc(sizeof (aio_lio_t));
+ } else {
+ lio_head = _lio_head_freelist;
+ _lio_head_freelist = lio_head->lio_next;
+ }
+ if (lio_head == NULL) {
+ (void) mutex_unlock(&__lio_mutex);
+ return (-1);
+ }
+ (void) memset(lio_head, 0, sizeof (aio_lio_t));
+ (void) cond_init(&lio_head->lio_cond_cv, USYNC_THREAD, NULL);
+ (void) mutex_init(&lio_head->lio_mutex, USYNC_THREAD, NULL);
+ *head = lio_head;
+ (void) mutex_unlock(&__lio_mutex);
+ return (0);
+}
+
+void
+_aio_lio_free(aio_lio_t *head)
+{
+ (void) mutex_lock(&__lio_mutex);
+ head->lio_next = _lio_head_freelist;
+ _lio_head_freelist = head;
+ (void) mutex_unlock(&__lio_mutex);
+}
+
+/*
+ * This function returns the number of asynchronous I/O requests submitted.
+ */
+
+static int
+__aio_fsync_bar(aiocb_t *cb, aio_lio_t *head, aio_worker_t *aiowp,
+ int workerscnt)
+{
+ int i;
+ int err;
+ aio_worker_t *next = aiowp;
+
+ for (i = 0; i < workerscnt; i++) {
+ err = _aio_rw(cb, head, &next, AIOFSYNC, AIO_NO_KAIO, NULL);
+ if (err != 0) {
+ (void) mutex_lock(&head->lio_mutex);
+ head->lio_mode = LIO_DESTROY; /* ignore fsync */
+ head->lio_nent -= workerscnt - i;
+ head->lio_refcnt -= workerscnt - i;
+ (void) mutex_unlock(&head->lio_mutex);
+ errno = EAGAIN;
+ return (i);
+ }
+ next = next->work_forw;
+ }
+ return (i);
+}
+
+/*
+ * This function is called from aio_fsync(3RT).
+ */
+
+int
+__aio_fsync(int op, aiocb_t *cb)
+{
+ struct stat buf;
+ aio_lio_t *head;
+ int retval;
+
+ if (cb == NULL) {
+ return (0);
+ }
+
+ if ((op != O_DSYNC) && (op != O_SYNC)) {
+ errno = EINVAL;
+ return (-1);
+ }
+
+ if (fstat(cb->aio_fildes, &buf) < 0)
+ return (-1);
+
+ /*
+ * The first asynchronous I/O request in the current process
+ * will create a bunch of workers.
+ * If the sum of workers (read + write) is zero then the
+ * number of pending asynchronous I/O requests is zero.
+ * In such a case only execute the standard fsync(3C) or
+ * fdatasync(3RT) as appropriate (see flag of __fdsync()).
+ */
+ if ((__wr_workerscnt + __rd_workerscnt) == 0) {
+ if (op == O_DSYNC)
+ return (__fdsync(cb->aio_fildes, FDSYNC));
+ else
+ return (__fdsync(cb->aio_fildes, FSYNC));
+ }
+
+ /*
+ * re-use aio_offset as the op field.
+ * O_DSYNC - fdatasync()
+ * O_SYNC - fsync()
+ */
+ cb->aio_offset = op;
+ cb->aio_lio_opcode = AIOFSYNC;
+
+ /*
+ * create a list of fsync requests. the worker
+ * that gets the last request will do the fsync
+ * request.
+ */
+ (void) _aio_lio_alloc(&head);
+ if (head == NULL) {
+ errno = EAGAIN;
+ return (-1);
+ }
+ head->lio_mode = LIO_FSYNC;
+ head->lio_signo = 0;
+ head->lio_nent = head->lio_refcnt = __wr_workerscnt + __rd_workerscnt;
+ /* insert an fsync request on every read worker's queue. */
+ retval = __aio_fsync_bar(cb, head, __workers_rd, __rd_workerscnt);
+ if (retval != __rd_workerscnt) {
+ /*
+ * Fewer fsync requests than workers means that
+ * it was not possible to submit fsync requests to all
+ * workers.
+ * Actions:
+ * a) number of fsync requests submitted is 0:
+ * => free allocated memory (aio_lio_t).
+ * b) number of fsync requests submitted is > 0:
+ * => the last worker executing the fsync request
+ * will free the aio_lio_t struct.
+ */
+ if (retval == 0)
+ _aio_lio_free(head);
+ return (-1);
+ }
+
+ /* insert an fsync request on every write worker's queue. */
+ retval = __aio_fsync_bar(cb, head, __workers_wr, __wr_workerscnt);
+ if (retval != __wr_workerscnt)
+ return (-1);
+ return (0);
+}
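+
+/*
+ * Caller's view of aio_fsync(3RT) (illustrative):
+ *
+ *	cb.aio_fildes = fd;
+ *	if (aio_fsync(O_DSYNC, &cb) == 0)
+ *		... completion is reported later through
+ *		    aio_error(&cb) / aio_return(&cb) ...
+ */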
+
+int
+__aio_cancel(int fd, aiocb_t *cb)
+{
+ aio_req_t *rp;
+ aio_worker_t *aiowp;
+ int done = 0;
+ int canceled = 0;
+ struct stat buf;
+
+ if (fstat(fd, &buf) < 0)
+ return (-1);
+
+ if (cb != NULL) {
+ if (cb->aio_state == USERAIO) {
+ _aio_lock();
+ rp = _aio_hash_find(&cb->aio_resultp);
+ if (rp == NULL) {
+ _aio_unlock();
+ return (AIO_ALLDONE);
+ } else {
+ aiowp = rp->req_worker;
+ (void) mutex_lock(&aiowp->work_qlock1);
+ (void) _aio_cancel_req(aiowp, rp, &canceled,
+ &done);
+ (void) mutex_unlock(&aiowp->work_qlock1);
+ _aio_unlock();
+ if (done)
+ return (AIO_ALLDONE);
+ else if (canceled)
+ return (AIO_CANCELED);
+ else
+ return (AIO_NOTCANCELED);
+ }
+ }
+
+ if (cb->aio_state == USERAIO_DONE)
+ return (AIO_ALLDONE);
+
+ return ((int)_kaio(AIOCANCEL, fd, cb));
+ }
+
+ return (aiocancel_all(fd));
+}
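+
+/*
+ * Caller's view of aio_cancel(3RT) (illustrative):
+ *
+ *	switch (aio_cancel(fd, &cb)) {
+ *	case AIO_CANCELED:	request was canceled
+ *	case AIO_NOTCANCELED:	too late, still in progress
+ *	case AIO_ALLDONE:	already completed
+ *	}
+ */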
+
+
+/*
+ * aio_waitn can be used to reap the results of several I/O operations that
+ * were submitted asynchronously. The submission of I/Os can be done using
+ * existing POSIX interfaces: lio_listio, aio_write or aio_read.
+ * aio_waitn waits until "nwait" I/Os (supplied as a parameter) have
+ * completed and it returns the descriptors for these I/Os in "list". The
+ * maximum size of this list is given by "nent" and the actual number of I/Os
+ * completed is returned in "nwait". aio_waitn may also return
+ * early if the timeout expires. aio_waitn returns 0 if
+ * successful or -1 if an error occurred.
+ */
+
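+/*
+ * Caller's view (illustrative): collect at least 4 completions out
+ * of up to 8 submitted requests, waiting indefinitely:
+ *
+ *	aiocb_t *done[8];
+ *	uint_t nwait = 4;
+ *	if (aio_waitn((void **)done, 8, &nwait, NULL) == 0)
+ *		... done[0 .. nwait - 1] hold completed control blocks ...
+ */
+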
+/*ARGSUSED*/
+int
+__aio_waitn(void **list, uint_t nent, uint_t *nwait,
+ const struct timespec *utimo, int largefile)
+{
+ int err = 0;
+ uint_t dnwait = 0; /* number of requests in the waitn-done list */
+ uint_t kwaitcnt; /* expected "done" requests from kernel */
+ uint_t knentcnt; /* max. expected "done" requests from kernel */
+ int uerrno = 0;
+ int kerrno = 0; /* save errno from _kaio() call */
+ int timedwait = AIO_TIMEOUT_UNDEF;
+ aio_req_t *aiorp;
+#if defined(_LARGEFILE64_SOURCE) && !defined(_LP64)
+ aiocb64_t *aiop64;
+#endif
+ struct timespec end;
+ struct timespec twait; /* copy of utimo for internal calculations */
+ struct timespec *wait = NULL;
+
+ /* nwait must be validated before it is dereferenced */
+ if (nwait == NULL) {
+ errno = EFAULT;
+ return (-1);
+ }
+
+ if (nent == 0 || *nwait == 0 || *nwait > nent) {
+ errno = EINVAL;
+ return (-1);
+ }
+
+ /*
+ * Only one running aio_waitn call per process allowed.
+ * Further calls will be blocked here until the running
+ * call finishes.
+ */
+
+ (void) mutex_lock(&__aio_waitn_mutex);
+
+ while (_aio_flags & AIO_LIB_WAITN) {
+
+ if (utimo && utimo->tv_sec == 0 && utimo->tv_nsec == 0) {
+ (void) mutex_unlock(&__aio_waitn_mutex);
+ *nwait = 0;
+ return (0);
+ }
+
+ _aio_flags |= AIO_LIB_WAITN_PENDING;
+ err = cond_wait(&_aio_waitn_cv, &__aio_waitn_mutex);
+ if (err != 0) {
+ (void) mutex_unlock(&__aio_waitn_mutex);
+ *nwait = 0;
+ errno = err;
+ return (-1);
+ }
+ }
+
+ _aio_flags |= AIO_LIB_WAITN;
+
+ (void) mutex_unlock(&__aio_waitn_mutex);
+
+ if (*nwait >= AIO_WAITN_MAXIOCBS) {
+ err = _aio_check_timeout(utimo, &end, &timedwait);
+ if (err) {
+ *nwait = 0;
+ return (-1);
+ }
+
+ if (timedwait != AIO_TIMEOUT_INDEF) {
+ twait = *utimo;
+ wait = &twait;
+ }
+ }
+
+ /*
+ * _aio_lock() is not required at this time, but the
+ * invariant is that "_aio_doneq_cnt" must be updated
+ * before "_aio_outstand_cnt". Otherwise we could hit
+ * a zero value in both counters during the transition
+ * time (see _aiodone).
+ *
+ * If both counters are still set to zero, then only
+ * kernel requests are currently outstanding (raw-I/Os).
+ */
+
+ if ((_aio_doneq_cnt + _aio_outstand_cnt) == 0) {
+
+ for (;;) {
+ kwaitcnt = *nwait - dnwait;
+ knentcnt = nent - dnwait;
+ if (knentcnt > AIO_WAITN_MAXIOCBS)
+ knentcnt = AIO_WAITN_MAXIOCBS;
+
+ kwaitcnt = (kwaitcnt > knentcnt) ? knentcnt : kwaitcnt;
+
+ err = (int)_kaio(AIOWAITN, &list[dnwait], knentcnt,
+ &kwaitcnt, wait);
+
+ if (err == 0) {
+ dnwait += kwaitcnt;
+ if (dnwait >= *nwait ||
+ *nwait < AIO_WAITN_MAXIOCBS)
+ break;
+
+ if (timedwait == AIO_TIMEOUT_WAIT) {
+ err = _aio_get_timedelta(&end, wait);
+ if (err == -1) {
+ /* timer expired */
+ errno = ETIME;
+ break;
+ }
+ }
+ continue;
+ }
+
+ if (errno == EAGAIN) {
+ if (dnwait > 0)
+ err = 0;
+ break;
+ }
+
+ if (errno == ETIME || errno == EINTR) {
+ dnwait += kwaitcnt;
+ break;
+ }
+
+ /* fatal error */
+ break;
+ }
+
+ *nwait = dnwait;
+
+ /* check for pending aio_waitn() calls */
+ (void) mutex_lock(&__aio_waitn_mutex);
+ _aio_flags &= ~AIO_LIB_WAITN;
+ if (_aio_flags & AIO_LIB_WAITN_PENDING) {
+ _aio_flags &= ~AIO_LIB_WAITN_PENDING;
+ (void) cond_signal(&_aio_waitn_cv);
+ }
+ (void) mutex_unlock(&__aio_waitn_mutex);
+
+ return (err);
+ }
+
+ /* File system I/Os outstanding ... */
+
+ if (timedwait == AIO_TIMEOUT_UNDEF) {
+ err = _aio_check_timeout(utimo, &end, &timedwait);
+ if (err) {
+ *nwait = 0;
+ return (-1);
+ }
+
+ if (timedwait != AIO_TIMEOUT_INDEF) {
+ twait = *utimo;
+ wait = &twait;
+ }
+ }
+
+ for (;;) {
+ uint_t sum_reqs;
+
+ /*
+ * Calculate sum of active non RAW-IO requests (sum_reqs).
+ * If the expected number of completed requests (*nwait) is
+ * greater than the calculated sum (sum_reqs), then
+ * use _kaio to check pending RAW-IO requests.
+ */
+
+ (void) mutex_lock(&__aio_mutex);
+ sum_reqs = _aio_doneq_cnt + dnwait + _aio_outstand_cnt;
+ kwaitcnt = (*nwait > sum_reqs) ? *nwait - sum_reqs : 0;
+ (void) mutex_unlock(&__aio_mutex);
+
+ if (kwaitcnt != 0) {
+
+ /* possibly some kernel I/Os outstanding */
+
+ knentcnt = nent - dnwait;
+ if (knentcnt > AIO_WAITN_MAXIOCBS)
+ knentcnt = AIO_WAITN_MAXIOCBS;
+
+ kwaitcnt = (kwaitcnt > knentcnt) ? knentcnt : kwaitcnt;
+
+ (void) mutex_lock(&__aio_waitn_mutex);
+ _aio_flags |= AIO_WAIT_INPROGRESS;
+ (void) mutex_unlock(&__aio_waitn_mutex);
+
+ err = (int)_kaio(AIOWAITN, &list[dnwait], knentcnt,
+ &kwaitcnt, wait);
+
+ (void) mutex_lock(&__aio_waitn_mutex);
+ _aio_flags &= ~AIO_WAIT_INPROGRESS;
+ (void) mutex_unlock(&__aio_waitn_mutex);
+
+ if (err == 0) {
+ dnwait += kwaitcnt;
+ } else {
+ switch (errno) {
+ case EINVAL:
+ case EAGAIN:
+ /* don't wait for kernel I/Os */
+ kerrno = 0; /* ignore _kaio() errno */
+ (void) mutex_lock(&__aio_mutex);
+ *nwait = _aio_doneq_cnt +
+ _aio_outstand_cnt + dnwait;
+ (void) mutex_unlock(&__aio_mutex);
+ err = 0;
+ break;
+ case EINTR:
+ case ETIME:
+ /* just scan for completed LIB I/Os */
+ dnwait += kwaitcnt;
+ timedwait = AIO_TIMEOUT_POLL;
+ kerrno = errno; /* save _kaio() errno */
+ err = 0;
+ break;
+ default:
+ kerrno = errno; /* save _kaio() errno */
+ break;
+ }
+ }
+
+ if (err)
+ break; /* fatal kernel error */
+ }
+
+ /* check completed FS requests in the "done" queue */
+
+ (void) mutex_lock(&__aio_mutex);
+ while (_aio_doneq_cnt && (dnwait < nent)) {
+ /* get done requests */
+ if ((aiorp = _aio_req_remove(NULL)) != NULL) {
+ (void) _aio_hash_del(aiorp->req_resultp);
+ list[dnwait++] = aiorp->req_iocb;
+#if defined(_LARGEFILE64_SOURCE) && !defined(_LP64)
+ if (largefile) {
+ aiop64 = (void *)aiorp->req_iocb;
+ aiop64->aio_state = USERAIO_DONE;
+ } else
+#endif
+ aiorp->req_iocb->aio_state =
+ USERAIO_DONE;
+ _aio_req_free(aiorp);
+ }
+ }
+
+ if (dnwait >= *nwait) {
+ /* min. requested amount of completed I/Os satisfied */
+ (void) mutex_unlock(&__aio_mutex);
+ break;
+ }
+
+ if (timedwait == AIO_TIMEOUT_WAIT) {
+ if ((err = _aio_get_timedelta(&end, wait)) == -1) {
+ /* timer expired */
+ (void) mutex_unlock(&__aio_mutex);
+ uerrno = ETIME;
+ break;
+ }
+ }
+
+ /*
+ * If some I/Os are outstanding and we have to wait for them,
+ * then sleep here.
+ * _aiodone() will wakeup this thread as soon as the
+ * required amount of completed I/Os is done.
+ */
+
+ if (_aio_outstand_cnt > 0 && timedwait != AIO_TIMEOUT_POLL) {
+
+ /*
+ * _aiodone() will wake up this thread as soon as
+ * - _aio_waitncnt requests are completed, or
+ * - _aio_outstand_cnt becomes zero.
+ * cond_reltimedwait() could also return with
+ * timeout error (ETIME).
+ */
+
+ if (*nwait < _aio_outstand_cnt)
+ _aio_waitncnt = *nwait;
+ else
+ _aio_waitncnt = _aio_outstand_cnt;
+
+ (void) mutex_lock(&__aio_waitn_mutex);
+ _aio_flags |= AIO_IO_WAITING;
+ (void) mutex_unlock(&__aio_waitn_mutex);
+
+ if (wait)
+ uerrno = cond_reltimedwait(&_aio_iowait_cv,
+ &__aio_mutex, wait);
+ else
+ uerrno = cond_wait(&_aio_iowait_cv,
+ &__aio_mutex);
+
+ (void) mutex_lock(&__aio_waitn_mutex);
+ _aio_flags &= ~AIO_IO_WAITING;
+ (void) mutex_unlock(&__aio_waitn_mutex);
+
+ if (uerrno == ETIME) {
+ timedwait = AIO_TIMEOUT_POLL;
+ (void) mutex_unlock(&__aio_mutex);
+ continue;
+ }
+
+ if (uerrno != 0)
+ timedwait = AIO_TIMEOUT_POLL;
+ }
+
+ (void) mutex_unlock(&__aio_mutex);
+ if (timedwait == AIO_TIMEOUT_POLL) {
+ /* polling or timer expired */
+ break;
+ }
+ }
+
+ /* check for pending aio_waitn() calls */
+ (void) mutex_lock(&__aio_waitn_mutex);
+ _aio_flags &= ~AIO_LIB_WAITN;
+ if (_aio_flags & AIO_LIB_WAITN_PENDING) {
+ _aio_flags &= ~AIO_LIB_WAITN_PENDING;
+ (void) cond_signal(&_aio_waitn_cv);
+ }
+ (void) mutex_unlock(&__aio_waitn_mutex);
+
+ *nwait = dnwait;
+
+ errno = uerrno == 0 ? kerrno : uerrno;
+ if (errno)
+ err = -1;
+ else
+ err = 0;
+
+ return (err);
+}
+
+/*
+ * timedwait values:
+ * AIO_TIMEOUT_POLL : polling
+ * AIO_TIMEOUT_WAIT : timed wait until the given timeout expires
+ * AIO_TIMEOUT_INDEF : wait indefinitely
+ */
+int
+_aio_check_timeout(const struct timespec *utimo, struct timespec *end,
+ int *timedwait)
+{
+ struct timeval curtime;
+
+ if (utimo) {
+ if ((utimo->tv_sec < 0) || (utimo->tv_nsec < 0) ||
+ (utimo->tv_nsec >= NANOSEC)) {
+ /*
+ * invalid timer values => return EINVAL
+ * check for pending aio_waitn() calls
+ */
+ (void) mutex_lock(&__aio_waitn_mutex);
+ _aio_flags &= ~AIO_LIB_WAITN;
+ if (_aio_flags & AIO_LIB_WAITN_PENDING) {
+ _aio_flags &= ~AIO_LIB_WAITN_PENDING;
+ (void) cond_signal(&_aio_waitn_cv);
+ }
+ (void) mutex_unlock(&__aio_waitn_mutex);
+ errno = EINVAL;
+ return (-1);
+ }
+
+ if ((utimo->tv_sec > 0) || (utimo->tv_nsec > 0)) {
+ (void) gettimeofday(&curtime, NULL);
+ end->tv_sec = utimo->tv_sec + curtime.tv_sec;
+ end->tv_nsec = utimo->tv_nsec +
+ 1000 * curtime.tv_usec;
+ if (end->tv_nsec >= NANOSEC) {
+ end->tv_nsec -= NANOSEC;
+ end->tv_sec += 1;
+ }
+ *timedwait = AIO_TIMEOUT_WAIT;
+ } else {
+ /* polling */
+ *timedwait = AIO_TIMEOUT_POLL;
+ }
+ } else {
+ *timedwait = AIO_TIMEOUT_INDEF; /* wait indefinitely */
+ }
+ return (0);
+}
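+
+/*
+ * Editor's sketch, not part of the original source: how the three
+ * timedwait modes fall out of _aio_check_timeout().  The caller code
+ * below is hypothetical.
+ *
+ *	struct timespec end;
+ *	int timedwait;
+ *
+ *	(void) _aio_check_timeout(NULL, &end, &timedwait);
+ *	// timedwait == AIO_TIMEOUT_INDEF: block until enough I/Os finish
+ *
+ *	struct timespec zero = { 0, 0 };
+ *	(void) _aio_check_timeout(&zero, &end, &timedwait);
+ *	// timedwait == AIO_TIMEOUT_POLL: scan once, never sleep
+ *
+ *	struct timespec t = { 2, 500000000 };	// 2.5 seconds
+ *	(void) _aio_check_timeout(&t, &end, &timedwait);
+ *	// timedwait == AIO_TIMEOUT_WAIT: "end" now holds the absolute
+ *	// deadline (gettimeofday() + 2.5s) that _aio_get_timedelta()
+ *	// later uses to compute the remaining wait
+ */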
+
+#if defined(_LARGEFILE64_SOURCE) && !defined(_LP64)
+
+int
+__aio_read64(aiocb64_t *cb)
+{
+ aio_lio_t *head = NULL;
+
+ if (cb == NULL || cb->aio_offset < 0 || cb->aio_reqprio < 0) {
+ errno = EINVAL;
+ return (-1);
+ }
+
+ cb->aio_lio_opcode = LIO_READ;
+ return (_aio_rw64(cb, head, &__nextworker_rd, AIOAREAD64,
+ (AIO_KAIO | AIO_NO_DUPS), NULL));
+}
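+
+/*
+ * Editor's sketch, not part of the original source: __aio_read64() is
+ * presumably reached through the POSIX aio_read64() wrapper.  A caller
+ * would fill in an aiocb64_t roughly as follows (fd and buf are
+ * hypothetical):
+ *
+ *	aiocb64_t cb;
+ *	(void) memset(&cb, 0, sizeof (cb));
+ *	cb.aio_fildes = fd;
+ *	cb.aio_buf = buf;
+ *	cb.aio_nbytes = sizeof (buf);
+ *	cb.aio_offset = (off64_t)0;
+ *	cb.aio_sigevent.sigev_notify = SIGEV_NONE;
+ *	if (__aio_read64(&cb) != 0)
+ *		perror("aio_read64");
+ *
+ * Note that __aio_read64() forces aio_lio_opcode to LIO_READ, so the
+ * caller need not set it.
+ */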
+
+int
+__aio_write64(aiocb64_t *cb)
+{
+ aio_lio_t *head = NULL;
+
+ if (cb == NULL || cb->aio_offset < 0 || cb->aio_reqprio < 0) {
+ errno = EINVAL;
+ return (-1);
+ }
+ cb->aio_lio_opcode = LIO_WRITE;
+ return (_aio_rw64(cb, head, &__nextworker_wr, AIOAWRITE64,
+ (AIO_KAIO | AIO_NO_DUPS), NULL));
+}
+
+int
+__lio_listio64(int mode, aiocb64_t * const list[],
+ int nent, struct sigevent *sig)
+{
+ int i, err;
+ int aio_ufs = 0;
+ int oerrno = 0;
+ aio_lio_t *head = NULL;
+ int state = 0;
+ static long aio_list_max = 0;
+ aio_worker_t **nextworker;
+ int EIOflg = 0;
+ int rw;
+ int do_kaio = 0;
+
+ if (!_kaio_ok)
+ _kaio_init();
+
+ if (aio_list_max == 0)
+ aio_list_max = sysconf(_SC_AIO_LISTIO_MAX);
+
+ if (nent < 0 || nent > aio_list_max) {
+ errno = EINVAL;
+ return (-1);
+ }
+
+ switch (mode) {
+ case LIO_WAIT:
+ state = NOCHECK;
+ break;
+ case LIO_NOWAIT:
+ state = CHECK;
+ break;
+ default:
+ errno = EINVAL;
+ return (-1);
+ }
+
+ for (i = 0; i < nent; i++) {
+ if (list[i]) {
+ if (list[i]->aio_lio_opcode != LIO_NOP) {
+ list[i]->aio_state = state;
+ if (KAIO_SUPPORTED(list[i]->aio_fildes))
+ do_kaio++;
+ else
+ list[i]->aio_resultp.aio_errno =
+ ENOTSUP;
+ } else
+ list[i]->aio_state = NOCHECK;
+ }
+ }
+
+ if (do_kaio) {
+ if ((err = (int)_kaio(AIOLIO64, mode, list, nent, sig)) == 0)
+ return (0);
+ oerrno = errno;
+ } else {
+ oerrno = errno = ENOTSUP;
+ err = -1;
+ }
+ if ((err == -1) && (errno == ENOTSUP)) {
+ err = errno = 0;
+ /*
+ * If LIO_WAIT, or signal required, allocate a list head.
+ */
+ if ((mode == LIO_WAIT) ||
+ ((sig) && (sig->sigev_notify == SIGEV_SIGNAL)))
+ (void) _aio_lio_alloc(&head);
+ if (head) {
+ (void) mutex_lock(&head->lio_mutex);
+ head->lio_mode = mode;
+ if ((mode == LIO_NOWAIT) && (sig) &&
+ (sig->sigev_notify != SIGEV_NONE) &&
+ (sig->sigev_signo > 0)) {
+ head->lio_signo = sig->sigev_signo;
+ head->lio_sigval.sival_ptr =
+ sig->sigev_value.sival_ptr;
+ } else
+ head->lio_signo = 0;
+ head->lio_nent = head->lio_refcnt = nent;
+ (void) mutex_unlock(&head->lio_mutex);
+ }
+ /*
+ * find the UFS requests (errno == ENOTSUP/EBADFD)
+ */
+ for (i = 0; i < nent; i++) {
+ if (list[i] &&
+ ((list[i]->aio_resultp.aio_errno == ENOTSUP) ||
+ (list[i]->aio_resultp.aio_errno == EBADFD))) {
+ if (list[i]->aio_lio_opcode == LIO_NOP) {
+ if (head)
+ _lio_list_decr(head);
+ continue;
+ }
+ if (list[i]->aio_resultp.aio_errno == EBADFD)
+ SET_KAIO_NOT_SUPPORTED(
+ list[i]->aio_fildes);
+ if (list[i]->aio_reqprio < 0) {
+ list[i]->aio_resultp.aio_errno =
+ EINVAL;
+ list[i]->aio_resultp.aio_return = -1;
+ EIOflg = 1;
+ if (head)
+ _lio_list_decr(head);
+ continue;
+ }
+ /*
+ * submit an AIO request with flags AIO_NO_KAIO
+ * to avoid the kaio() syscall in _aio_rw()
+ */
+ switch (list[i]->aio_lio_opcode) {
+ case LIO_READ:
+ rw = AIOAREAD64;
+ nextworker = &__nextworker_rd;
+ break;
+ case LIO_WRITE:
+ rw = AIOAWRITE64;
+ nextworker = &__nextworker_wr;
+ break;
+ }
+ if (sig && (sig->sigev_notify == SIGEV_PORT))
+ err = _aio_rw64(list[i], head,
+ nextworker, rw,
+ (AIO_NO_KAIO | AIO_NO_DUPS), sig);
+ else
+ err = _aio_rw64(list[i], head,
+ nextworker, rw,
+ (AIO_NO_KAIO | AIO_NO_DUPS), NULL);
+ if (err != 0) {
+ if (head)
+ _lio_list_decr(head);
+ list[i]->aio_resultp.aio_errno = err;
+ EIOflg = 1;
+ } else
+ aio_ufs++;
+
+ } else {
+ if (head)
+ _lio_list_decr(head);
+ continue;
+ }
+ }
+ }
+ if (EIOflg) {
+ errno = EIO;
+ return (-1);
+ }
+ if ((mode == LIO_WAIT) && (oerrno == ENOTSUP)) {
+ /*
+ * call kaio(AIOLIOWAIT) to get all outstanding
+ * kernel AIO requests
+ */
+ if ((nent - aio_ufs) > 0) {
+ (void) _kaio(AIOLIOWAIT, mode, list, nent, sig);
+ }
+ if (head && head->lio_nent > 0) {
+ (void) mutex_lock(&head->lio_mutex);
+ while (head->lio_refcnt > 0) {
+ errno = cond_wait(&head->lio_cond_cv,
+ &head->lio_mutex);
+ if (errno) {
+ (void) mutex_unlock(&head->lio_mutex);
+ return (-1);
+ }
+ }
+ (void) mutex_unlock(&head->lio_mutex);
+ for (i = 0; i < nent; i++) {
+ if (list[i] &&
+ list[i]->aio_resultp.aio_errno) {
+ errno = EIO;
+ return (-1);
+ }
+ }
+ }
+ return (0);
+ }
+ return (err);
+}
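+
+/*
+ * Editor's sketch, not part of the original source: a minimal LIO_WAIT
+ * submission through __lio_listio64().  Descriptors and buffers are
+ * hypothetical; error handling is abbreviated.
+ *
+ *	aiocb64_t rd, wr;
+ *	aiocb64_t *list[2] = { &rd, &wr };
+ *
+ *	(void) memset(&rd, 0, sizeof (rd));
+ *	rd.aio_fildes = in_fd;
+ *	rd.aio_buf = inbuf;
+ *	rd.aio_nbytes = sizeof (inbuf);
+ *	rd.aio_lio_opcode = LIO_READ;
+ *	wr = rd;
+ *	wr.aio_fildes = out_fd;
+ *	wr.aio_buf = outbuf;
+ *	wr.aio_nbytes = sizeof (outbuf);
+ *	wr.aio_lio_opcode = LIO_WRITE;
+ *
+ *	if (__lio_listio64(LIO_WAIT, list, 2, NULL) != 0) {
+ *		// on EIO, inspect list[i]->aio_resultp.aio_errno
+ *		// for each element to find the failed requests
+ *	}
+ */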
+
+int
+__aio_error64(aiocb64_t *cb)
+{
+ aio_req_t *reqp;
+ int aio_errno = cb->aio_resultp.aio_errno;
+
+ if (aio_errno == EINPROGRESS) {
+ if (cb->aio_state == CHECK) {
+ /*
+ * Always do the kaio() call without using the
+ * KAIO_SUPPORTED() checks, because it is not
+ * mandatory to have a valid fd set in the aiocb;
+ * only the resultp must be set.
+ */
+ if ((_kaio(AIOERROR64, cb)) == EINVAL) {
+ errno = EINVAL;
+ return (-1);
+ }
+ } else if (cb->aio_state == CHECKED)
+ cb->aio_state = CHECK;
+ return (aio_errno);
+ }
+
+ if (cb->aio_state == USERAIO) {
+ _aio_lock();
+ if ((reqp = _aio_hash_find(&cb->aio_resultp)) != NULL) {
+ cb->aio_state = NOCHECK;
+ _lio_remove(reqp->lio_head);
+ (void) _aio_hash_del(reqp->req_resultp);
+ (void) _aio_req_remove(reqp);
+ _aio_req_free(reqp);
+ }
+ _aio_unlock();
+ }
+ return (aio_errno);
+}
+
+ssize_t
+__aio_return64(aiocb64_t *cb)
+{
+ aio_req_t *reqp;
+ int ret;
+
+ /*
+ * graceful detection of an invalid cb is not possible; a
+ * SIGSEGV will be generated if it is invalid.
+ */
+ if (cb == NULL) {
+ errno = EINVAL;
+ exit(-1);
+ }
+ /*
+ * we use this condition to indicate that
+ * aio_return has been called before
+ */
+ if (cb->aio_resultp.aio_return == -1 &&
+ cb->aio_resultp.aio_errno == EINVAL) {
+ errno = EINVAL;
+ return (-1);
+ }
+
+ /*
+ * Before returning, mark the result as already returned so
+ * that later calls to aio_return() will report that the
+ * result has been consumed.
+ */
+ ret = cb->aio_resultp.aio_return;
+ cb->aio_resultp.aio_return = -1;
+ cb->aio_resultp.aio_errno = EINVAL;
+ if (cb->aio_state == USERAIO) {
+ _aio_lock();
+ if ((reqp = _aio_hash_find(&cb->aio_resultp)) != NULL) {
+ cb->aio_state = NOCHECK;
+ _lio_remove(reqp->lio_head);
+ (void) _aio_hash_del(reqp->req_resultp);
+ (void) _aio_req_remove(reqp);
+ _aio_req_free(reqp);
+ }
+ _aio_unlock();
+ }
+ return (ret);
+}
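+
+/*
+ * Editor's note with sketch, not part of the original source: because
+ * __aio_return64() poisons the result (aio_return = -1, aio_errno =
+ * EINVAL) before returning, the completion value can be fetched only
+ * once per request.  The intended polling pattern is:
+ *
+ *	while (__aio_error64(&cb) == EINPROGRESS)
+ *		;	// or block in aio_suspend()/aio_waitn() instead
+ *	ssize_t n = __aio_return64(&cb);	// real byte count
+ *	// any later __aio_return64(&cb) fails with EINVAL
+ */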
+
+static int
+__aio_fsync_bar64(aiocb64_t *cb, aio_lio_t *head, aio_worker_t *aiowp,
+ int workerscnt)
+{
+ int i;
+ int err;
+ aio_worker_t *next = aiowp;
+
+ for (i = 0; i < workerscnt; i++) {
+ err = _aio_rw64(cb, head, &next, AIOFSYNC, AIO_NO_KAIO, NULL);
+ if (err != 0) {
+ (void) mutex_lock(&head->lio_mutex);
+ head->lio_mode = LIO_DESTROY; /* ignore fsync */
+ head->lio_nent -= workerscnt - i;
+ head->lio_refcnt -= workerscnt - i;
+ (void) mutex_unlock(&head->lio_mutex);
+ errno = EAGAIN;
+ return (i);
+ }
+ next = next->work_forw;
+ }
+ return (i);
+}
+
+int
+__aio_fsync64(int op, aiocb64_t *cb)
+{
+ struct stat buf;
+ aio_lio_t *head;
+ int retval;
+
+ if (cb == NULL) {
+ return (0);
+ }
+
+ if ((op != O_DSYNC) && (op != O_SYNC)) {
+ errno = EINVAL;
+ return (-1);
+ }
+
+ if (fstat(cb->aio_fildes, &buf) < 0)
+ return (-1);
+
+ if ((buf.st_mode & S_IWRITE) == 0) {
+ errno = EBADF;
+ return (-1);
+ }
+
+ /*
+ * The first asynchronous I/O request in the current process
+ * creates the worker threads.
+ * If the total number of workers (read + write) is zero, then
+ * no asynchronous I/O requests are pending.
+ * In that case just execute the standard fsync(3C) or
+ * fdatasync(3RT), as selected by the flag passed to __fdsync().
+ */
+ if ((__wr_workerscnt + __rd_workerscnt) == 0) {
+ if (op == O_DSYNC)
+ return (__fdsync(cb->aio_fildes, FDSYNC));
+ else
+ return (__fdsync(cb->aio_fildes, FSYNC));
+ }
+
+ /*
+ * re-use aio_offset as the op field.
+ * O_DSYNC - fdatasync()
+ * O_SYNC - fsync()
+ */
+ cb->aio_offset = op;
+ cb->aio_lio_opcode = AIOFSYNC;
+
+ /*
+ * create a list of fsync requests. the worker
+ * that gets the last request will perform the
+ * actual fsync.
+ */
+ (void) _aio_lio_alloc(&head);
+ if (head == NULL) {
+ errno = EAGAIN;
+ return (-1);
+ }
+
+ head->lio_mode = LIO_FSYNC;
+ head->lio_signo = 0;
+ head->lio_nent = head->lio_refcnt = __wr_workerscnt + __rd_workerscnt;
+ /* insert an fsync request on every read worker's queue. */
+ retval = __aio_fsync_bar64(cb, head, __workers_rd, __rd_workerscnt);
+ if (retval != __rd_workerscnt) {
+ /*
+ * Fewer fsync requests than workers means that it
+ * was not possible to submit fsync requests to all
+ * workers.
+ * Actions:
+ * a) number of fsync requests submitted is 0:
+ *    => free the allocated memory (aio_lio_t).
+ * b) number of fsync requests submitted is > 0:
+ *    => the last worker executing the fsync request
+ *       will free the aio_lio_t struct.
+ */
+ if (retval == 0)
+ _aio_lio_free(head);
+ return (-1);
+ }
+
+ /* insert an fsync request on every write worker's queue. */
+ retval = __aio_fsync_bar64(cb, head, __workers_wr, __wr_workerscnt);
+ if (retval != __wr_workerscnt)
+ return (-1);
+ return (0);
+}
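+
+/*
+ * Editor's note with sketch, not part of the original source: the
+ * barrier effect comes from queueing one AIOFSYNC request on every
+ * worker, so the sync cannot run ahead of requests that were already
+ * queued.  Usage is simply (fd is hypothetical):
+ *
+ *	aiocb64_t cb;
+ *	(void) memset(&cb, 0, sizeof (cb));
+ *	cb.aio_fildes = fd;
+ *	if (__aio_fsync64(O_DSYNC, &cb) != 0)
+ *		perror("aio_fsync64");
+ */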
+
+int
+__aio_cancel64(int fd, aiocb64_t *cb)
+{
+ aio_req_t *rp;
+ aio_worker_t *aiowp;
+ int done = 0;
+ int canceled = 0;
+ struct stat buf;
+
+ if (fstat(fd, &buf) < 0)
+ return (-1);
+
+ if (cb != NULL) {
+ if (cb->aio_state == USERAIO) {
+ _aio_lock();
+ rp = _aio_hash_find(&cb->aio_resultp);
+ if (rp == NULL) {
+ _aio_unlock();
+ return (AIO_ALLDONE);
+ } else {
+ aiowp = rp->req_worker;
+ (void) mutex_lock(&aiowp->work_qlock1);
+ (void) _aio_cancel_req(aiowp, rp, &canceled,
+ &done);
+ (void) mutex_unlock(&aiowp->work_qlock1);
+ _aio_unlock();
+ if (done)
+ return (AIO_ALLDONE);
+ else if (canceled)
+ return (AIO_CANCELED);
+ else
+ return (AIO_NOTCANCELED);
+ }
+ }
+ return ((int)_kaio(AIOCANCEL, fd, cb));
+ }
+
+ return (aiocancel_all(fd));
+}
+
+#endif /* (_LARGEFILE64_SOURCE) && !defined(_LP64) */
diff --git a/usr/src/lib/libaio/common/scalls.c b/usr/src/lib/libaio/common/scalls.c
new file mode 100644
index 0000000000..f874a2b7cc
--- /dev/null
+++ b/usr/src/lib/libaio/common/scalls.c
@@ -0,0 +1,74 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include "libaio.h"
+
+extern int __uaio_ok;
+extern void _cancelon(void);
+extern void _canceloff(void);
+
+#pragma weak close = _libaio_close
+int
+_libaio_close(int fd)
+{
+ int rc;
+
+ if (__uaio_ok)
+ (void) aiocancel_all(fd);
+
+ _cancelon();
+ rc = _close(fd);
+ _canceloff();
+
+ /*
+ * If the file is successfully closed, clear the
+ * bit for this file, as the next open may re-use this
+ * file descriptor and the new file may have
+ * different kaio() behaviour.
+ */
+ if (rc == 0)
+ CLEAR_KAIO_SUPPORTED(fd);
+
+ return (rc);
+}
+
+#pragma weak fork = _libaio_fork
+pid_t
+_libaio_fork(void)
+{
+ pid_t pid;
+
+ if (__uaio_ok || _kaio_ok) {
+ pid = fork1();
+ if (pid == 0)
+ _aio_forkinit();
+ return (pid);
+ }
+ return (_fork());
+}
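+
+/*
+ * Editor's sketch, not part of the original source: because of the
+ * "#pragma weak fork" interposition above, an application that has
+ * active asynchronous I/O and calls fork() actually runs
+ * _libaio_fork():
+ *
+ *	pid_t pid = fork();	// resolves to _libaio_fork()
+ *	if (pid == 0) {
+ *		// child: libaio state was reset by _aio_forkinit();
+ *		// the parent's outstanding requests are not inherited
+ *	}
+ */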
diff --git a/usr/src/lib/libaio/common/sig.c b/usr/src/lib/libaio/common/sig.c
new file mode 100644
index 0000000000..3f26b21f11
--- /dev/null
+++ b/usr/src/lib/libaio/common/sig.c
@@ -0,0 +1,301 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include "libaio.h"
+#include <dlfcn.h>
+
+mutex_t __sigio_pendinglock = DEFAULTMUTEX; /* protects __sigio_pending */
+int __sigio_pending = 0; /* count of pending SIGIO signals */
+int _sigio_enabled = 0; /* set if SIGIO has a signal handler */
+static struct sigaction sigioact;
+sigset_t __sigiomask;
+struct sigaction sigcanact;
+
+typedef int (*sig_act_t)(int, const struct sigaction *, struct sigaction *);
+static sig_act_t next_sigaction;
+
+int
+_aio_create_worker(aio_req_t *rp, int mode)
+{
+ struct aio_worker *aiowp, **workers, **nextworker;
+ int *aio_workerscnt;
+ void *(*func)(void *);
+ sigset_t oset;
+ int error;
+
+ /*
+ * Put the new worker thread in the right queue.
+ */
+ switch (mode) {
+ case AIOWRITE:
+ workers = &__workers_wr;
+ nextworker = &__nextworker_wr;
+ aio_workerscnt = &__wr_workerscnt;
+ func = _aio_do_request;
+ break;
+ case AIOREAD:
+ workers = &__workers_rd;
+ nextworker = &__nextworker_rd;
+ aio_workerscnt = &__rd_workerscnt;
+ func = _aio_do_request;
+ break;
+ case AIOSIGEV:
+ workers = &__workers_si;
+ nextworker = &__nextworker_si;
+ func = _aio_send_sigev;
+ aio_workerscnt = &__si_workerscnt;
+ }
+
+ if ((aiowp = _aio_alloc_worker()) == NULL)
+ return (-1);
+
+ if (rp) {
+ rp->req_state = AIO_REQ_QUEUED;
+ rp->req_worker = aiowp;
+ aiowp->work_head1 = rp;
+ aiowp->work_tail1 = rp;
+ aiowp->work_next1 = rp;
+ aiowp->work_cnt1 = 1;
+ }
+
+ (void) _sigprocmask(SIG_SETMASK, &_worker_set, &oset);
+ error = thr_create(NULL, __aiostksz, func, aiowp,
+ THR_BOUND | THR_DAEMON | THR_SUSPENDED, &aiowp->work_tid);
+ (void) _sigprocmask(SIG_SETMASK, &oset, NULL);
+ if (error) {
+ if (rp) {
+ rp->req_state = AIO_REQ_FREE;
+ rp->req_worker = NULL;
+ }
+ _aio_free_worker(aiowp);
+ return (-1);
+ }
+
+ (void) mutex_lock(&__aio_mutex);
+ (*aio_workerscnt)++;
+ if (*workers == NULL) {
+ aiowp->work_forw = aiowp;
+ aiowp->work_backw = aiowp;
+ *nextworker = aiowp;
+ *workers = aiowp;
+ } else {
+ aiowp->work_backw = (*workers)->work_backw;
+ aiowp->work_forw = (*workers);
+ (*workers)->work_backw->work_forw = aiowp;
+ (*workers)->work_backw = aiowp;
+ }
+ _aio_worker_cnt++;
+ (void) mutex_unlock(&__aio_mutex);
+
+ (void) thr_continue(aiowp->work_tid);
+
+ return (0);
+}
+
+void
+_aio_cancel_on(struct aio_worker *aiowp)
+{
+ aiowp->work_cancel_flg = 1;
+}
+
+void
+_aio_cancel_off(struct aio_worker *aiowp)
+{
+ aiowp->work_cancel_flg = 0;
+}
+
+/*
+ * resend a SIGIO signal that was sent while the
+ * __aio_mutex was locked.
+ *
+ * This function is called from _aio_unlock() when a SIGIO was
+ * previously detected and deferred (signal caught).
+ * Several threads may be calling _aio_lock() - _aio_unlock()
+ * concurrently, so __aiosendsig() must make sure that kill()
+ * is called only once here.
+ */
+void
+__aiosendsig(void)
+{
+ sigset_t oset;
+ int send_sigio;
+
+ (void) _sigprocmask(SIG_BLOCK, &__sigiomask, &oset);
+
+ (void) mutex_lock(&__sigio_pendinglock);
+ send_sigio = __sigio_pending;
+ __sigio_pending = 0;
+ (void) mutex_unlock(&__sigio_pendinglock);
+
+ (void) _sigprocmask(SIG_SETMASK, &oset, NULL);
+
+ if (__pid == (pid_t)-1)
+ __pid = getpid();
+ if (send_sigio)
+ (void) kill(__pid, SIGIO);
+}
+
+/*
+ * this is the low-level handler for SIGIO. the application
+ * handler will not be called if the signal is being blocked.
+ */
+static void
+aiosigiohndlr(int sig, siginfo_t *sip, void *uap)
+{
+ struct sigaction tact;
+ int blocked;
+
+ /*
+ * The SIGIO signal is being blocked if either __sigio_masked
+ * or __sigio_maskedcnt is set, or if both these variables
+ * are clear and __aio_mutex is locked. the last
+ * condition can only happen while __aio_mutex is being
+ * unlocked: there is a very small window where the mask
+ * is clear but the lock is still held, and in that case
+ * the signal must be deferred.
+ * mutex_trylock() is used here to check the ownership
+ * of the lock (instead of MUTEX_HELD). This is necessary because
+ * there is a window where the owner of the lock is deleted
+ * and the thread could become preempted. In that case MUTEX_HELD()
+ * would not detect that the lock is still held.
+ */
+ if ((blocked = (__sigio_masked | __sigio_maskedcnt)) == 0) {
+ if (mutex_trylock(&__aio_mutex) == 0)
+ (void) mutex_unlock(&__aio_mutex);
+ else
+ blocked = 1;
+ }
+
+ if (blocked) {
+ /*
+ * _aio_lock() is not re-entrant with
+ * respect to SIGIO signals. if a SIGIO signal
+ * interrupts a region of code locked by __aio_mutex,
+ * the SIGIO signal should be deferred until this
+ * mutex is unlocked. a flag, __sigio_pending, is set
+ * to indicate that a SIGIO signal is pending and
+ * should be resent to the process via kill().
+ * The libaio handler must be reinstalled here; otherwise
+ * the disposition would revert to the default and the
+ * next SIGIO signal would terminate the process.
+ */
+ (void) mutex_lock(&__sigio_pendinglock);
+ __sigio_pending = 1;
+ (void) mutex_unlock(&__sigio_pendinglock);
+ tact = sigioact;
+ tact.sa_sigaction = aiosigiohndlr;
+ (void) sigaddset(&tact.sa_mask, SIGIO);
+ (void) (*next_sigaction)(SIGIO, &tact, NULL);
+ } else {
+ /*
+ * call the real handler.
+ */
+ (sigioact.sa_sigaction)(sig, sip, uap);
+ }
+}
+
+void
+aiosigcancelhndlr(int sig, siginfo_t *sip, void *uap)
+{
+ struct aio_worker *aiowp;
+ struct sigaction act;
+
+ if (sip != NULL && sip->si_code == SI_LWP) {
+ if (thr_getspecific(_aio_key, (void **)&aiowp) != 0)
+ _aiopanic("aiosigcancelhndlr, thr_getspecific()\n");
+ ASSERT(aiowp != NULL);
+ if (aiowp->work_cancel_flg)
+ siglongjmp(aiowp->work_jmp_buf, 1);
+ } else if (sigcanact.sa_handler == SIG_DFL) {
+ act.sa_handler = SIG_DFL;
+ (void) (*next_sigaction)(SIGAIOCANCEL, &act, NULL);
+ (void) kill(getpid(), sig);
+ } else if (sigcanact.sa_handler != SIG_IGN) {
+ (sigcanact.sa_sigaction)(sig, sip, uap);
+ }
+}
+
+#pragma weak sigaction = _sigaction
+int
+_sigaction(int sig, const struct sigaction *nact, struct sigaction *oact)
+{
+ struct sigaction tact;
+ struct sigaction oldact;
+
+ if (next_sigaction == NULL)
+ next_sigaction = (sig_act_t)dlsym(RTLD_NEXT, "_sigaction");
+
+ /*
+ * Only interpose on SIGIO when it is given a disposition other
+ * than SIG_IGN or SIG_DFL. Because SIGAIOCANCEL is SIGPROF,
+ * that signal should always be interposed on, so that SIGPROF
+ * can also be used by the application for profiling.
+ */
+ if (sig == SIGIO || sig == SIGAIOCANCEL) {
+ if (oact) {
+ if (sig == SIGIO)
+ *oact = sigioact;
+ else
+ *oact = sigcanact;
+ }
+ if (nact == NULL)
+ return (0);
+
+ tact = *nact;
+ if (sig == SIGIO) {
+ oldact = sigioact;
+ sigioact = tact;
+ if (tact.sa_handler == SIG_DFL ||
+ tact.sa_handler == SIG_IGN) {
+ _sigio_enabled = 0;
+ } else {
+ _sigio_enabled = 1;
+ tact.sa_sigaction = aiosigiohndlr;
+ }
+ tact.sa_flags &= ~SA_NODEFER;
+ if ((*next_sigaction)(sig, &tact, NULL) == -1) {
+ sigioact = oldact;
+ return (-1);
+ }
+ } else {
+ oldact = sigcanact;
+ sigcanact = tact;
+ tact.sa_sigaction = aiosigcancelhndlr;
+ tact.sa_flags &= ~SA_NODEFER;
+ tact.sa_flags |= SA_SIGINFO;
+ if ((*next_sigaction)(sig, &tact, NULL) == -1) {
+ sigcanact = oldact;
+ return (-1);
+ }
+ }
+ return (0);
+ }
+
+ return ((*next_sigaction)(sig, nact, oact));
+}
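+
+/*
+ * Editor's sketch, not part of the original source: what the
+ * interposition means for an application.  Installing a SIGIO handler
+ * goes through the _sigaction() above, so the kernel sees
+ * aiosigiohndlr() and the user handler runs only when libaio is not
+ * holding __aio_mutex:
+ *
+ *	static void
+ *	my_sigio(int sig, siginfo_t *sip, void *uap)
+ *	{
+ *		// application handler, called via sigioact.sa_sigaction
+ *	}
+ *
+ *	struct sigaction sa;
+ *	sa.sa_sigaction = my_sigio;
+ *	sa.sa_flags = SA_SIGINFO;
+ *	(void) sigemptyset(&sa.sa_mask);
+ *	(void) sigaction(SIGIO, &sa, NULL);	// stored in sigioact;
+ *						// kernel gets aiosigiohndlr
+ */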
diff --git a/usr/src/lib/libaio/common/subr.c b/usr/src/lib/libaio/common/subr.c
new file mode 100644
index 0000000000..e3661f7cd2
--- /dev/null
+++ b/usr/src/lib/libaio/common/subr.c
@@ -0,0 +1,61 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include "libaio.h"
+
+static void
+_halt(void)
+{
+ (void) pause();
+}
+
+int _halted = 0;
+
+void
+_aiopanic(char *s)
+{
+ char buf[256];
+
+ _halted = 1;
+ (void) snprintf(buf, sizeof (buf),
+ "AIO PANIC (thread = %d): %s\n", thr_self(), s);
+ (void) write(2, buf, strlen(buf));
+ _halt();
+}
+
+int
+assfail(char *a, char *f, int l)
+{
+ char buf[256];
+
+ (void) snprintf(buf, sizeof (buf),
+ "assertion failed: %s, file: %s, line:%d", a, f, l);
+ _aiopanic(buf);
+ /*NOTREACHED*/
+ return (0);
+}
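+
+/*
+ * Editor's note, an assumption rather than original source: assfail()
+ * backs the ASSERT() macro used elsewhere in libaio (see
+ * aiosigcancelhndlr() in sig.c); it was presumably defined in libaio.h
+ * along the usual Solaris lines:
+ *
+ *	#if defined(DEBUG)
+ *	#define	ASSERT(EX)	((void)((EX) || \
+ *				    assfail(#EX, __FILE__, __LINE__)))
+ *	#else
+ *	#define	ASSERT(EX)
+ *	#endif
+ */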
diff --git a/usr/src/lib/libaio/i386/Makefile b/usr/src/lib/libaio/i386/Makefile
new file mode 100644
index 0000000000..af76f5ab90
--- /dev/null
+++ b/usr/src/lib/libaio/i386/Makefile
@@ -0,0 +1,31 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License"). You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# ident "%Z%%M% %I% %E% SMI"
+#
+
+include ../Makefile.com
+
+install: all $(ROOTLIBS) $(ROOTLINKS) $(ROOTLINT)
diff --git a/usr/src/lib/libaio/sparc/Makefile b/usr/src/lib/libaio/sparc/Makefile
new file mode 100644
index 0000000000..af76f5ab90
--- /dev/null
+++ b/usr/src/lib/libaio/sparc/Makefile
@@ -0,0 +1,31 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License"). You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# ident "%Z%%M% %I% %E% SMI"
+#
+
+include ../Makefile.com
+
+install: all $(ROOTLIBS) $(ROOTLINKS) $(ROOTLINT)
diff --git a/usr/src/lib/libaio/sparcv9/Makefile b/usr/src/lib/libaio/sparcv9/Makefile
new file mode 100644
index 0000000000..cb39a2beff
--- /dev/null
+++ b/usr/src/lib/libaio/sparcv9/Makefile
@@ -0,0 +1,32 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License"). You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# ident "%Z%%M% %I% %E% SMI"
+#
+
+include ../Makefile.com
+include ../../Makefile.lib.64
+
+install: all $(ROOTLIBS64) $(ROOTLINKS64)
diff --git a/usr/src/lib/libaio/spec/Makefile b/usr/src/lib/libaio/spec/Makefile
new file mode 100644
index 0000000000..1ab4a810d6
--- /dev/null
+++ b/usr/src/lib/libaio/spec/Makefile
@@ -0,0 +1,30 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License"). You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+#ident "%Z%%M% %I% %E% SMI"
+#
+# Copyright (c) 1998-1999 by Sun Microsystems, Inc.
+# All rights reserved.
+#
+# lib/libaio/spec/Makefile
+
+include $(SRC)/lib/Makefile.spec.arch
diff --git a/usr/src/lib/libaio/spec/Makefile.targ b/usr/src/lib/libaio/spec/Makefile.targ
new file mode 100644
index 0000000000..5fd6ef49cf
--- /dev/null
+++ b/usr/src/lib/libaio/spec/Makefile.targ
@@ -0,0 +1,37 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License"). You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+#ident "%Z%%M% %I% %E% SMI"
+#
+# Copyright (c) 1998-1999 by Sun Microsystems, Inc.
+# All rights reserved.
+#
+# lib/libaio/spec/Makefile.targ
+
+
+.KEEP_STATE:
+
+LIBRARY = libaio.a
+VERS = .1
+
+OBJECTS = aio.o
+
diff --git a/usr/src/lib/libaio/spec/aio.spec b/usr/src/lib/libaio/spec/aio.spec
new file mode 100644
index 0000000000..99f0401f93
--- /dev/null
+++ b/usr/src/lib/libaio/spec/aio.spec
@@ -0,0 +1,208 @@
+#
+# Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License"). You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# ident "%Z%%M% %I% %E% SMI"
+#
+
+function aiocancel
+include <sys/asynch.h>, <aio.h>
+declaration int aiocancel(aio_result_t *resultp)
+version sparc=SISCD_2.3 sparcv9=SUNW_0.7 i386=SUNW_0.7 amd64=SUNW_0.7
+errno EACCES EFAULT EINVAL
+exception $return == -1
+end
+
+function aioread
+include <sys/types.h>, <sys/asynch.h>, <aio.h>
+declaration int aioread(int fildes, char *bufp, int bufs, \
+ off_t offset, int whence, aio_result_t *resultp)
+version sparc=SISCD_2.3 sparcv9=SUNW_0.7 i386=SUNW_0.7 amd64=SUNW_0.7
+errno EAGAIN EBADF EFAULT EINVAL ENOMEM
+exception $return == -1
+end
+
+function aioread64
+declaration int aioread64(int fd, caddr_t buf, int bufsz, off64_t offset, \
+ int whence, aio_result_t *resultp)
+arch i386 sparc
+version i386=SUNW_1.0 sparc=SUNW_1.0
+end
+
+function aiowait
+include <sys/asynch.h>, <aio.h>, <sys/time.h>
+declaration aio_result_t *aiowait(struct timeval *timeout)
+version sparc=SISCD_2.3 sparcv9=SUNW_0.7 i386=SUNW_0.7 amd64=SUNW_0.7
+errno EFAULT EINTR EINVAL
+exception $return == (aio_result_t *)-1
+end
+
+function aiowrite
+include <sys/types.h>, <sys/asynch.h>, <aio.h>
+declaration int aiowrite(int fildes, char *bufp, int bufs, \
+ off_t offset, int whence, aio_result_t *resultp)
+version sparc=SISCD_2.3 sparcv9=SUNW_0.7 i386=SUNW_0.7 amd64=SUNW_0.7
+errno EAGAIN EBADF EFAULT EINVAL ENOMEM
+exception $return == -1
+end
+
+function aiowrite64
+include <sys/types.h>, <sys/asynch.h>, <aio.h>
+declaration int aiowrite64(int fildes, char *bufp, int bufs, \
+ off64_t offset, int whence, aio_result_t *resultp)
+arch sparc i386
+version sparc=SUNW_1.0 i386=SUNW_1.0
+errno EAGAIN EBADF EFAULT EINVAL ENOMEM
+exception $return == -1
+end
+
+function assfail
+declaration int assfail(char *a, char *f, int l)
+version SUNW_1.1
+end
+
+function close
+include <unistd.h>
+declaration int close(int fildes)
+version SUNW_0.7
+errno EBADF EINTR ENOLINK EIO
+exception $return == -1
+binding nodirect
+end
+
+function fork
+declaration pid_t fork(void)
+version SUNW_0.7
+exception $return == -1
+binding nodirect
+end
+
+function sigaction extends libc/spec/sys.spec sigaction
+version SUNW_0.7
+binding nodirect
+end
+
+function _sigaction
+weak sigaction
+version SUNWprivate_1.1
+binding nodirect
+end
+
+function __lio_listio
+declaration int __lio_listio(int mode, aiocb_t * const list[], int nent, \
+ struct sigevent *sig)
+version SUNWprivate_1.1
+end
+
+function __aio_suspend
+declaration int __aio_suspend(void **list, int nent, \
+ const timespec_t *timo, int largefile)
+version SUNWprivate_1.1
+end
+
+function __aio_error
+declaration int __aio_error(aiocb_t *cb)
+version SUNWprivate_1.1
+end
+
+function __aio_return
+declaration ssize_t __aio_return(aiocb_t *cb)
+version SUNWprivate_1.1
+end
+
+function __aio_read
+declaration int __aio_read(aiocb_t *cb)
+version SUNWprivate_1.1
+end
+
+function __aio_write
+declaration int __aio_write(aiocb_t *cb)
+version SUNWprivate_1.1
+end
+
+function __aio_fsync
+declaration int __aio_fsync(int op, aiocb_t *aiocbp)
+version SUNWprivate_1.1
+end
+
+function __aio_cancel
+declaration int __aio_cancel(int fd, aiocb_t *aiocbp)
+version SUNWprivate_1.1
+end
+
+function __aio_waitn
+declaration int __aio_waitn(void **list, uint_t nent, uint_t *nwait, \
+ const struct timespec *timeout, int mode)
+version SUNWprivate_1.1
+end
+
+function __lio_listio64
+declaration int __lio_listio64(int mode, aiocb64_t * const list[], \
+ int nent, struct sigevent *sig)
+arch sparc i386
+version sparc=SUNWprivate_1.1 i386=SUNWprivate_1.1
+end
+
+function __aio_error64
+declaration int __aio_error64(aiocb64_t *cb)
+arch sparc i386
+version sparc=SUNWprivate_1.1 i386=SUNWprivate_1.1
+end
+
+function __aio_return64
+declaration ssize_t __aio_return64(aiocb64_t *cb)
+arch sparc i386
+version sparc=SUNWprivate_1.1 i386=SUNWprivate_1.1
+end
+
+function __aio_read64
+declaration int __aio_read64(aiocb64_t *cb)
+arch sparc i386
+version sparc=SUNWprivate_1.1 i386=SUNWprivate_1.1
+end
+
+function __aio_write64
+declaration int __aio_write64(aiocb64_t *cb)
+arch sparc i386
+version sparc=SUNWprivate_1.1 i386=SUNWprivate_1.1
+end
+
+function __aio_fsync64
+declaration int __aio_fsync64(int op, aiocb64_t *aiocbp)
+arch sparc i386
+version sparc=SUNWprivate_1.1 i386=SUNWprivate_1.1
+end
+
+function __aio_cancel64
+declaration int __aio_cancel64(int fd, aiocb64_t *aiocbp)
+arch sparc i386
+version sparc=SUNWprivate_1.1 i386=SUNWprivate_1.1
+end
+
+function _libaio_close
+version SUNWprivate_1.1
+end
+
+function _libaio_fork
+version SUNWprivate_1.1
+end
diff --git a/usr/src/lib/libaio/spec/amd64/Makefile b/usr/src/lib/libaio/spec/amd64/Makefile
new file mode 100644
index 0000000000..d334868181
--- /dev/null
+++ b/usr/src/lib/libaio/spec/amd64/Makefile
@@ -0,0 +1,44 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License"). You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# ident "%Z%%M% %I% %E% SMI"
+#
+
+.KEEP_STATE:
+
+include ../Makefile.targ
+
+# Add arch specific objects here
+OBJECTS +=
+
+include $(SRC)/lib/Makefile.lib
+include $(SRC)/lib/Makefile.lib.64
+
+# Uncomment the following if the linker complains
+#amd64_C_PICFLAGS = $(amd64_C_BIGPICFLAGS)
+
+include $(SRC)/lib/Makefile.spec
+
+install: $(ROOTABILIB64)
diff --git a/usr/src/lib/libaio/spec/i386/Makefile b/usr/src/lib/libaio/spec/i386/Makefile
new file mode 100644
index 0000000000..19268499ce
--- /dev/null
+++ b/usr/src/lib/libaio/spec/i386/Makefile
@@ -0,0 +1,44 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License"). You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+#ident "%Z%%M% %I% %E% SMI"
+#
+# Copyright (c) 1998-1999 by Sun Microsystems, Inc.
+# All rights reserved.
+#
+# lib/libaio/spec/i386/Makefile
+
+.KEEP_STATE:
+
+include ../Makefile.targ
+
+# Add arch specific objects here
+OBJECTS +=
+
+include $(SRC)/lib/Makefile.lib
+
+# Uncomment the following if the linker complains
+#i386_C_PICFLAGS = -K PIC
+
+include $(SRC)/lib/Makefile.spec
+
+install: $(ROOTABILIB)
diff --git a/usr/src/lib/libaio/spec/sparc/Makefile b/usr/src/lib/libaio/spec/sparc/Makefile
new file mode 100644
index 0000000000..9f95f97dc7
--- /dev/null
+++ b/usr/src/lib/libaio/spec/sparc/Makefile
@@ -0,0 +1,44 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License"). You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+#ident "%Z%%M% %I% %E% SMI"
+#
+# Copyright (c) 1998-1999 by Sun Microsystems, Inc.
+# All rights reserved.
+#
+# lib/libaio/spec/sparc/Makefile
+
+.KEEP_STATE:
+
+include ../Makefile.targ
+
+# Add arch specific objects here
+OBJECTS +=
+
+include $(SRC)/lib/Makefile.lib
+
+# Uncomment the following if the linker complains
+#sparc_C_PICFLAGS = -K PIC
+
+include $(SRC)/lib/Makefile.spec
+
+install: $(ROOTABILIB)
diff --git a/usr/src/lib/libaio/spec/sparcv9/Makefile b/usr/src/lib/libaio/spec/sparcv9/Makefile
new file mode 100644
index 0000000000..a90d93da31
--- /dev/null
+++ b/usr/src/lib/libaio/spec/sparcv9/Makefile
@@ -0,0 +1,45 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License"). You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+#ident "%Z%%M% %I% %E% SMI"
+#
+# Copyright (c) 1998-1999 by Sun Microsystems, Inc.
+# All rights reserved.
+#
+# lib/libaio/spec/sparcv9/Makefile
+
+.KEEP_STATE:
+
+include ../Makefile.targ
+
+# Add arch specific objects here
+OBJECTS +=
+
+include $(SRC)/lib/Makefile.lib
+include $(SRC)/lib/Makefile.lib.64
+
+# Uncomment the following if the linker complains
+#sparcv9_C_PICFLAGS = -K PIC
+
+include $(SRC)/lib/Makefile.spec
+
+install: $(ROOTABILIB64)
diff --git a/usr/src/lib/libaio/spec/versions b/usr/src/lib/libaio/spec/versions
new file mode 100644
index 0000000000..fd8cb78ec8
--- /dev/null
+++ b/usr/src/lib/libaio/spec/versions
@@ -0,0 +1,55 @@
+#
+# Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License"). You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# ident "%Z%%M% %I% %E% SMI"
+#
+
+i386 {
+ SUNW_1.1: {SUNW_1.0};
+ SUNW_1.0: {SUNW_0.7};
+ SUNW_0.7;
+ SUNWprivate_1.1;
+}
+
+sparc {
+ SUNW_1.1: {SUNW_1.0};
+ SUNW_1.0: {SUNW_0.7};
+ SUNW_0.7: {SISCD_2.3};
+ SISCD_2.3;
+ SUNWprivate_1.1;
+}
+
+sparcv9 {
+ SUNW_1.1: {SUNW_1.0};
+ SUNW_1.0: {SUNW_0.7};
+ SUNW_0.7;
+ SUNWprivate_1.1;
+}
+
+amd64 {
+ SUNW_1.1: {SUNW_1.0};
+ SUNW_1.0: {SUNW_0.7};
+ SUNW_0.7;
+ SUNWprivate_1.1;
+}