diff options
| author | jg <none@none> | 2006-09-25 08:38:36 -0700 |
|---|---|---|
| committer | jg <none@none> | 2006-09-25 08:38:36 -0700 |
| commit | 83c4dfe9546fd839e7a52bca7e9920da918f916e (patch) | |
| tree | b20d087d1ba86d49a6059c9bf59daeda3cc69db0 /usr/src/uts/common/os/devcache.c | |
| parent | f92daba9919b6e68875ccdc9a5532cadf37959f1 (diff) | |
| download | illumos-joyent-83c4dfe9546fd839e7a52bca7e9920da918f916e.tar.gz | |
6466789 decouple devices cache management implementation from consumers
--HG--
rename : usr/src/uts/common/os/devctl.c => usr/src/uts/common/os/devcache.c
rename : usr/src/uts/common/sys/devctl_impl.h => usr/src/uts/common/sys/devcache_impl.h
Diffstat (limited to 'usr/src/uts/common/os/devcache.c')
| -rw-r--r-- | usr/src/uts/common/os/devcache.c | 1146 |
1 files changed, 1146 insertions, 0 deletions
diff --git a/usr/src/uts/common/os/devcache.c b/usr/src/uts/common/os/devcache.c new file mode 100644 index 0000000000..14cde49faf --- /dev/null +++ b/usr/src/uts/common/os/devcache.c @@ -0,0 +1,1146 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/note.h> +#include <sys/t_lock.h> +#include <sys/cmn_err.h> +#include <sys/instance.h> +#include <sys/conf.h> +#include <sys/stat.h> +#include <sys/ddi.h> +#include <sys/hwconf.h> +#include <sys/sunddi.h> +#include <sys/sunndi.h> +#include <sys/ddi_impldefs.h> +#include <sys/ndi_impldefs.h> +#include <sys/modctl.h> +#include <sys/dacf.h> +#include <sys/promif.h> +#include <sys/cpuvar.h> +#include <sys/pathname.h> +#include <sys/kobj.h> +#include <sys/devcache.h> +#include <sys/devcache_impl.h> +#include <sys/sysmacros.h> +#include <sys/varargs.h> +#include <sys/callb.h> + +/* + * This facility provides interfaces to clients to register, + * read and update cache data in persisted backing store files, + * usually in /etc/devices. 
The data persisted through this + * mechanism should be stateless data, functioning in the sense + * of a cache. Writes are performed by a background daemon + * thread, permitting a client to schedule an update without + * blocking, then continue updating the data state in + * parallel. The data is only locked by the daemon thread + * to pack the data in preparation for the write. + * + * Data persisted through this mechanism should be capable + * of being regenerated through normal system operation, + * for example attaching all disk devices would cause all + * devids to be registered for those devices. By caching + * a devid-device tuple, the system can operate in a + * more optimal way, directly attaching the device mapped + * to a devid, rather than burdensomely driving attach of + * the entire device tree to discover a single device. + * + * Note that a client should only need to include + * <sys/devcache.h> for the supported interfaces. + * + * The data per client is entirely within the control of + * the client. When reading, data unpacked from the backing + * store should be inserted in the list. The pointer to + * the list can be retrieved via nvf_list(). When writing, + * the data on the list is to be packed and returned to the + * nvpdaemon as an nvlist. + * + * Obvious restrictions are imposed by the limits of the + * nvlist format. The data cannot be read or written + * piecemeal, and large amounts of data aren't recommended. + * However, nvlists do allow that data be named and typed + * and can be size-of-int invariant, and the cached data + * can be versioned conveniently. + * + * The registration involves two steps: a handle is + * allocated by calling the registration function. + * This sets up the data referenced by the handle and + * initializes the lock. Following registration, the + * client must initialize the data list. The list + * interfaces require that the list element with offset + * to the node link be provided. 
The format of the + * list element is under the control of the client. + * + * Locking: the address of the data list r/w lock provided + * can be accessed with nvf_lock(). The lock must be held + * as reader when traversing the list or checking state, + * such as nvf_is_dirty(). The lock must be held as + * writer when updating the list or marking it dirty. + * The lock must not be held when waking the daemon. + * + * The data r/w lock is held as writer when the pack, + * unpack and free list handlers are called. The + * lock should not be dropped and must be still held + * upon return. The client should also hold the lock + * as reader when checking if the list is dirty, and + * as writer when marking the list dirty or initiating + * a read. + * + * The asynchronous nature of updates allows for the + * possibility that the data may continue to be updated + * once the daemon has been notified that an update is + * desired. The data only needs to be locked against + * updates when packing the data into the form to be + * written. When the write of the packed data has + * completed, the daemon will automatically reschedule + * an update if the data was marked dirty after the + * point at which it was packed. Before beginning an + * update, the daemon attempts to lock the data as + * writer; if the writer lock is already held, it + * backs off and retries later. The model is to give + * priority to the kernel processes generating the + * data, and that the nature of the data is that + * it does not change often, can be re-generated when + * needed, so updates should not happen often and + * can be delayed until the data stops changing. + * The client may update the list or mark it dirty + * any time it is able to acquire the lock as + * writer first. + * + * A failed write will be retried after some delay, + * in the hope that the cause of the error will be + * transient, for example a filesystem with no space + * available. 
An update on a read-only filesystem + * is failed silently and not retried; this would be + * the case when booted off install media. + * + * There is no unregister mechanism as of yet, as it + * hasn't been needed so far. + */ + +/* + * Global list of files registered and updated by the nvpflush + * daemon, protected by the nvf_cache_mutex. While an + * update is taking place, a file is temporarily moved to + * the dirty list to avoid locking the primary list for + * the duration of the update. + */ +list_t nvf_cache_files; +list_t nvf_dirty_files; +kmutex_t nvf_cache_mutex; + + +/* + * Allow some delay from an update of the data before flushing + * to permit simultaneous updates of multiple changes. + * Changes in the data are expected to be bursty, ie + * reconfig or hot-plug of a new adapter. + * + * kfio_report_error (default 0) + * Set to 1 to enable some error messages related to low-level + * kernel file i/o operations. + * + * nvpflush_delay (default 10) + * The number of seconds after data is marked dirty before the + * flush daemon is triggered to flush the data. A longer period + * of time permits more data updates per write. Note that + * every update resets the timer so no repository write will + * occur while data is being updated continuously. + * + * nvpdaemon_idle_time (default 60) + * The number of seconds the daemon will sleep idle before exiting. 
+ * + */ +#define NVPFLUSH_DELAY 10 +#define NVPDAEMON_IDLE_TIME 60 + +#define TICKS_PER_SECOND (drv_usectohz(1000000)) + +/* + * Tunables + */ +int kfio_report_error = 0; /* kernel file i/o operations */ +int kfio_disable_read = 0; /* disable all reads */ +int kfio_disable_write = 0; /* disable all writes */ + +int nvpflush_delay = NVPFLUSH_DELAY; +int nvpdaemon_idle_time = NVPDAEMON_IDLE_TIME; + +static timeout_id_t nvpflush_id = 0; +static int nvpflush_timer_busy = 0; +static int nvpflush_daemon_active = 0; +static kthread_t *nvpflush_thr_id = 0; + +static int do_nvpflush = 0; +static int nvpbusy = 0; +static kmutex_t nvpflush_lock; +static kcondvar_t nvpflush_cv; +static kthread_id_t nvpflush_thread; +static clock_t nvpticks; + +static void nvpflush_daemon(void); + +#ifdef DEBUG +int nvpdaemon_debug = 0; +int kfio_debug = 0; +#endif /* DEBUG */ + +extern int modrootloaded; +extern void mdi_read_devices_files(void); +extern void mdi_clean_vhcache(void); + +/* + * Initialize the overall cache file management + */ +void +i_ddi_devices_init(void) +{ + list_create(&nvf_cache_files, sizeof (nvfd_t), + offsetof(nvfd_t, nvf_link)); + list_create(&nvf_dirty_files, sizeof (nvfd_t), + offsetof(nvfd_t, nvf_link)); + mutex_init(&nvf_cache_mutex, NULL, MUTEX_DEFAULT, NULL); + devid_cache_init(); +} + +/* + * Read cache files + * The files read here should be restricted to those + * that may be required to mount root. 
+ */ +void +i_ddi_read_devices_files(void) +{ + if (!kfio_disable_read) { + mdi_read_devices_files(); + devid_cache_read(); + } +} + +void +i_ddi_start_flush_daemon(void) +{ + nvfd_t *nvfdp; + + ASSERT(i_ddi_io_initialized()); + + mutex_init(&nvpflush_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&nvpflush_cv, NULL, CV_DRIVER, NULL); + + mutex_enter(&nvf_cache_mutex); + for (nvfdp = list_head(&nvf_cache_files); nvfdp; + nvfdp = list_next(&nvf_cache_files, nvfdp)) { + if (NVF_IS_DIRTY(nvfdp)) { + nvf_wake_daemon(); + break; + } + } + mutex_exit(&nvf_cache_mutex); +} + +void +i_ddi_clean_devices_files(void) +{ + devid_cache_cleanup(); + mdi_clean_vhcache(); +} + +/* + * Register a cache file to be managed and updated by the nvpflush daemon. + * All operations are performed through the returned handle. + * There is no unregister mechanism for now. + */ +nvf_handle_t +nvf_register_file(nvf_ops_t *ops) +{ + nvfd_t *nvfdp; + + nvfdp = kmem_zalloc(sizeof (*nvfdp), KM_SLEEP); + + nvfdp->nvf_ops = ops; + nvfdp->nvf_flags = 0; + rw_init(&nvfdp->nvf_lock, NULL, RW_DRIVER, NULL); + + mutex_enter(&nvf_cache_mutex); + list_insert_tail(&nvf_cache_files, nvfdp); + mutex_exit(&nvf_cache_mutex); + + return ((nvf_handle_t)nvfdp); +} + +/*PRINTFLIKE1*/ +void +nvf_error(const char *fmt, ...) +{ + va_list ap; + + if (kfio_report_error) { + va_start(ap, fmt); + vcmn_err(CE_NOTE, fmt, ap); + va_end(ap); + } +} + +/* + * Some operations clients may use to manage the data + * to be persisted in a cache file. 
+ */ +char * +nvf_cache_name(nvf_handle_t handle) +{ + return (((nvfd_t *)handle)->nvf_cache_path); +} + +krwlock_t * +nvf_lock(nvf_handle_t handle) +{ + return (&(((nvfd_t *)handle)->nvf_lock)); +} + +list_t * +nvf_list(nvf_handle_t handle) +{ + return (&(((nvfd_t *)handle)->nvf_data_list)); +} + +void +nvf_mark_dirty(nvf_handle_t handle) +{ + ASSERT(RW_WRITE_HELD(&(((nvfd_t *)handle)->nvf_lock))); + NVF_MARK_DIRTY((nvfd_t *)handle); +} + +int +nvf_is_dirty(nvf_handle_t handle) +{ + ASSERT(RW_LOCK_HELD(&(((nvfd_t *)handle)->nvf_lock))); + return (NVF_IS_DIRTY((nvfd_t *)handle)); +} + +static uint16_t +nvp_cksum(uchar_t *buf, int64_t buflen) +{ + uint16_t cksum = 0; + uint16_t *p = (uint16_t *)buf; + int64_t n; + + if ((buflen & 0x01) != 0) { + buflen--; + cksum = buf[buflen]; + } + n = buflen / 2; + while (n-- > 0) + cksum ^= *p++; + return (cksum); +} + +int +fread_nvlist(char *filename, nvlist_t **ret_nvlist) +{ + struct _buf *file; + nvpf_hdr_t hdr; + char *buf; + nvlist_t *nvl; + int rval; + uint_t offset; + int n; + char c; + uint16_t cksum, hdrsum; + + *ret_nvlist = NULL; + + file = kobj_open_file(filename); + if (file == (struct _buf *)-1) { + KFDEBUG((CE_CONT, "cannot open file: %s\n", filename)); + return (ENOENT); + } + + offset = 0; + n = kobj_read_file(file, (char *)&hdr, sizeof (hdr), offset); + if (n != sizeof (hdr)) { + kobj_close_file(file); + if (n < 0) { + nvf_error("error reading header: %s\n", filename); + return (EIO); + } else if (n == 0) { + KFDEBUG((CE_CONT, "file empty: %s\n", filename)); + } else { + nvf_error("header size incorrect: %s\n", filename); + } + return (EINVAL); + } + offset += n; + + KFDEBUG2((CE_CONT, "nvpf_magic: 0x%x\n", hdr.nvpf_magic)); + KFDEBUG2((CE_CONT, "nvpf_version: %d\n", hdr.nvpf_version)); + KFDEBUG2((CE_CONT, "nvpf_size: %lld\n", + (longlong_t)hdr.nvpf_size)); + KFDEBUG2((CE_CONT, "nvpf_hdr_chksum: 0x%x\n", + hdr.nvpf_hdr_chksum)); + KFDEBUG2((CE_CONT, "nvpf_chksum: 0x%x\n", hdr.nvpf_chksum)); + + cksum = 
hdr.nvpf_hdr_chksum; + hdr.nvpf_hdr_chksum = 0; + hdrsum = nvp_cksum((uchar_t *)&hdr, sizeof (hdr)); + + if (hdr.nvpf_magic != NVPF_HDR_MAGIC || + hdr.nvpf_version != NVPF_HDR_VERSION || hdrsum != cksum) { + kobj_close_file(file); + if (hdrsum != cksum) { + nvf_error("%s: checksum error " + "(actual 0x%x, expected 0x%x)\n", + filename, hdrsum, cksum); + } + nvf_error("%s: header information incorrect", filename); + return (EINVAL); + } + + ASSERT(hdr.nvpf_size >= 0); + + buf = kmem_alloc(hdr.nvpf_size, KM_SLEEP); + n = kobj_read_file(file, buf, hdr.nvpf_size, offset); + if (n != hdr.nvpf_size) { + kmem_free(buf, hdr.nvpf_size); + kobj_close_file(file); + if (n < 0) { + nvf_error("%s: read error %d", filename, n); + } else { + nvf_error("%s: incomplete read %d/%lld", + filename, n, (longlong_t)hdr.nvpf_size); + } + return (EINVAL); + } + offset += n; + + rval = kobj_read_file(file, &c, 1, offset); + kobj_close_file(file); + if (rval > 0) { + nvf_error("%s is larger than %lld\n", + filename, (longlong_t)hdr.nvpf_size); + kmem_free(buf, hdr.nvpf_size); + return (EINVAL); + } + + cksum = nvp_cksum((uchar_t *)buf, hdr.nvpf_size); + if (hdr.nvpf_chksum != cksum) { + nvf_error("%s: checksum error (actual 0x%x, expected 0x%x)\n", + filename, hdr.nvpf_chksum, cksum); + kmem_free(buf, hdr.nvpf_size); + return (EINVAL); + } + + nvl = NULL; + rval = nvlist_unpack(buf, hdr.nvpf_size, &nvl, 0); + if (rval != 0) { + nvf_error("%s: error %d unpacking nvlist\n", + filename, rval); + kmem_free(buf, hdr.nvpf_size); + return (EINVAL); + } + + kmem_free(buf, hdr.nvpf_size); + *ret_nvlist = nvl; + return (0); +} + +static int +kfcreate(char *filename, kfile_t **kfilep) +{ + kfile_t *fp; + int rval; + + ASSERT(modrootloaded); + + fp = kmem_alloc(sizeof (kfile_t), KM_SLEEP); + + fp->kf_vnflags = FCREAT | FWRITE | FTRUNC; + fp->kf_fname = filename; + fp->kf_fpos = 0; + fp->kf_state = 0; + + KFDEBUG((CE_CONT, "create: %s flags 0x%x\n", + filename, fp->kf_vnflags)); + rval = 
vn_open(filename, UIO_SYSSPACE, fp->kf_vnflags, + 0444, &fp->kf_vp, CRCREAT, 0); + if (rval != 0) { + kmem_free(fp, sizeof (kfile_t)); + KFDEBUG((CE_CONT, "%s: create error %d\n", + filename, rval)); + return (rval); + } + + *kfilep = fp; + return (0); +} + +static int +kfremove(char *filename) +{ + int rval; + + KFDEBUG((CE_CONT, "remove: %s\n", filename)); + rval = vn_remove(filename, UIO_SYSSPACE, RMFILE); + if (rval != 0) { + KFDEBUG((CE_CONT, "%s: remove error %d\n", + filename, rval)); + } + return (rval); +} + +static int +kfread(kfile_t *fp, char *buf, ssize_t bufsiz, ssize_t *ret_n) +{ + ssize_t resid; + int err; + ssize_t n; + + ASSERT(modrootloaded); + + if (fp->kf_state != 0) + return (fp->kf_state); + + err = vn_rdwr(UIO_READ, fp->kf_vp, buf, bufsiz, fp->kf_fpos, + UIO_SYSSPACE, 0, (rlim64_t)0, kcred, &resid); + if (err != 0) { + KFDEBUG((CE_CONT, "%s: read error %d\n", + fp->kf_fname, err)); + fp->kf_state = err; + return (err); + } + + ASSERT(resid >= 0 && resid <= bufsiz); + n = bufsiz - resid; + + KFDEBUG1((CE_CONT, "%s: read %ld bytes ok %ld bufsiz, %ld resid\n", + fp->kf_fname, n, bufsiz, resid)); + + fp->kf_fpos += n; + *ret_n = n; + return (0); +} + +static int +kfwrite(kfile_t *fp, char *buf, ssize_t bufsiz, ssize_t *ret_n) +{ + rlim64_t rlimit; + ssize_t resid; + int err; + ssize_t len; + ssize_t n = 0; + + ASSERT(modrootloaded); + + if (fp->kf_state != 0) + return (fp->kf_state); + + len = bufsiz; + rlimit = bufsiz + 1; + for (;;) { + err = vn_rdwr(UIO_WRITE, fp->kf_vp, buf, len, fp->kf_fpos, + UIO_SYSSPACE, FSYNC, rlimit, kcred, &resid); + if (err) { + KFDEBUG((CE_CONT, "%s: write error %d\n", + fp->kf_fname, err)); + fp->kf_state = err; + return (err); + } + + KFDEBUG1((CE_CONT, "%s: write %ld bytes ok %ld resid\n", + fp->kf_fname, len-resid, resid)); + + ASSERT(resid >= 0 && resid <= len); + + n += (len - resid); + if (resid == 0) + break; + + if (resid == len) { + KFDEBUG((CE_CONT, "%s: filesystem full?\n", + fp->kf_fname)); + 
fp->kf_state = ENOSPC; + return (ENOSPC); + } + + len -= resid; + buf += len; + fp->kf_fpos += len; + len = resid; + } + + ASSERT(n == bufsiz); + KFDEBUG1((CE_CONT, "%s: wrote %ld bytes ok\n", fp->kf_fname, n)); + + *ret_n = n; + return (0); +} + + +static int +kfclose(kfile_t *fp) +{ + int rval; + + KFDEBUG((CE_CONT, "close: %s\n", fp->kf_fname)); + + if ((fp->kf_vnflags & FWRITE) && fp->kf_state == 0) { + rval = VOP_FSYNC(fp->kf_vp, FSYNC, kcred); + if (rval != 0) { + nvf_error("%s: sync error %d\n", + fp->kf_fname, rval); + } + KFDEBUG((CE_CONT, "%s: sync ok\n", fp->kf_fname)); + } + + rval = VOP_CLOSE(fp->kf_vp, fp->kf_vnflags, 1, (offset_t)0, kcred); + if (rval != 0) { + if (fp->kf_state == 0) { + nvf_error("%s: close error %d\n", + fp->kf_fname, rval); + } + } else { + if (fp->kf_state == 0) + KFDEBUG((CE_CONT, "%s: close ok\n", fp->kf_fname)); + } + + VN_RELE(fp->kf_vp); + kmem_free(fp, sizeof (kfile_t)); + return (rval); +} + +static int +kfrename(char *oldname, char *newname) +{ + int rval; + + ASSERT(modrootloaded); + + KFDEBUG((CE_CONT, "renaming %s to %s\n", oldname, newname)); + + if ((rval = vn_rename(oldname, newname, UIO_SYSSPACE)) != 0) { + KFDEBUG((CE_CONT, "rename %s to %s: %d\n", + oldname, newname, rval)); + } + + return (rval); +} + +int +fwrite_nvlist(char *filename, nvlist_t *nvl) +{ + char *buf; + char *nvbuf; + kfile_t *fp; + char *newname; + int len, err, err1; + size_t buflen; + ssize_t n; + + ASSERT(modrootloaded); + + nvbuf = NULL; + err = nvlist_pack(nvl, &nvbuf, &buflen, NV_ENCODE_NATIVE, 0); + if (err != 0) { + nvf_error("%s: error %d packing nvlist\n", + filename, err); + return (err); + } + + buf = kmem_alloc(sizeof (nvpf_hdr_t) + buflen, KM_SLEEP); + bzero(buf, sizeof (nvpf_hdr_t)); + + ((nvpf_hdr_t *)buf)->nvpf_magic = NVPF_HDR_MAGIC; + ((nvpf_hdr_t *)buf)->nvpf_version = NVPF_HDR_VERSION; + ((nvpf_hdr_t *)buf)->nvpf_size = buflen; + ((nvpf_hdr_t *)buf)->nvpf_chksum = nvp_cksum((uchar_t *)nvbuf, buflen); + ((nvpf_hdr_t 
*)buf)->nvpf_hdr_chksum = + nvp_cksum((uchar_t *)buf, sizeof (nvpf_hdr_t)); + + bcopy(nvbuf, buf + sizeof (nvpf_hdr_t), buflen); + kmem_free(nvbuf, buflen); + buflen += sizeof (nvpf_hdr_t); + + len = strlen(filename) + MAX_SUFFIX_LEN + 2; + newname = kmem_alloc(len, KM_SLEEP); + + + (void) sprintf(newname, "%s.%s", + filename, NEW_FILENAME_SUFFIX); + + /* + * To make it unlikely we suffer data loss, write + * data to the new temporary file. Once successful + * complete the transaction by renaming the new file + * to replace the previous. + */ + + if ((err = kfcreate(newname, &fp)) == 0) { + err = kfwrite(fp, buf, buflen, &n); + if (err) { + nvf_error("%s: write error - %d\n", + newname, err); + } else { + if (n != buflen) { + nvf_error( + "%s: partial write %ld of %ld bytes\n", + newname, n, buflen); + nvf_error("%s: filesystem may be full?\n", + newname); + err = EIO; + } + } + if ((err1 = kfclose(fp)) != 0) { + nvf_error("%s: close error\n", newname); + if (err == 0) + err = err1; + } + if (err != 0) { + if (kfremove(newname) != 0) { + nvf_error("%s: remove failed\n", + newname); + } + } + } else { + nvf_error("%s: create failed - %d\n", filename, err); + } + + if (err == 0) { + if ((err = kfrename(newname, filename)) != 0) { + nvf_error("%s: rename from %s failed\n", + newname, filename); + } + } + + kmem_free(newname, len); + kmem_free(buf, buflen); + + return (err); +} + +static int +e_fwrite_nvlist(nvfd_t *nvfd, nvlist_t *nvl) +{ + int err; + + if ((err = fwrite_nvlist(nvfd->nvf_cache_path, nvl)) == 0) + return (DDI_SUCCESS); + else { + if (err == EROFS) + NVF_MARK_READONLY(nvfd); + return (DDI_FAILURE); + } +} + +static void +nvp_list_free(nvfd_t *nvf) +{ + ASSERT(RW_WRITE_HELD(&nvf->nvf_lock)); + (nvf->nvf_list_free)((nvf_handle_t)nvf); + ASSERT(RW_WRITE_HELD(&nvf->nvf_lock)); +} + +/* + * Read a file in the nvlist format + * EIO - i/o error during read + * ENOENT - file not found + * EINVAL - file contents corrupted + */ +static int +fread_nvp_list(nvfd_t 
*nvfd) +{ + nvlist_t *nvl; + nvpair_t *nvp; + char *name; + nvlist_t *sublist; + int rval; + int rv; + + ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock))); + + rval = fread_nvlist(nvfd->nvf_cache_path, &nvl); + if (rval != 0) + return (rval); + ASSERT(nvl != NULL); + + nvp = NULL; + while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { + name = nvpair_name(nvp); + ASSERT(strlen(name) > 0); + + switch (nvpair_type(nvp)) { + case DATA_TYPE_NVLIST: + rval = nvpair_value_nvlist(nvp, &sublist); + if (rval != 0) { + nvf_error( + "nvpair_value_nvlist error %s %d\n", + name, rval); + goto error; + } + + /* + * unpack nvlist for this device and + * add elements to data list. + */ + ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock))); + rv = (nvfd->nvf_unpack_nvlist) + ((nvf_handle_t)nvfd, sublist, name); + ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock))); + if (rv != 0) { + nvf_error( + "%s: %s invalid list element\n", + nvfd->nvf_cache_path, name); + rval = EINVAL; + goto error; + } + break; + + default: + nvf_error("%s: %s unsupported data type %d\n", + nvfd->nvf_cache_path, name, nvpair_type(nvp)); + rval = EINVAL; + goto error; + } + } + + nvlist_free(nvl); + + return (0); + +error: + nvlist_free(nvl); + nvp_list_free(nvfd); + return (rval); +} + + +int +nvf_read_file(nvf_handle_t nvf_handle) +{ + nvfd_t *nvfd = (nvfd_t *)nvf_handle; + int rval; + + ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock)); + + if (kfio_disable_read) + return (0); + + KFDEBUG((CE_CONT, "reading %s\n", nvfd->nvf_cache_path)); + + rval = fread_nvp_list(nvfd); + if (rval) { + switch (rval) { + case EIO: + nvfd->nvf_flags |= NVF_F_REBUILD_MSG; + cmn_err(CE_WARN, "%s: I/O error", + nvfd->nvf_cache_path); + break; + case ENOENT: + nvfd->nvf_flags |= NVF_F_CREATE_MSG; + nvf_error("%s: not found\n", + nvfd->nvf_cache_path); + break; + case EINVAL: + default: + nvfd->nvf_flags |= NVF_F_REBUILD_MSG; + cmn_err(CE_WARN, "%s: data file corrupted", + nvfd->nvf_cache_path); + break; + } + } + return (rval); +} + +static void 
+nvf_write_is_complete(nvfd_t *fd) +{ + if (fd->nvf_write_complete) { + (fd->nvf_write_complete)((nvf_handle_t)fd); + } +} + +/*ARGSUSED*/ +static void +nvpflush_timeout(void *arg) +{ + clock_t nticks; + + mutex_enter(&nvpflush_lock); + nticks = nvpticks - ddi_get_lbolt(); + if (nticks > 4) { + nvpflush_timer_busy = 1; + mutex_exit(&nvpflush_lock); + nvpflush_id = timeout(nvpflush_timeout, NULL, nticks); + } else { + do_nvpflush = 1; + NVPDAEMON_DEBUG((CE_CONT, "signal nvpdaemon\n")); + cv_signal(&nvpflush_cv); + nvpflush_id = 0; + nvpflush_timer_busy = 0; + mutex_exit(&nvpflush_lock); + } +} + +/* + * After marking a list as dirty, wake the nvpflush daemon + * to perform the update. + */ +void +nvf_wake_daemon(void) +{ + clock_t nticks; + + /* + * If the system isn't up yet + * don't even think about starting a flush. + */ + if (!i_ddi_io_initialized()) + return; + + mutex_enter(&nvpflush_lock); + + if (nvpflush_daemon_active == 0) { + nvpflush_daemon_active = 1; + mutex_exit(&nvpflush_lock); + NVPDAEMON_DEBUG((CE_CONT, "starting nvpdaemon thread\n")); + nvpflush_thr_id = thread_create(NULL, 0, + (void (*)())nvpflush_daemon, + NULL, 0, &p0, TS_RUN, minclsyspri); + mutex_enter(&nvpflush_lock); + } + + nticks = nvpflush_delay * TICKS_PER_SECOND; + nvpticks = ddi_get_lbolt() + nticks; + if (nvpflush_timer_busy == 0) { + nvpflush_timer_busy = 1; + mutex_exit(&nvpflush_lock); + nvpflush_id = timeout(nvpflush_timeout, NULL, nticks + 4); + } else + mutex_exit(&nvpflush_lock); +} + +static int +nvpflush_one(nvfd_t *nvfd) +{ + int rval = DDI_SUCCESS; + nvlist_t *nvl; + + rw_enter(&nvfd->nvf_lock, RW_READER); + + ASSERT((nvfd->nvf_flags & NVF_F_FLUSHING) == 0); + + if (!NVF_IS_DIRTY(nvfd) || + NVF_IS_READONLY(nvfd) || kfio_disable_write) { + NVF_CLEAR_DIRTY(nvfd); + rw_exit(&nvfd->nvf_lock); + return (DDI_SUCCESS); + } + + if (rw_tryupgrade(&nvfd->nvf_lock) == 0) { + nvf_error("nvpflush: " + "%s rw upgrade failed\n", nvfd->nvf_cache_path); + rw_exit(&nvfd->nvf_lock); + 
return (DDI_FAILURE); + } + if (((nvfd->nvf_pack_list) + ((nvf_handle_t)nvfd, &nvl)) != DDI_SUCCESS) { + nvf_error("nvpflush: " + "%s nvlist construction failed\n", nvfd->nvf_cache_path); + ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock)); + rw_exit(&nvfd->nvf_lock); + return (DDI_FAILURE); + } + ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock)); + + /* + * The data is packed; clear the dirty flag and drop the lock + * so clients can keep updating the list during the write. + */ + NVF_CLEAR_DIRTY(nvfd); + nvfd->nvf_flags |= NVF_F_FLUSHING; + rw_exit(&nvfd->nvf_lock); + + rval = e_fwrite_nvlist(nvfd, nvl); + nvlist_free(nvl); + + rw_enter(&nvfd->nvf_lock, RW_WRITER); + nvfd->nvf_flags &= ~NVF_F_FLUSHING; + if (rval == DDI_FAILURE) { + if (NVF_IS_READONLY(nvfd)) { + rval = DDI_SUCCESS; + nvfd->nvf_flags &= ~(NVF_F_ERROR | NVF_F_DIRTY); + } else { + /* + * Re-mark the data dirty after every failed write, + * not only the first, otherwise the next pass sees + * a clean file and stops retrying with the data + * still unwritten. Only the first failure in a + * row is reported. + */ + if ((nvfd->nvf_flags & NVF_F_ERROR) == 0) { + cmn_err(CE_CONT, "%s: update failed\n", + nvfd->nvf_cache_path); + } + nvfd->nvf_flags |= NVF_F_ERROR | NVF_F_DIRTY; + } + } else { + if (nvfd->nvf_flags & NVF_F_CREATE_MSG) { + cmn_err(CE_CONT, + "!Creating %s\n", nvfd->nvf_cache_path); + nvfd->nvf_flags &= ~NVF_F_CREATE_MSG; + } + if (nvfd->nvf_flags & NVF_F_REBUILD_MSG) { + cmn_err(CE_CONT, + "!Rebuilding %s\n", nvfd->nvf_cache_path); + nvfd->nvf_flags &= ~NVF_F_REBUILD_MSG; + } + if (nvfd->nvf_flags & NVF_F_ERROR) { + cmn_err(CE_CONT, + "%s: update now ok\n", nvfd->nvf_cache_path); + nvfd->nvf_flags &= ~NVF_F_ERROR; + } + /* + * The file may need to be flushed again if the cached + * data was touched while writing the earlier contents. 
+ */ + if (NVF_IS_DIRTY(nvfd)) + rval = DDI_FAILURE; + } + + rw_exit(&nvfd->nvf_lock); + return (rval); +} + + +static void +nvpflush_daemon(void) +{ + callb_cpr_t cprinfo; + nvfd_t *nvfdp, *nextfdp; + clock_t clk; + int rval; + int want_wakeup; + int is_now_clean; + + ASSERT(modrootloaded); + + nvpflush_thread = curthread; + NVPDAEMON_DEBUG((CE_CONT, "nvpdaemon: init\n")); + + CALLB_CPR_INIT(&cprinfo, &nvpflush_lock, callb_generic_cpr, "nvp"); + mutex_enter(&nvpflush_lock); + for (;;) { + + CALLB_CPR_SAFE_BEGIN(&cprinfo); + while (do_nvpflush == 0) { + clk = cv_timedwait(&nvpflush_cv, &nvpflush_lock, + ddi_get_lbolt() + + (nvpdaemon_idle_time * TICKS_PER_SECOND)); + if (clk == -1 && + do_nvpflush == 0 && nvpflush_timer_busy == 0) { + /* + * Note that CALLB_CPR_EXIT calls mutex_exit() + * on the lock passed in to CALLB_CPR_INIT, + * so the lock must be held when invoking it. + */ + CALLB_CPR_SAFE_END(&cprinfo, &nvpflush_lock); + NVPDAEMON_DEBUG((CE_CONT, "nvpdaemon: exit\n")); + ASSERT(mutex_owned(&nvpflush_lock)); + nvpflush_thr_id = NULL; + nvpflush_daemon_active = 0; + CALLB_CPR_EXIT(&cprinfo); + thread_exit(); + } + } + CALLB_CPR_SAFE_END(&cprinfo, &nvpflush_lock); + + nvpbusy = 1; + want_wakeup = 0; + do_nvpflush = 0; + mutex_exit(&nvpflush_lock); + + /* + * Try flushing what's dirty, reschedule if there's + * a failure or data gets marked as dirty again. + * First move each file marked dirty to the dirty + * list to avoid locking the list across the write. 
+ */ + mutex_enter(&nvf_cache_mutex); + for (nvfdp = list_head(&nvf_cache_files); + nvfdp; nvfdp = nextfdp) { + nextfdp = list_next(&nvf_cache_files, nvfdp); + rw_enter(&nvfdp->nvf_lock, RW_READER); + if (NVF_IS_DIRTY(nvfdp)) { + list_remove(&nvf_cache_files, nvfdp); + list_insert_tail(&nvf_dirty_files, nvfdp); + rw_exit(&nvfdp->nvf_lock); + } else { + NVPDAEMON_DEBUG((CE_CONT, + "nvpdaemon: not dirty %s\n", + nvfdp->nvf_cache_path)); + rw_exit(&nvfdp->nvf_lock); + } + } + mutex_exit(&nvf_cache_mutex); + + /* + * Now go through the dirty list + */ + for (nvfdp = list_head(&nvf_dirty_files); + nvfdp; nvfdp = nextfdp) { + nextfdp = list_next(&nvf_dirty_files, nvfdp); + + is_now_clean = 0; + rw_enter(&nvfdp->nvf_lock, RW_READER); + if (NVF_IS_DIRTY(nvfdp)) { + NVPDAEMON_DEBUG((CE_CONT, + "nvpdaemon: flush %s\n", + nvfdp->nvf_cache_path)); + rw_exit(&nvfdp->nvf_lock); + rval = nvpflush_one(nvfdp); + rw_enter(&nvfdp->nvf_lock, RW_READER); + if (rval != DDI_SUCCESS || + NVF_IS_DIRTY(nvfdp)) { + rw_exit(&nvfdp->nvf_lock); + NVPDAEMON_DEBUG((CE_CONT, + "nvpdaemon: %s dirty again\n", + nvfdp->nvf_cache_path)); + want_wakeup = 1; + } else { + rw_exit(&nvfdp->nvf_lock); + nvf_write_is_complete(nvfdp); + is_now_clean = 1; + } + } else { + NVPDAEMON_DEBUG((CE_CONT, + "nvpdaemon: not dirty %s\n", + nvfdp->nvf_cache_path)); + rw_exit(&nvfdp->nvf_lock); + is_now_clean = 1; + } + + if (is_now_clean) { + mutex_enter(&nvf_cache_mutex); + list_remove(&nvf_dirty_files, nvfdp); + list_insert_tail(&nvf_cache_files, + nvfdp); + mutex_exit(&nvf_cache_mutex); + } + } + + if (want_wakeup) + nvf_wake_daemon(); + + mutex_enter(&nvpflush_lock); + nvpbusy = 0; + } +} |
