| author | Dave Plauger <Dave.Plauger@Sun.COM> | 2009-10-22 20:06:52 -0400 |
|---|---|---|
| committer | Dave Plauger <Dave.Plauger@Sun.COM> | 2009-10-22 20:06:52 -0400 |
| commit | ca3e8d88e8c867355e441fbc914c52e7416fc537 (patch) | |
| tree | 27934a23f3f293cfac68ec2188db5bf26361c12e /usr/src/uts/common | |
| parent | c9cc1492d5b27b76cf77300ab3aafd0857f38228 (diff) | |
| download | illumos-joyent-ca3e8d88e8c867355e441fbc914c52e7416fc537.tar.gz | |
6828976 Fast Crash Dump
6878030 live crash dump is much slower than reboot dump
6626023 Crash dump size is excessive on large memory machines
Diffstat (limited to 'usr/src/uts/common')
| -rw-r--r-- | usr/src/uts/common/Makefile.files | 7 |
| -rw-r--r-- | usr/src/uts/common/Makefile.rules | 9 |
| -rw-r--r-- | usr/src/uts/common/os/dumpsubr.c | 2362 |
| -rw-r--r-- | usr/src/uts/common/sys/dumphdr.h | 81 |
| -rw-r--r-- | usr/src/uts/common/vm/hat.h | 4 |
| -rw-r--r-- | usr/src/uts/common/vm/vm_page.c | 14 |
| -rw-r--r-- | usr/src/uts/common/vm/vm_pagelist.c | 4 |
7 files changed, 2287 insertions, 194 deletions
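The core of this change is a set of closable producer/consumer queues (`cqueue_t` in the dumpsubr.c diff below) that pass page-map and output buffers between the panicking "master" CPU and the idle helper CPUs. As a rough mental model before reading the diff, here is a minimal user-space sketch of that protocol using pthreads; the names (`cq_t`, `cq_put`, `cq_get`, `cq_close`) and the mutex/condvar locking are illustrative stand-ins, since the kernel version switches to spinlocks and atomics during panic.

```c
#include <pthread.h>
#include <stddef.h>

typedef struct buf {
	struct buf *next;
	/* payload elided */
} buf_t;

typedef struct cq {
	buf_t *first, *last;
	unsigned open;			/* producer reference count */
	pthread_mutex_t mu;
	pthread_cond_t cv;
} cq_t;

/* Append a buffer and wake one consumer. */
static void
cq_put(cq_t *q, buf_t *b)
{
	pthread_mutex_lock(&q->mu);
	b->next = NULL;
	if (q->last != NULL)
		q->last->next = b;
	else
		q->first = b;
	q->last = b;
	pthread_cond_signal(&q->cv);
	pthread_mutex_unlock(&q->mu);
}

/* Block for a buffer; NULL means empty and all producers closed. */
static buf_t *
cq_get(cq_t *q)
{
	buf_t *b;

	pthread_mutex_lock(&q->mu);
	while (q->first == NULL && q->open > 0)
		pthread_cond_wait(&q->cv, &q->mu);
	if ((b = q->first) != NULL && (q->first = b->next) == NULL)
		q->last = NULL;
	pthread_mutex_unlock(&q->mu);
	return (b);
}

/* Drop one producer reference; the last close releases consumers. */
static void
cq_close(cq_t *q)
{
	pthread_mutex_lock(&q->mu);
	q->open--;
	pthread_cond_broadcast(&q->cv);
	pthread_mutex_unlock(&q->mu);
}
```

A helper's main loop is then just `while ((b = cq_get(&inq)) != NULL) { compress(b); cq_put(&outq, b); } cq_close(&outq);`, which mirrors how `dumpsys_bz2compress()` and `dumpsys_lzjbcompress()` drain `helperq` and feed `mainq` in the diff below.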
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index 8826e3029c..e744cdef33 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -98,6 +98,13 @@ GENUNIX_OBJS += \ bitmap.o \ blabel.o \ brandsys.o \ + bz2blocksort.o \ + bz2compress.o \ + bz2decompress.o \ + bz2randtable.o \ + bz2bzlib.o \ + bz2crctable.o \ + bz2huffman.o \ callb.o \ callout.o \ chdir.o \ diff --git a/usr/src/uts/common/Makefile.rules b/usr/src/uts/common/Makefile.rules index 4003243568..72f3e59d3e 100644 --- a/usr/src/uts/common/Makefile.rules +++ b/usr/src/uts/common/Makefile.rules @@ -1463,6 +1463,15 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/tpm/%.c $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/tpm/%.s $(COMPILE.s) -o $@ $< +$(OBJS_DIR)/bz2%.o: $(COMMONBASE)/bzip2/%.c + $(COMPILE.c) -o $@ -I$(COMMONBASE)/bzip2 $< + $(CTFCONVERT_O) + +BZ2LINT = -erroff=%all -I$(UTSBASE)/common/bzip2 + +$(LINTS_DIR)/bz2%.ln: $(COMMONBASE)/bzip2/%.c + @($(LHEAD) $(LINT.c) -C $(LINTS_DIR)/`basename $@ .ln` $(BZ2LINT) $< $(LTAIL)) + # # SVM # diff --git a/usr/src/uts/common/os/dumpsubr.c b/usr/src/uts/common/os/dumpsubr.c index e29679a985..d7c93ada2e 100644 --- a/usr/src/uts/common/os/dumpsubr.c +++ b/usr/src/uts/common/os/dumpsubr.c @@ -66,87 +66,980 @@ #include <vm/hat.h> #include <vm/as.h> #include <vm/page.h> +#include <vm/pvn.h> #include <vm/seg.h> #include <vm/seg_kmem.h> -kmutex_t dump_lock; /* lock for dump configuration */ -dumphdr_t *dumphdr; /* dump header */ +#include <bzip2/bzlib.h> + +/* + * Crash dump time is dominated by disk write time. To reduce this, + * the stronger compression method bzip2 is applied to reduce the dump + * size and hence reduce I/O time. However, bzip2 is much more + * computationally expensive than the existing lzjb algorithm, so to + * avoid increasing compression time, CPUs that are otherwise idle + * during panic are employed to parallelize the compression task. + * Many helper CPUs are needed to prevent bzip2 from being a + * bottleneck, and on systems with too few CPUs, the lzjb algorithm is + * parallelized instead. Lastly, I/O and compression are performed by + * different CPUs, and are hence overlapped in time, unlike the older + * serial code. + * + * Another important consideration is the speed of the dump + * device. Faster disks need fewer CPUs in order to benefit from + * parallel lzjb versus parallel bzip2. Therefore, the CPU count + * threshold for switching from parallel lzjb to parallel bzip2 is + * elevated for faster disks. The dump device speed is adduced from + * the setting for dumpbuf.iosize, see dump_update_clevel.
+ */ + +/* + * exported vars + */ +kmutex_t dump_lock; /* lock for dump configuration */ +dumphdr_t *dumphdr; /* dump header */ int dump_conflags = DUMP_KERNEL; /* dump configuration flags */ -vnode_t *dumpvp; /* dump device vnode pointer */ -u_offset_t dumpvp_size; /* size of dump device, in bytes */ -static u_offset_t dumpvp_limit; /* maximum write offset */ -char *dumppath; /* pathname of dump device */ -int dump_timeout = 120; /* timeout for dumping page during panic */ -int dump_timeleft; /* portion of dump_timeout remaining */ -int dump_ioerr; /* dump i/o error */ +vnode_t *dumpvp; /* dump device vnode pointer */ +u_offset_t dumpvp_size; /* size of dump device, in bytes */ +char *dumppath; /* pathname of dump device */ +int dump_timeout = 120; /* timeout for dumping pages */ +int dump_timeleft; /* portion of dump_timeout remaining */ +int dump_ioerr; /* dump i/o error */ +int dump_check_used; /* enable check for used pages */ + +/* + * Tunables for dump compression and parallelism. These can be set via + * /etc/system. + * + * dump_ncpu_low number of helpers for parallel lzjb + * This is also the minimum configuration. + * + * dump_bzip2_level bzip2 compression level: 1-9 + * Higher numbers give greater compression, but take more memory + * and time. Memory used per helper is ~(dump_bzip2_level * 1MB). + * + * dump_plat_mincpu the cross-over limit for using bzip2 (per platform): + * if dump_plat_mincpu == 0, then always do single threaded dump + * if ncpu >= dump_plat_mincpu then try to use bzip2 + * + * dump_metrics_on if set, metrics are collected in the kernel, passed + * to savecore via the dump file, and recorded by savecore in + * METRICS.txt. + */ +uint_t dump_ncpu_low = 4; /* minimum config for parallel lzjb */ +uint_t dump_bzip2_level = 1; /* bzip2 level (1-9) */ + +/* Define multiple buffers per helper to avoid stalling */ +#define NCBUF_PER_HELPER 2 +#define NCMAP_PER_HELPER 4 + +/* minimum number of helpers configured */ +#define MINHELPERS (dump_ncpu_low) +#define MINCBUFS (MINHELPERS * NCBUF_PER_HELPER) + +/* + * Define constant parameters. + * + * CBUF_SIZE size of an output buffer + * + * CBUF_MAPSIZE size of virtual range for mapping pages + * + * CBUF_MAPNP size of virtual range in pages + * + */ +#define DUMP_1KB ((size_t)1 << 10) +#define DUMP_1MB ((size_t)1 << 20) +#define CBUF_SIZE ((size_t)1 << 17) +#define CBUF_MAPSHIFT (22) +#define CBUF_MAPSIZE ((size_t)1 << CBUF_MAPSHIFT) +#define CBUF_MAPNP ((size_t)1 << (CBUF_MAPSHIFT - PAGESHIFT)) + +/* + * Compression metrics are accumulated nano-second subtotals. The + * results are normalized by the number of pages dumped. A report is + * generated when dumpsys() completes and is saved in the dump image + * after the trailing dump header. + * + * Metrics are always collected. Set the variable dump_metrics_on to + * cause metrics to be saved in the crash file, where savecore will + * save it in the file METRICS.txt. + */ +#define PERPAGES \ + PERPAGE(bitmap) PERPAGE(map) PERPAGE(unmap) \ + PERPAGE(copy) PERPAGE(compress) \ + PERPAGE(write) \ + PERPAGE(inwait) PERPAGE(outwait) + +typedef struct perpage { +#define PERPAGE(x) hrtime_t x; + PERPAGES +#undef PERPAGE +} perpage_t; + +/* + * This macro controls the code generation for collecting dump + * performance information. By default, the code is generated, but + * automatic saving of the information is disabled. If dump_metrics_on + * is set to 1, the timing information is passed to savecore via the + * crash file, where it is appended to the file dump-dir/METRICS.txt. 
+ */ +#define COLLECT_METRICS + +#ifdef COLLECT_METRICS +uint_t dump_metrics_on = 0; /* set to 1 to enable recording metrics */ + +#define HRSTART(v, m) v##ts.m = gethrtime() +#define HRSTOP(v, m) v.m += gethrtime() - v##ts.m +#define HRBEGIN(v, m, s) v##ts.m = gethrtime(); v.size += s +#define HREND(v, m) v.m += gethrtime() - v##ts.m +#define HRNORM(v, m, n) v.m /= (n) -#ifdef DEBUG -int dumpfaildebug = 1; /* enter debugger if dump fails */ #else -int dumpfaildebug = 0; -#endif +#define HRSTART(v, m) +#define HRSTOP(v, m) +#define HRBEGIN(v, m, s) +#define HREND(v, m) +#define HRNORM(v, m, n) +#endif /* COLLECT_METRICS */ -static ulong_t *dump_bitmap; /* bitmap for marking pages to dump */ -static pgcnt_t dump_bitmapsize; /* size of bitmap */ -static pid_t *dump_pids; /* list of process IDs at dump time */ -static offset_t dumpvp_off; /* current dump device offset */ -static char *dump_cmap; /* VA for dump compression mapping */ -static char *dumpbuf_cur, *dumpbuf_start, *dumpbuf_end; -static char *dump_cbuf; /* compression buffer */ -static char *dump_uebuf; /* memory error detection buffer */ -static size_t dumpbuf_size; /* size of dumpbuf in bytes */ -static size_t dumpbuf_limit = 1UL << 23; /* 8MB */ -static size_t dump_iosize; /* device's best transfer size, if any */ -static uint64_t dumpbuf_thresh = 1ULL << 30; /* 1GB */ -static ulong_t dumpbuf_mult = 8; - -/* - * The dump i/o buffer must be at least one page, at most xfer_size bytes, and - * should scale with physmem in between. The transfer size passed in will - * either represent a global default (maxphys) or the best size for the device. - * Once the physical memory size exceeds dumpbuf_thresh (1GB by default), we - * increase the percentage of physical memory that dumpbuf can consume by a - * factor of dumpbuf_mult (8 by default) to improve large memory performance. - * The size of the dumpbuf i/o buffer is limited by dumpbuf_limit (8MB by - * default) because the dump performance saturates beyond a certain size. +/* + * Buffers for copying and compressing memory pages. + * + * cbuf_t buffer controllers: used for both input and output. + * + * The buffer state indicates how it is being used: + * + * CBUF_FREEMAP: CBUF_MAPSIZE virtual address range is available for + * mapping input pages. + * + * CBUF_INREADY: input pages are mapped and ready for compression by a + * helper. + * + * CBUF_USEDMAP: mapping has been consumed by a helper. Needs unmap. + * + * CBUF_FREEBUF: CBUF_SIZE output buffer, which is available. + * + * CBUF_WRITE: CBUF_SIZE block of compressed pages from a helper, + * ready to write out. + * + * CBUF_ERRMSG: CBUF_SIZE block of error messages from a helper + * (reports UE errors.) + */ + +typedef enum cbufstate { + CBUF_FREEMAP, + CBUF_INREADY, + CBUF_USEDMAP, + CBUF_FREEBUF, + CBUF_WRITE, + CBUF_ERRMSG +} cbufstate_t; + +typedef struct cbuf cbuf_t; + +struct cbuf { + cbuf_t *next; /* next in list */ + cbufstate_t state; /* processing state */ + size_t used; /* amount used */ + size_t size; /* mem size */ + char *buf; /* kmem or vmem */ + pgcnt_t pagenum; /* index to pfn map */ + pgcnt_t bitnum; /* first set bitnum */ + pfn_t pfn; /* first pfn in mapped range */ + int off; /* byte offset to first pfn */ +}; + +/* + * cqueue_t queues: a uni-directional channel for communication + * from the master to helper tasks or vice-versa using put and + * get primitives. Both mappings and data buffers are passed via + * queues. Producers close a queue when done. 
The number of + active producers is reference counted so the consumer can + detect end of data. Concurrent access is mediated by atomic + operations for panic dump, or mutex/cv for live dump. + * + * There are four queues, used as follows: + * + * Queue Dataflow NewState + * -------------------------------------------------- + * mainq master -> master FREEMAP + * master has initialized or unmapped an input buffer + * -------------------------------------------------- + * helperq master -> helper INREADY + * master has mapped input for use by helper + * -------------------------------------------------- + * mainq master <- helper USEDMAP + * helper is done with input + * -------------------------------------------------- + * freebufq master -> helper FREEBUF + * master has initialized or written an output buffer + * -------------------------------------------------- + * mainq master <- helper WRITE + * block of compressed pages from a helper + * -------------------------------------------------- + * mainq master <- helper ERRMSG + * error messages from a helper (memory error case) + * -------------------------------------------------- + * writerq master <- master WRITE + * non-blocking queue of blocks to write + * -------------------------------------------------- + */ +typedef struct cqueue { + cbuf_t *volatile first; /* first in list */ + cbuf_t *last; /* last in list */ + hrtime_t ts; /* timestamp */ + hrtime_t empty; /* total time empty */ + kmutex_t mutex; /* live state lock */ + kcondvar_t cv; /* live wait var */ + lock_t spinlock; /* panic mode spin lock */ + volatile uint_t open; /* producer ref count */ +} cqueue_t; + +/* + * Convenience macros for using the cqueue functions + * Note that the caller must have defined "dumpsync_t *ds" + */ +#define CQ_IS_EMPTY(q) \ + (ds->q.first == NULL) + +#define CQ_OPEN(q) \ + atomic_inc_uint(&ds->q.open) + +#define CQ_CLOSE(q) \ + dumpsys_close_cq(&ds->q, ds->live) + +#define CQ_PUT(q, cp, st) \ + dumpsys_put_cq(&ds->q, cp, st, ds->live) + +#define CQ_GET(q) \ + dumpsys_get_cq(&ds->q, ds->live) + +/* + * Dynamic state when dumpsys() is running. */ +typedef struct dumpsync { + pgcnt_t npages; /* subtotal of pages dumped */ + pgcnt_t pages_mapped; /* subtotal of pages mapped */ + pgcnt_t pages_used; /* subtotal of pages used per map */ + size_t nwrite; /* subtotal of bytes written */ + uint_t live; /* running live dump */ + uint_t neednl; /* will need to print a newline */ + uint_t percent; /* dump progress */ + uint_t percent_done; /* dump progress reported */ + cqueue_t freebufq; /* free kmem bufs for writing */ + cqueue_t mainq; /* input for main task */ + cqueue_t helperq; /* input for helpers */ + cqueue_t writerq; /* input for writer */ + hrtime_t start; /* start time */ + hrtime_t elapsed; /* elapsed time when completed */ + hrtime_t iotime; /* time spent writing nwrite bytes */ + hrtime_t iowait; /* time spent waiting for output */ + hrtime_t iowaitts; /* iowait timestamp */ + perpage_t perpage; /* metrics */ + perpage_t perpagets; + int dumpcpu; /* master cpu */ +} dumpsync_t; + +static dumpsync_t dumpsync; /* synchronization vars */ + +/* + * helper_t helpers: contains the context for a stream. CPUs run in + * parallel at dump time; each CPU creates a single stream of + * compression data. Stream data is divided into CBUF_SIZE blocks. + * The blocks are written in order within a stream. But, blocks from + * multiple streams can be interleaved. Each stream is identified by a + * unique tag.
+ */ +typedef struct helper { + int helper; /* bound helper id */ + int tag; /* compression stream tag */ + perpage_t perpage; /* per page metrics */ + perpage_t perpagets; /* per page metrics (timestamps) */ + taskqid_t taskqid; /* live dump task ptr */ + int in, out; /* buffer offsets */ + cbuf_t *cpin, *cpout, *cperr; /* cbuf objects in process */ + dumpsync_t *ds; /* pointer to sync vars */ + size_t used; /* counts input consumed */ + char *page; /* buffer for page copy */ + char *lzbuf; /* lzjb output */ + bz_stream bzstream; /* bzip2 state */ +} helper_t; + +#define MAINHELPER (-1) /* helper is also the main task */ +#define FREEHELPER (-2) /* unbound helper */ +#define DONEHELPER (-3) /* helper finished */ + +/* + * configuration vars for dumpsys + */ +typedef struct dumpcfg { + int threshold; /* ncpu threshold for bzip2 */ + int nhelper; /* number of helpers */ + int nhelper_used; /* actual number of helpers used */ + int ncmap; /* number VA pages for compression */ + int ncbuf; /* number of bufs for compression */ + int ncbuf_used; /* number of bufs in use */ + uint_t clevel; /* dump compression level */ + helper_t *helper; /* array of helpers */ + cbuf_t *cmap; /* array of input (map) buffers */ + cbuf_t *cbuf; /* array of output buffers */ + ulong_t *helpermap; /* set of dumpsys helper CPU ids */ + ulong_t *bitmap; /* bitmap for marking pages to dump */ + ulong_t *rbitmap; /* bitmap for used CBUF_MAPSIZE ranges */ + pgcnt_t bitmapsize; /* size of bitmap */ + pgcnt_t rbitmapsize; /* size of bitmap for ranges */ + pgcnt_t found4m; /* number ranges allocated by dump */ + pgcnt_t foundsm; /* number small pages allocated by dump */ + pid_t *pids; /* list of process IDs at dump time */ + size_t maxsize; /* memory size needed at dump time */ + size_t maxvmsize; /* size of reserved VM */ + char *maxvm; /* reserved VM for spare pages */ + lock_t helper_lock; /* protect helper state */ + char helpers_wanted; /* flag to enable parallelism */ +} dumpcfg_t; + +static dumpcfg_t dumpcfg; /* config vars */ + +/* + * The dump I/O buffer. + * + * There is one I/O buffer used by dumpvp_write and dumpvp_flush. It is + * sized according to the optimum device transfer speed. + */ +typedef struct dumpbuf { + vnode_t *cdev_vp; /* VCHR open of the dump device */ + len_t vp_limit; /* maximum write offset */ + offset_t vp_off; /* current dump device offset */ + char *cur; /* dump write pointer */ + char *start; /* dump buffer address */ + char *end; /* dump buffer end */ + size_t size; /* size of dumpbuf in bytes */ + size_t iosize; /* best transfer size for device */ +} dumpbuf_t; + +dumpbuf_t dumpbuf; /* I/O buffer */ + +/* + * The dump I/O buffer must be at least one page, at most xfer_size + * bytes, and should scale with physmem in between. The transfer size + * passed in will either represent a global default (maxphys) or the + * best size for the device. The size of the dumpbuf I/O buffer is + * limited by dumpbuf_limit (8MB by default) because the dump + * performance saturates beyond a certain size. The default is to + * select 1/4096 of the memory.
+ */ +static int dumpbuf_fraction = 12; /* memory size scale factor */ +static size_t dumpbuf_limit = 8 * DUMP_1MB; /* max I/O buf size */ + static size_t dumpbuf_iosize(size_t xfer_size) { - pgcnt_t scale = physmem; - size_t iosize; - - if (scale >= dumpbuf_thresh / PAGESIZE) { - scale *= dumpbuf_mult; /* increase scaling factor */ - iosize = MIN(xfer_size, scale) & PAGEMASK; - if (dumpbuf_limit && iosize > dumpbuf_limit) - iosize = MAX(PAGESIZE, dumpbuf_limit & PAGEMASK); - } else - iosize = MAX(PAGESIZE, MIN(xfer_size, scale) & PAGEMASK); - - return (iosize); + size_t iosize = ptob(physmem >> dumpbuf_fraction); + + if (iosize < PAGESIZE) + iosize = PAGESIZE; + else if (iosize > xfer_size) + iosize = xfer_size; + if (iosize > dumpbuf_limit) + iosize = dumpbuf_limit; + return (iosize & PAGEMASK); } +/* + * resize the I/O buffer + */ static void dumpbuf_resize(void) { - char *old_buf = dumpbuf_start; - size_t old_size = dumpbuf_size; + char *old_buf = dumpbuf.start; + size_t old_size = dumpbuf.size; char *new_buf; size_t new_size; ASSERT(MUTEX_HELD(&dump_lock)); - if ((new_size = dumpbuf_iosize(MAX(dump_iosize, maxphys))) <= old_size) + new_size = dumpbuf_iosize(MAX(dumpbuf.iosize, maxphys)); + if (new_size <= old_size) return; /* no need to reallocate buffer */ new_buf = kmem_alloc(new_size, KM_SLEEP); - dumpbuf_size = new_size; - dumpbuf_start = new_buf; - dumpbuf_end = new_buf + new_size; + dumpbuf.size = new_size; + dumpbuf.start = new_buf; + dumpbuf.end = new_buf + new_size; kmem_free(old_buf, old_size); } +/* + * dump_update_clevel is called when dumpadm configures the dump device. + * Calculate number of helpers and buffers. + * Allocate the minimum configuration for now. + * + * When the dump file is configured we reserve a minimum amount of + * memory for use at crash time. But we reserve VA for all the memory + * we really want in order to do the fastest dump possible. The VA is + * backed by pages not being dumped, according to the bitmap. If + * there is insufficient spare memory, however, we fall back to the + * minimum. + * + * Live dump (savecore -L) always uses the minimum config. + * + * clevel 0 is single threaded lzjb + * clevel 1 is parallel lzjb + * clevel 2 is parallel bzip2 + * + * The ncpu threshold is selected with dump_plat_mincpu. + * On OPL, set_platform_defaults() overrides the sun4u setting. + * The actual values are defined via DUMP_PLAT_*_MINCPU macros. + * + * Architecture Threshold Algorithm + * sun4u < 51 parallel lzjb + * sun4u >= 51 parallel bzip2(*) + * sun4u OPL < 8 parallel lzjb + * sun4u OPL >= 8 parallel bzip2(*) + * sun4v < 128 parallel lzjb + * sun4v >= 128 parallel bzip2(*) + * x86 < 11 parallel lzjb + * x86 >= 11 parallel bzip2(*) + * 32-bit N/A single-threaded lzjb + * + * (*) bzip2 is only chosen if there is sufficient available + * memory for buffers at dump time. See dumpsys_get_maxmem(). + * + * Faster dump devices have larger I/O buffers. The threshold value is + * increased according to the size of the dump I/O buffer, because + * parallel lzjb performs better with faster disks. For buffers >= 1MB + * the threshold is 3X; for buffers >= 256K threshold is 2X. + * + * For parallel dumps, the number of helpers is ncpu-1. The CPU + * running panic runs the main task. For single-threaded dumps, the + * panic CPU does lzjb compression (it is tagged as MAINHELPER.) + * + * Need multiple buffers per helper so that they do not block waiting + * for the main task. 
+ * parallel single-threaded + * Number of output buffers: nhelper*2 1 + * Number of mapping buffers: nhelper*4 1 + * + */ +static void +dump_update_clevel() +{ + int tag; + size_t bz2size; + helper_t *hp, *hpend; + cbuf_t *cp, *cpend; + dumpcfg_t *old = &dumpcfg; + dumpcfg_t newcfg = *old; + dumpcfg_t *new = &newcfg; + + ASSERT(MUTEX_HELD(&dump_lock)); + + /* + * Free the previously allocated bufs and VM. + */ + if (old->helper != NULL) { + + /* helpers */ + hpend = &old->helper[old->nhelper]; + for (hp = old->helper; hp != hpend; hp++) { + if (hp->lzbuf != NULL) + kmem_free(hp->lzbuf, PAGESIZE); + if (hp->page != NULL) + kmem_free(hp->page, PAGESIZE); + } + kmem_free(old->helper, old->nhelper * sizeof (helper_t)); + + /* VM space for mapping pages */ + cpend = &old->cmap[old->ncmap]; + for (cp = old->cmap; cp != cpend; cp++) + vmem_xfree(heap_arena, cp->buf, CBUF_MAPSIZE); + kmem_free(old->cmap, old->ncmap * sizeof (cbuf_t)); + + /* output bufs */ + cpend = &old->cbuf[old->ncbuf]; + for (cp = old->cbuf; cp != cpend; cp++) + if (cp->buf != NULL) + kmem_free(cp->buf, cp->size); + kmem_free(old->cbuf, old->ncbuf * sizeof (cbuf_t)); + + /* reserved VM for dumpsys_get_maxmem */ + if (old->maxvmsize > 0) + vmem_xfree(heap_arena, old->maxvm, old->maxvmsize); + } + + /* + * Allocate memory and VM. + * One CPU runs dumpsys, the rest are helpers. + */ + new->nhelper = ncpus - 1; + if (new->nhelper < 1) + new->nhelper = 1; + + if (new->nhelper > DUMP_MAX_NHELPER) + new->nhelper = DUMP_MAX_NHELPER; + + /* increase threshold for faster disks */ + new->threshold = dump_plat_mincpu; + if (dumpbuf.iosize >= DUMP_1MB) + new->threshold *= 3; + else if (dumpbuf.iosize >= (256 * DUMP_1KB)) + new->threshold *= 2; + + /* figure compression level based upon the computed threshold. */ + if (dump_plat_mincpu == 0 || new->nhelper < 2) { + new->clevel = 0; + new->nhelper = 1; + } else if ((new->nhelper + 1) >= new->threshold) { + new->clevel = DUMP_CLEVEL_BZIP2; + } else { + new->clevel = DUMP_CLEVEL_LZJB; + } + + if (new->clevel == 0) { + new->ncbuf = 1; + new->ncmap = 1; + } else { + new->ncbuf = NCBUF_PER_HELPER * new->nhelper; + new->ncmap = NCMAP_PER_HELPER * new->nhelper; + } + + /* + * Allocate new data structures and buffers for MINHELPERS, + * and also figure the max desired size. 
+ */ + bz2size = BZ2_bzCompressInitSize(dump_bzip2_level); + new->maxsize = 0; + new->maxvmsize = 0; + new->maxvm = NULL; + tag = 1; + new->helper = kmem_zalloc(new->nhelper * sizeof (helper_t), KM_SLEEP); + hpend = &new->helper[new->nhelper]; + for (hp = new->helper; hp != hpend; hp++) { + hp->tag = tag++; + if (hp < &new->helper[MINHELPERS]) { + hp->lzbuf = kmem_alloc(PAGESIZE, KM_SLEEP); + hp->page = kmem_alloc(PAGESIZE, KM_SLEEP); + } else if (new->clevel < DUMP_CLEVEL_BZIP2) { + new->maxsize += 2 * PAGESIZE; + } else { + new->maxsize += PAGESIZE; + } + if (new->clevel >= DUMP_CLEVEL_BZIP2) + new->maxsize += bz2size; + } + + new->cbuf = kmem_zalloc(new->ncbuf * sizeof (cbuf_t), KM_SLEEP); + cpend = &new->cbuf[new->ncbuf]; + for (cp = new->cbuf; cp != cpend; cp++) { + cp->state = CBUF_FREEBUF; + cp->size = CBUF_SIZE; + if (cp < &new->cbuf[MINCBUFS]) + cp->buf = kmem_alloc(cp->size, KM_SLEEP); + else + new->maxsize += cp->size; + } + + new->cmap = kmem_zalloc(new->ncmap * sizeof (cbuf_t), KM_SLEEP); + cpend = &new->cmap[new->ncmap]; + for (cp = new->cmap; cp != cpend; cp++) { + cp->state = CBUF_FREEMAP; + cp->size = CBUF_MAPSIZE; + cp->buf = vmem_xalloc(heap_arena, CBUF_MAPSIZE, CBUF_MAPSIZE, + 0, 0, NULL, NULL, VM_SLEEP); + } + + /* reserve VA to be backed with spare pages at crash time */ + if (new->maxsize > 0) { + new->maxsize = P2ROUNDUP(new->maxsize, PAGESIZE); + new->maxvmsize = P2ROUNDUP(new->maxsize, CBUF_MAPSIZE); + new->maxvm = vmem_xalloc(heap_arena, new->maxvmsize, + CBUF_MAPSIZE, 0, 0, NULL, NULL, VM_SLEEP); + } + + /* set new config pointers */ + *old = *new; +} + +/* + * Define a struct memlist walker to optimize bitnum to pfn + * lookup. The walker maintains the state of the list traversal. + */ +typedef struct dumpmlw { + struct memlist *mp; /* current memlist */ + pgcnt_t basenum; /* bitnum base offset */ + pgcnt_t mppages; /* current memlist size */ + pgcnt_t mpleft; /* size to end of current memlist */ + pfn_t mpaddr; /* first pfn in memlist */ +} dumpmlw_t; + +/* initialize the walker */ +static inline void +dump_init_memlist_walker(dumpmlw_t *pw) +{ + pw->mp = phys_install; + pw->basenum = 0; + pw->mppages = pw->mp->size >> PAGESHIFT; + pw->mpleft = pw->mppages; + pw->mpaddr = pw->mp->address >> PAGESHIFT; +} + +/* + * Lookup pfn given bitnum. The memlist can be quite long on some + * systems (e.g.: one per board). To optimize sequential lookups, the + * caller initializes and presents a memlist walker. + */ +static pfn_t +dump_bitnum_to_pfn(pgcnt_t bitnum, dumpmlw_t *pw) +{ + bitnum -= pw->basenum; + while (pw->mp != NULL) { + if (bitnum < pw->mppages) { + pw->mpleft = pw->mppages - bitnum; + return (pw->mpaddr + bitnum); + } + bitnum -= pw->mppages; + pw->basenum += pw->mppages; + pw->mp = pw->mp->next; + if (pw->mp != NULL) { + pw->mppages = pw->mp->size >> PAGESHIFT; + pw->mpleft = pw->mppages; + pw->mpaddr = pw->mp->address >> PAGESHIFT; + } + } + return (PFN_INVALID); +} + +static pgcnt_t +dump_pfn_to_bitnum(pfn_t pfn) +{ + struct memlist *mp; + pgcnt_t bitnum = 0; + + for (mp = phys_install; mp != NULL; mp = mp->next) { + if (pfn >= (mp->address >> PAGESHIFT) && + pfn < ((mp->address + mp->size) >> PAGESHIFT)) + return (bitnum + pfn - (mp->address >> PAGESHIFT)); + bitnum += mp->size >> PAGESHIFT; + } + return ((pgcnt_t)-1); +} + +/* + * Set/test bitmap for a CBUF_MAPSIZE range which includes pfn. The + * mapping of pfn to range index is imperfect because pfn and bitnum + * do not have the same phase. 
To make sure a CBUF_MAPSIZE range is + * covered, call this for both ends: + * dump_set_used(base) + * dump_set_used(base+CBUF_MAPNP-1) + * + * This is used during a panic dump to mark pages allocated by + * dumpsys_get_maxmem(). The macro IS_DUMP_PAGE(pp) is used by + * page_get_mnode_freelist() to make sure pages used by dump are never + * allocated. + */ +#define CBUF_MAPP2R(pfn) ((pfn) >> (CBUF_MAPSHIFT - PAGESHIFT)) + +static void +dump_set_used(pfn_t pfn) +{ + + pgcnt_t bitnum, rbitnum; + + bitnum = dump_pfn_to_bitnum(pfn); + ASSERT(bitnum != (pgcnt_t)-1); + + rbitnum = CBUF_MAPP2R(bitnum); + ASSERT(rbitnum < dumpcfg.rbitmapsize); + + BT_SET(dumpcfg.rbitmap, rbitnum); +} + +int +dump_test_used(pfn_t pfn) +{ + pgcnt_t bitnum, rbitnum; + + bitnum = dump_pfn_to_bitnum(pfn); + ASSERT(bitnum != (pgcnt_t)-1); + + rbitnum = CBUF_MAPP2R(bitnum); + ASSERT(rbitnum < dumpcfg.rbitmapsize); + + return (BT_TEST(dumpcfg.rbitmap, rbitnum)); +} + +/* + * dumpbzalloc and dumpbzfree are callbacks from the bzip2 library. + * dumpsys_get_maxmem() uses them for BZ2_bzCompressInit(). + */ +static void * +dumpbzalloc(void *opaque, int items, int size) +{ + size_t *sz; + char *ret; + + ASSERT(opaque != NULL); + sz = opaque; + ret = dumpcfg.maxvm + *sz; + *sz += items * size; + *sz = P2ROUNDUP(*sz, BZ2_BZALLOC_ALIGN); + ASSERT(*sz <= dumpcfg.maxvmsize); + return (ret); +} + +/*ARGSUSED*/ +static void +dumpbzfree(void *opaque, void *addr) +{ +} + +/* + * Perform additional checks on the page to see if we can really use + * it. The kernel (kas) pages are always set in the bitmap. However, + * boot memory pages (prom_ppages or P_BOOTPAGES) are not in the + * bitmap. So we check for them. + */ +static inline int +dump_pfn_check(pfn_t pfn) +{ + page_t *pp = page_numtopp_nolock(pfn); +#if defined(__sparc) + extern struct vnode prom_ppages; +#endif + + if (pp == NULL || pp->p_pagenum != pfn || +#if defined(__sparc) + pp->p_vnode == &prom_ppages || +#else + PP_ISBOOTPAGES(pp) || +#endif + pp->p_toxic != 0) + return (0); + return (1); +} + +/* + * Check a range to see if all contained pages are available and + * return non-zero if the range can be used. + */ +static inline int +dump_range_check(pgcnt_t start, pgcnt_t end, pfn_t pfn) +{ + for (; start < end; start++, pfn++) { + if (BT_TEST(dumpcfg.bitmap, start)) + return (0); + if (!dump_pfn_check(pfn)) + return (0); + } + return (1); +} + +/* + * dumpsys_get_maxmem() is called during panic. Find unused ranges + * and use them for buffers. If we find enough memory switch to + * parallel bzip2, otherwise use parallel lzjb. + * + * It searches the dump bitmap in 2 passes. The first time it looks + * for CBUF_MAPSIZE ranges. On the second pass it uses small pages. 
+ */ +static void +dumpsys_get_maxmem() +{ + dumpcfg_t *cfg = &dumpcfg; + cbuf_t *endcp = &cfg->cbuf[cfg->ncbuf]; + helper_t *endhp = &cfg->helper[cfg->nhelper]; + pgcnt_t bitnum, end; + size_t sz, endsz, bz2size; + pfn_t pfn, off; + cbuf_t *cp; + helper_t *hp, *ohp; + dumpmlw_t mlw; + int k; + + if (cfg->maxsize == 0 || cfg->clevel < DUMP_CLEVEL_LZJB || + (dump_conflags & DUMP_ALL) != 0) + return; + + sz = 0; + cfg->found4m = 0; + cfg->foundsm = 0; + + /* bitmap of ranges used to estimate which pfns are being used */ + bzero(dumpcfg.rbitmap, BT_SIZEOFMAP(dumpcfg.rbitmapsize)); + + /* find ranges that are not being dumped to use for buffers */ + dump_init_memlist_walker(&mlw); + for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum = end) { + dump_timeleft = dump_timeout; + end = bitnum + CBUF_MAPNP; + pfn = dump_bitnum_to_pfn(bitnum, &mlw); + ASSERT(pfn != PFN_INVALID); + + /* skip partial range at end of mem segment */ + if (mlw.mpleft < CBUF_MAPNP) { + end = bitnum + mlw.mpleft; + continue; + } + + /* skip non-aligned pages */ + off = P2PHASE(pfn, CBUF_MAPNP); + if (off != 0) { + end -= off; + continue; + } + + if (!dump_range_check(bitnum, end, pfn)) + continue; + + ASSERT((sz + CBUF_MAPSIZE) <= cfg->maxvmsize); + hat_devload(kas.a_hat, cfg->maxvm + sz, CBUF_MAPSIZE, pfn, + PROT_READ | PROT_WRITE, HAT_LOAD_NOCONSIST); + sz += CBUF_MAPSIZE; + cfg->found4m++; + + /* set the bitmap for both ends to be sure to cover the range */ + dump_set_used(pfn); + dump_set_used(pfn + CBUF_MAPNP - 1); + + if (sz >= cfg->maxsize) + goto foundmax; + } + + /* Add small pages if we can't find enough large pages. */ + dump_init_memlist_walker(&mlw); + for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum = end) { + dump_timeleft = dump_timeout; + end = bitnum + CBUF_MAPNP; + pfn = dump_bitnum_to_pfn(bitnum, &mlw); + ASSERT(pfn != PFN_INVALID); + + /* Find any non-aligned pages at start and end of segment. */ + off = P2PHASE(pfn, CBUF_MAPNP); + if (mlw.mpleft < CBUF_MAPNP) { + end = bitnum + mlw.mpleft; + } else if (off != 0) { + end -= off; + } else if (cfg->found4m && dump_test_used(pfn)) { + continue; + } + + for (; bitnum < end; bitnum++, pfn++) { + dump_timeleft = dump_timeout; + if (BT_TEST(dumpcfg.bitmap, bitnum)) + continue; + if (!dump_pfn_check(pfn)) + continue; + ASSERT((sz + PAGESIZE) <= cfg->maxvmsize); + hat_devload(kas.a_hat, cfg->maxvm + sz, PAGESIZE, pfn, + PROT_READ | PROT_WRITE, HAT_LOAD_NOCONSIST); + sz += PAGESIZE; + cfg->foundsm++; + dump_set_used(pfn); + if (sz >= cfg->maxsize) + goto foundmax; + } + } + + /* Fall back to lzjb if we did not get enough memory for bzip2. */ + endsz = (cfg->maxsize * cfg->threshold) / cfg->nhelper; + if (sz < endsz) { + cfg->clevel = DUMP_CLEVEL_LZJB; + } + + /* Allocate memory for as many helpers as we can. */ +foundmax: + + /* Byte offsets into memory found and mapped above */ + endsz = sz; + sz = 0; + + /* Set the size for bzip2 state. Only bzip2 needs it. */ + bz2size = BZ2_bzCompressInitSize(dump_bzip2_level); + + /* Skip the preallocated output buffers. */ + cp = &cfg->cbuf[MINCBUFS]; + + /* Use this to move memory up from the preallocated helpers. */ + ohp = cfg->helper; + + /* Loop over all helpers and allocate memory. */ + for (hp = cfg->helper; hp < endhp; hp++) { + + /* Skip preallocated helpers by checking hp->page.
*/ + if (hp->page == NULL) { + if (cfg->clevel <= DUMP_CLEVEL_LZJB) { + /* lzjb needs 2 1-page buffers */ + if ((sz + (2 * PAGESIZE)) > endsz) + break; + hp->page = cfg->maxvm + sz; + sz += PAGESIZE; + hp->lzbuf = cfg->maxvm + sz; + sz += PAGESIZE; + + } else if (ohp->lzbuf != NULL) { + /* re-use the preallocated lzjb page for bzip2 */ + hp->page = ohp->lzbuf; + ohp->lzbuf = NULL; + ++ohp; + + } else { + /* bzip2 needs a 1-page buffer */ + if ((sz + PAGESIZE) > endsz) + break; + hp->page = cfg->maxvm + sz; + sz += PAGESIZE; + } + } + + /* + * Add output buffers per helper. The number of + * buffers per helper is determined by the ratio of + * ncbuf to nhelper. + */ + for (k = 0; cp < endcp && (sz + CBUF_SIZE) <= endsz && + k < NCBUF_PER_HELPER; k++) { + cp->state = CBUF_FREEBUF; + cp->size = CBUF_SIZE; + cp->buf = cfg->maxvm + sz; + sz += CBUF_SIZE; + ++cp; + } + + /* + * bzip2 needs compression state. Use the dumpbzalloc + * and dumpbzfree callbacks to allocate the memory. + * bzip2 does allocation only at init time. + */ + if (cfg->clevel >= DUMP_CLEVEL_BZIP2) { + if ((sz + bz2size) > endsz) { + hp->page = NULL; + break; + } else { + hp->bzstream.opaque = &sz; + hp->bzstream.bzalloc = dumpbzalloc; + hp->bzstream.bzfree = dumpbzfree; + (void) BZ2_bzCompressInit(&hp->bzstream, + dump_bzip2_level, 0, 0); + hp->bzstream.opaque = NULL; + } + } + } + + /* Finish allocating output buffers */ + for (; cp < endcp && (sz + CBUF_SIZE) <= endsz; cp++) { + cp->state = CBUF_FREEBUF; + cp->size = CBUF_SIZE; + cp->buf = cfg->maxvm + sz; + sz += CBUF_SIZE; + } + + /* Enable IS_DUMP_PAGE macro, which checks for pages we took. */ + if (cfg->found4m || cfg->foundsm) + dump_check_used = 1; + + ASSERT(sz <= endsz); +} + static void dumphdr_init(void) { @@ -163,22 +1056,31 @@ dumphdr_init(void) dumphdr->dump_pagesize = PAGESIZE; dumphdr->dump_utsname = utsname; (void) strcpy(dumphdr->dump_platform, platform); - dump_cmap = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP); - dumpbuf_size = dumpbuf_iosize(maxphys); - dumpbuf_start = kmem_alloc(dumpbuf_size, KM_SLEEP); - dumpbuf_end = dumpbuf_start + dumpbuf_size; - dump_cbuf = kmem_alloc(PAGESIZE, KM_SLEEP); /* compress buf */ - dump_uebuf = kmem_alloc(PAGESIZE, KM_SLEEP); /* UE buf */ - dump_pids = kmem_alloc(v.v_proc * sizeof (pid_t), KM_SLEEP); + dumpbuf.size = dumpbuf_iosize(maxphys); + dumpbuf.start = kmem_alloc(dumpbuf.size, KM_SLEEP); + dumpbuf.end = dumpbuf.start + dumpbuf.size; + dumpcfg.pids = kmem_alloc(v.v_proc * sizeof (pid_t), KM_SLEEP); + dumpcfg.helpermap = kmem_zalloc(BT_SIZEOFMAP(NCPU), KM_SLEEP); + LOCK_INIT_HELD(&dumpcfg.helper_lock); } npages = num_phys_pages(); - if (dump_bitmapsize != npages) { + if (dumpcfg.bitmapsize != npages) { + size_t rlen = CBUF_MAPP2R(P2ROUNDUP(npages, CBUF_MAPNP)); void *map = kmem_alloc(BT_SIZEOFMAP(npages), KM_SLEEP); - kmem_free(dump_bitmap, BT_SIZEOFMAP(dump_bitmapsize)); - dump_bitmap = map; - dump_bitmapsize = npages; + void *rmap = kmem_alloc(BT_SIZEOFMAP(rlen), KM_SLEEP); + + if (dumpcfg.bitmap != NULL) + kmem_free(dumpcfg.bitmap, BT_SIZEOFMAP(dumpcfg. + bitmapsize)); + if (dumpcfg.rbitmap != NULL) + kmem_free(dumpcfg.rbitmap, BT_SIZEOFMAP(dumpcfg.
+ rbitmapsize)); + dumpcfg.bitmap = map; + dumpcfg.bitmapsize = npages; + dumpcfg.rbitmap = rmap; + dumpcfg.rbitmapsize = rlen; } } @@ -246,7 +1148,7 @@ dumpinit(vnode_t *vp, char *name, int justchecking) dumpvp_size = vattr.va_size & -DUMP_OFFSET; dumppath = kmem_alloc(strlen(name) + 1, KM_SLEEP); (void) strcpy(dumppath, name); - dump_iosize = 0; + dumpbuf.iosize = 0; /* * If the dump device is a block device, attempt to open up the @@ -270,7 +1172,7 @@ dumpinit(vnode_t *vp, char *name, int justchecking) if (VOP_IOCTL(cdev_vp, DKIOCINFO, (intptr_t)&dki, FKIOCTL, kcred, NULL, NULL) == 0) { - dump_iosize = dki.dki_maxtransfer * blk_size; + dumpbuf.iosize = dki.dki_maxtransfer * blk_size; dumpbuf_resize(); } /* @@ -295,6 +1197,8 @@ dumpinit(vnode_t *vp, char *name, int justchecking) cmn_err(CE_CONT, "?dump on %s size %llu MB\n", name, dumpvp_size >> 20); + dump_update_clevel(); + return (error); } @@ -341,70 +1245,62 @@ dumpfini(void) dumppath = NULL; } -static pfn_t -dump_bitnum_to_pfn(pgcnt_t bitnum) -{ - struct memlist *mp; - - for (mp = phys_install; mp != NULL; mp = mp->next) { - if (bitnum < (mp->size >> PAGESHIFT)) - return ((mp->address >> PAGESHIFT) + bitnum); - bitnum -= mp->size >> PAGESHIFT; - } - return (PFN_INVALID); -} - -static pgcnt_t -dump_pfn_to_bitnum(pfn_t pfn) -{ - struct memlist *mp; - pgcnt_t bitnum = 0; - - for (mp = phys_install; mp != NULL; mp = mp->next) { - if (pfn >= (mp->address >> PAGESHIFT) && - pfn < ((mp->address + mp->size) >> PAGESHIFT)) - return (bitnum + pfn - (mp->address >> PAGESHIFT)); - bitnum += mp->size >> PAGESHIFT; - } - return ((pgcnt_t)-1); -} - static offset_t dumpvp_flush(void) { - size_t size = P2ROUNDUP(dumpbuf_cur - dumpbuf_start, PAGESIZE); + size_t size = P2ROUNDUP(dumpbuf.cur - dumpbuf.start, PAGESIZE); + hrtime_t iotime; int err; - if (dumpvp_off + size > dumpvp_limit) { + if (dumpbuf.vp_off + size > dumpbuf.vp_limit) { dump_ioerr = ENOSPC; + dumpbuf.vp_off = dumpbuf.vp_limit; } else if (size != 0) { + iotime = gethrtime(); + dumpsync.iowait += iotime - dumpsync.iowaitts; if (panicstr) - err = VOP_DUMP(dumpvp, dumpbuf_start, - lbtodb(dumpvp_off), btod(size), NULL); + err = VOP_DUMP(dumpvp, dumpbuf.start, + lbtodb(dumpbuf.vp_off), btod(size), NULL); else - err = vn_rdwr(UIO_WRITE, dumpvp, dumpbuf_start, size, - dumpvp_off, UIO_SYSSPACE, 0, dumpvp_limit, + err = vn_rdwr(UIO_WRITE, dumpbuf.cdev_vp != NULL ? 
+ dumpbuf.cdev_vp : dumpvp, dumpbuf.start, size, + dumpbuf.vp_off, UIO_SYSSPACE, 0, dumpbuf.vp_limit, kcred, 0); if (err && dump_ioerr == 0) dump_ioerr = err; + dumpsync.iowaitts = gethrtime(); + dumpsync.iotime += dumpsync.iowaitts - iotime; + dumpsync.nwrite += size; + dumpbuf.vp_off += size; } - dumpvp_off += size; - dumpbuf_cur = dumpbuf_start; + dumpbuf.cur = dumpbuf.start; dump_timeleft = dump_timeout; - return (dumpvp_off); + return (dumpbuf.vp_off); } +/* maximize write speed by keeping seek offset aligned with size */ void dumpvp_write(const void *va, size_t size) { + size_t len, off, sz; + while (size != 0) { - size_t len = MIN(size, dumpbuf_end - dumpbuf_cur); + len = MIN(size, dumpbuf.end - dumpbuf.cur); if (len == 0) { - (void) dumpvp_flush(); + off = P2PHASE(dumpbuf.vp_off, dumpbuf.size); + if (off == 0 || !ISP2(dumpbuf.size)) { + (void) dumpvp_flush(); + } else { + sz = dumpbuf.size - off; + dumpbuf.cur = dumpbuf.start + sz; + (void) dumpvp_flush(); + ovbcopy(dumpbuf.start + sz, dumpbuf.start, off); + dumpbuf.cur += off; + } } else { - bcopy(va, dumpbuf_cur, len); + bcopy(va, dumpbuf.cur, len); va = (char *)va + len; - dumpbuf_cur += len; + dumpbuf.cur += len; size -= len; } } @@ -427,9 +1323,9 @@ dump_addpage(struct as *as, void *va, pfn_t pfn) pgcnt_t bitnum; if ((bitnum = dump_pfn_to_bitnum(pfn)) != (pgcnt_t)-1) { - if (!BT_TEST(dump_bitmap, bitnum)) { + if (!BT_TEST(dumpcfg.bitmap, bitnum)) { dumphdr->dump_npages++; - BT_SET(dump_bitmap, bitnum); + BT_SET(dumpcfg.bitmap, bitnum); } dumphdr->dump_nvtop++; mem_vtop.m_as = as; @@ -449,9 +1345,9 @@ dump_page(pfn_t pfn) pgcnt_t bitnum; if ((bitnum = dump_pfn_to_bitnum(pfn)) != (pgcnt_t)-1) { - if (!BT_TEST(dump_bitmap, bitnum)) { + if (!BT_TEST(dumpcfg.bitmap, bitnum)) { dumphdr->dump_npages++; - BT_SET(dump_bitmap, bitnum); + BT_SET(dumpcfg.bitmap, bitnum); } } dump_timeleft = dump_timeout; @@ -508,10 +1404,10 @@ dump_ereports(void) if (dumpvp == NULL || dumphdr == NULL) return; - dumpbuf_cur = dumpbuf_start; - dumpvp_limit = dumpvp_size - (DUMP_OFFSET + DUMP_LOGSIZE); - dumpvp_start = dumpvp_limit - DUMP_ERPTSIZE; - dumpvp_off = dumpvp_start; + dumpbuf.cur = dumpbuf.start; + dumpbuf.vp_limit = dumpvp_size - (DUMP_OFFSET + DUMP_LOGSIZE); + dumpvp_start = dumpbuf.vp_limit - DUMP_ERPTSIZE; + dumpbuf.vp_off = dumpvp_start; fm_ereport_dump(); if (panicstr) @@ -523,7 +1419,7 @@ dump_ereports(void) if (!panicstr) { (void) VOP_PUTPAGE(dumpvp, dumpvp_start, - (size_t)(dumpvp_off - dumpvp_start), + (size_t)(dumpbuf.vp_off - dumpvp_start), B_INVAL | B_FORCE, kcred, NULL); } } @@ -539,10 +1435,10 @@ dump_messages(void) if (dumpvp == NULL || dumphdr == NULL || log_consq == NULL) return; - dumpbuf_cur = dumpbuf_start; - dumpvp_limit = dumpvp_size - DUMP_OFFSET; - dumpvp_start = dumpvp_limit - DUMP_LOGSIZE; - dumpvp_off = dumpvp_start; + dumpbuf.cur = dumpbuf.start; + dumpbuf.vp_limit = dumpvp_size - DUMP_OFFSET; + dumpvp_start = dumpbuf.vp_limit - DUMP_LOGSIZE; + dumpbuf.vp_off = dumpvp_start; qlast = NULL; do { @@ -566,12 +1462,19 @@ dump_messages(void) (void) dumpvp_flush(); if (!panicstr) { (void) VOP_PUTPAGE(dumpvp, dumpvp_start, - (size_t)(dumpvp_off - dumpvp_start), + (size_t)(dumpbuf.vp_off - dumpvp_start), B_INVAL | B_FORCE, kcred, NULL); } } -static void +/* + * The following functions are called on multiple CPUs during dump. + * They must not use most kernel services, because all cross-calls are + * disabled during panic. Therefore, blocking locks and cache flushes + * will not work. 
+ */ + +static int dump_pagecopy(void *src, void *dst) { long *wsrc = (long *)src; @@ -582,15 +1485,8 @@ dump_pagecopy(void *src, void *dst) on_trap_data_t otd; if (on_trap(&otd, OT_DATA_EC)) { - if (ueoff == -1) { - uint64_t pa; - + if (ueoff == -1) ueoff = w * sizeof (long); - pa = ptob((uint64_t)hat_getpfnum(kas.a_hat, src)) - + ueoff; - cmn_err(CE_WARN, "memory error at PA 0x%08x.%08x", - (uint32_t)(pa >> 32), (uint32_t)pa); - } #ifdef _LP64 wdst[w++] = 0xbadecc00badecc; #else @@ -602,30 +1498,997 @@ dump_pagecopy(void *src, void *dst) w++; } no_trap(); + return (ueoff); +} + +static void +dumpsys_close_cq(cqueue_t *cq, int live) +{ + if (live) { + mutex_enter(&cq->mutex); + atomic_dec_uint(&cq->open); + cv_signal(&cq->cv); + mutex_exit(&cq->mutex); + } else { + atomic_dec_uint(&cq->open); + } +} + +static inline void +dumpsys_spinlock(lock_t *lp) +{ + uint_t backoff = 0; + int loop_count = 0; + + while (LOCK_HELD(lp) || !lock_spin_try(lp)) { + if (++loop_count >= ncpus) { + backoff = mutex_lock_backoff(0); + loop_count = 0; + } else { + backoff = mutex_lock_backoff(backoff); + } + mutex_lock_delay(backoff); + } +} + +static inline void +dumpsys_spinunlock(lock_t *lp) +{ + lock_clear(lp); +} + +static inline void +dumpsys_lock(cqueue_t *cq, int live) +{ + if (live) + mutex_enter(&cq->mutex); + else + dumpsys_spinlock(&cq->spinlock); +} + +static inline void +dumpsys_unlock(cqueue_t *cq, int live, int signal) +{ + if (live) { + if (signal) + cv_signal(&cq->cv); + mutex_exit(&cq->mutex); + } else { + dumpsys_spinunlock(&cq->spinlock); + } +} + +static void +dumpsys_wait_cq(cqueue_t *cq, int live) +{ + if (live) { + cv_wait(&cq->cv, &cq->mutex); + } else { + dumpsys_spinunlock(&cq->spinlock); + while (cq->open) + if (cq->first) + break; + dumpsys_spinlock(&cq->spinlock); + } +} + +static void +dumpsys_put_cq(cqueue_t *cq, cbuf_t *cp, int newstate, int live) +{ + if (cp == NULL) + return; + + dumpsys_lock(cq, live); + + if (cq->ts != 0) { + cq->empty += gethrtime() - cq->ts; + cq->ts = 0; + } + + cp->state = newstate; + cp->next = NULL; + if (cq->last == NULL) + cq->first = cp; + else + cq->last->next = cp; + cq->last = cp; + + dumpsys_unlock(cq, live, 1); +} + +static cbuf_t * +dumpsys_get_cq(cqueue_t *cq, int live) +{ + cbuf_t *cp; + hrtime_t now = gethrtime(); + + dumpsys_lock(cq, live); + + /* CONSTCOND */ + while (1) { + cp = (cbuf_t *)cq->first; + if (cp == NULL) { + if (cq->open == 0) + break; + dumpsys_wait_cq(cq, live); + continue; + } + cq->first = cp->next; + if (cq->first == NULL) { + cq->last = NULL; + cq->ts = now; + } + break; + } + + dumpsys_unlock(cq, live, cq->first != NULL || cq->open == 0); + return (cp); +} + +/* + * Send an error message to the console. If the main task is running + * just write the message via uprintf. If a helper is running the + * message has to be put on a queue for the main task. Setting fmt to + * NULL means flush the error message buffer. If fmt is not NULL, just + * add the text to the existing buffer. + */ +static void +dumpsys_errmsg(helper_t *hp, const char *fmt, ...) 
+{ + dumpsync_t *ds = hp->ds; + cbuf_t *cp = hp->cperr; + va_list adx; + + if (hp->helper == MAINHELPER) { + if (fmt != NULL) { + if (ds->neednl) { + uprintf("\n"); + ds->neednl = 0; + } + va_start(adx, fmt); + vuprintf(fmt, adx); + va_end(adx); + } + } else if (fmt == NULL) { + if (cp != NULL) { + CQ_PUT(mainq, cp, CBUF_ERRMSG); + hp->cperr = NULL; + } + } else { + if (hp->cperr == NULL) { + cp = CQ_GET(freebufq); + hp->cperr = cp; + cp->used = 0; + } + va_start(adx, fmt); + cp->used += vsnprintf(cp->buf + cp->used, cp->size - cp->used, + fmt, adx); + va_end(adx); + if ((cp->used + LOG_MSGSIZE) > cp->size) { + CQ_PUT(mainq, cp, CBUF_ERRMSG); + hp->cperr = NULL; + } + } } /* + * Write an output buffer to the dump file. If the main task is + * running just write the data. If a helper is running the output is + * placed on a queue for the main task. + */ +static void +dumpsys_swrite(helper_t *hp, cbuf_t *cp, size_t used) +{ + dumpsync_t *ds = hp->ds; + + if (hp->helper == MAINHELPER) { + HRSTART(ds->perpage, write); + dumpvp_write(cp->buf, used); + HRSTOP(ds->perpage, write); + CQ_PUT(freebufq, cp, CBUF_FREEBUF); + } else { + cp->used = used; + CQ_PUT(mainq, cp, CBUF_WRITE); + } +} + +/* + * Copy one page within the mapped range. The offset starts at 0 and + * is relative to the first pfn. cp->buf + cp->off is the address of + * the first pfn. If dump_pagecopy returns a UE offset, create an + * error message. Returns the offset to the next pfn in the range + * selected by the bitmap. + */ +static int +dumpsys_copy_page(helper_t *hp, int offset) +{ + cbuf_t *cp = hp->cpin; + int ueoff; + + ASSERT(cp->off + offset + PAGESIZE <= cp->size); + ASSERT(BT_TEST(dumpcfg.bitmap, cp->bitnum)); + + ueoff = dump_pagecopy(cp->buf + cp->off + offset, hp->page); + + /* ueoff is the offset in the page to a UE error */ + if (ueoff != -1) { + uint64_t pa = ptob(cp->pfn) + offset + ueoff; + + dumpsys_errmsg(hp, "memory error at PA 0x%08x.%08x\n", + (uint32_t)(pa >> 32), (uint32_t)pa); + } + + /* + * Advance bitnum and offset to the next input page for the + * next call to this function. + */ + offset += PAGESIZE; + cp->bitnum++; + while (cp->off + offset < cp->size) { + if (BT_TEST(dumpcfg.bitmap, cp->bitnum)) + break; + offset += PAGESIZE; + cp->bitnum++; + } + + return (offset); +} + +/* + * Read the helper queue, and copy one mapped page. Return 0 when + * done. Return 1 when a page has been copied into hp->page. + */ +static int +dumpsys_sread(helper_t *hp) +{ + dumpsync_t *ds = hp->ds; + + /* CONSTCOND */ + while (1) { + + /* Find the next input buffer. */ + if (hp->cpin == NULL) { + HRSTART(hp->perpage, inwait); + + /* CONSTCOND */ + while (1) { + hp->cpin = CQ_GET(helperq); + dump_timeleft = dump_timeout; + + /* + * NULL return means the helper queue + * is closed and empty. + */ + if (hp->cpin == NULL) + break; + + /* Have input, check for dump I/O error. */ + if (!dump_ioerr) + break; + + /* + * If an I/O error occurs, stay in the + * loop in order to empty the helper + * queue. Return the buffers to the + * main task to unmap and free it. + */ + hp->cpin->used = 0; + CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP); + } + HRSTOP(hp->perpage, inwait); + + /* Stop here when the helper queue is closed. */ + if (hp->cpin == NULL) + break; + + /* Set the offset=0 to get the first pfn. */ + hp->in = 0; + + /* Set the total processed to 0 */ + hp->used = 0; + } + + /* Process the next page. */ + if (hp->used < hp->cpin->used) { + + /* + * Get the next page from the input buffer and + * return a copy. 
+ */ + ASSERT(hp->in != -1); + HRSTART(hp->perpage, copy); + hp->in = dumpsys_copy_page(hp, hp->in); + hp->used += PAGESIZE; + HRSTOP(hp->perpage, copy); + break; + + } else { + + /* + * Done with the input. Flush the VM and + * return the buffer to the main task. + */ + if (panicstr && hp->helper != MAINHELPER) + hat_flush_range(kas.a_hat, + hp->cpin->buf, hp->cpin->size); + dumpsys_errmsg(hp, NULL); + CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP); + hp->cpin = NULL; + } + } + + return (hp->cpin != NULL); +} + +/* + * Compress size bytes starting at buf with bzip2 + * mode: + * BZ_RUN add one more compressed page + * BZ_FINISH no more input, flush the state + */ +static void +dumpsys_bzrun(helper_t *hp, void *buf, size_t size, int mode) +{ + dumpsync_t *ds = hp->ds; + const int CSIZE = sizeof (dumpcsize_t); + bz_stream *ps = &hp->bzstream; + int rc = 0; + uint32_t csize; + dumpcsize_t cs; + + /* Set input pointers to new input page */ + if (size > 0) { + ps->avail_in = size; + ps->next_in = buf; + } + + /* CONSTCOND */ + while (1) { + + /* Quit when all input has been consumed */ + if (ps->avail_in == 0 && mode == BZ_RUN) + break; + + /* Get a new output buffer */ + if (hp->cpout == NULL) { + HRSTART(hp->perpage, outwait); + hp->cpout = CQ_GET(freebufq); + HRSTOP(hp->perpage, outwait); + ps->avail_out = hp->cpout->size - CSIZE; + ps->next_out = hp->cpout->buf + CSIZE; + } + + /* Compress input, or finalize */ + HRSTART(hp->perpage, compress); + rc = BZ2_bzCompress(ps, mode); + HRSTOP(hp->perpage, compress); + + /* Check for error */ + if (mode == BZ_RUN && rc != BZ_RUN_OK) { + dumpsys_errmsg(hp, "%d: BZ_RUN error %s at page %lx\n", + hp->helper, BZ2_bzErrorString(rc), + hp->cpin->pagenum); + break; + } + + /* Write the buffer if it is full, or we are flushing */ + if (ps->avail_out == 0 || mode == BZ_FINISH) { + csize = hp->cpout->size - CSIZE - ps->avail_out; + cs = DUMP_SET_TAG(csize, hp->tag); + if (csize > 0) { + (void) memcpy(hp->cpout->buf, &cs, CSIZE); + dumpsys_swrite(hp, hp->cpout, csize + CSIZE); + hp->cpout = NULL; + } + } + + /* Check for final complete */ + if (mode == BZ_FINISH) { + if (rc == BZ_STREAM_END) + break; + if (rc != BZ_FINISH_OK) { + dumpsys_errmsg(hp, "%d: BZ_FINISH error %s\n", + hp->helper, BZ2_bzErrorString(rc)); + break; + } + } + } + + /* Cleanup state and buffers */ + if (mode == BZ_FINISH) { + + /* Reset state so that it is re-usable. 
*/ + (void) BZ2_bzCompressReset(&hp->bzstream); + + /* Give any unused output buffer to the main task */ + if (hp->cpout != NULL) { + hp->cpout->used = 0; + CQ_PUT(mainq, hp->cpout, CBUF_ERRMSG); + hp->cpout = NULL; + } + } +} + +static void +dumpsys_bz2compress(helper_t *hp) +{ + dumpsync_t *ds = hp->ds; + dumpstreamhdr_t sh; + + (void) strcpy(sh.stream_magic, DUMP_STREAM_MAGIC); + sh.stream_pagenum = (pgcnt_t)-1; + sh.stream_npages = 0; + hp->cpin = NULL; + hp->cpout = NULL; + hp->cperr = NULL; + hp->in = 0; + hp->out = 0; + hp->bzstream.avail_in = 0; + + /* Bump reference to mainq while we are running */ + CQ_OPEN(mainq); + + /* Get one page at a time */ + while (dumpsys_sread(hp)) { + if (sh.stream_pagenum != hp->cpin->pagenum) { + sh.stream_pagenum = hp->cpin->pagenum; + sh.stream_npages = btop(hp->cpin->used); + dumpsys_bzrun(hp, &sh, sizeof (sh), BZ_RUN); + } + dumpsys_bzrun(hp, hp->page, PAGESIZE, 0); + } + + /* Done with input, flush any partial buffer */ + if (sh.stream_pagenum != (pgcnt_t)-1) { + dumpsys_bzrun(hp, NULL, 0, BZ_FINISH); + dumpsys_errmsg(hp, NULL); + } + + ASSERT(hp->cpin == NULL && hp->cpout == NULL && hp->cperr == NULL); + + /* Decrement main queue count, we are done */ + CQ_CLOSE(mainq); +} + +/* + * Compress with lzjb + * write stream block if full or size==0 + * if csize==0 write stream header, else write <csize, data> + * size==0 is a call to flush a buffer + * hp->cpout is the buffer we are flushing or filling + * hp->out is the next index to fill data + * osize is either csize+data, or the size of a stream header + */ +static void +dumpsys_lzjbrun(helper_t *hp, size_t csize, void *buf, size_t size) +{ + dumpsync_t *ds = hp->ds; + const int CSIZE = sizeof (dumpcsize_t); + dumpcsize_t cs; + size_t osize = csize > 0 ? CSIZE + size : size; + + /* If flush, and there is no buffer, just return */ + if (size == 0 && hp->cpout == NULL) + return; + + /* If flush, or cpout is full, write it out */ + if (size == 0 || + hp->cpout != NULL && hp->out + osize > hp->cpout->size) { + + /* Set tag+size word at the front of the stream block. */ + cs = DUMP_SET_TAG(hp->out - CSIZE, hp->tag); + (void) memcpy(hp->cpout->buf, &cs, CSIZE); + + /* Write block to dump file. */ + dumpsys_swrite(hp, hp->cpout, hp->out); + + /* Clear pointer to indicate we need a new buffer */ + hp->cpout = NULL; + + /* flushing, we are done */ + if (size == 0) + return; + } + + /* Get an output buffer if we don't have one. */ + if (hp->cpout == NULL) { + HRSTART(hp->perpage, outwait); + hp->cpout = CQ_GET(freebufq); + HRSTOP(hp->perpage, outwait); + hp->out = CSIZE; + } + + /* Store csize word. This is the size of compressed data. */ + if (csize > 0) { + cs = DUMP_SET_TAG(csize, 0); + (void) memcpy(hp->cpout->buf + hp->out, &cs, CSIZE); + hp->out += CSIZE; + } + + /* Store the data.
*/ + (void) memcpy(hp->cpout->buf + hp->out, buf, size); + hp->out += size; +} + +static void +dumpsys_lzjbcompress(helper_t *hp) +{ + dumpsync_t *ds = hp->ds; + size_t csize; + dumpstreamhdr_t sh; + + (void) strcpy(sh.stream_magic, DUMP_STREAM_MAGIC); + sh.stream_pagenum = (pfn_t)-1; + sh.stream_npages = 0; + hp->cpin = NULL; + hp->cpout = NULL; + hp->cperr = NULL; + hp->in = 0; + hp->out = 0; + + /* Bump reference to mainq while we are running */ + CQ_OPEN(mainq); + + /* Get one page at a time */ + while (dumpsys_sread(hp)) { + + /* Create a stream header for each new input map */ + if (sh.stream_pagenum != hp->cpin->pagenum) { + sh.stream_pagenum = hp->cpin->pagenum; + sh.stream_npages = btop(hp->cpin->used); + dumpsys_lzjbrun(hp, 0, &sh, sizeof (sh)); + } + + /* Compress one page */ + HRSTART(hp->perpage, compress); + csize = compress(hp->page, hp->lzbuf, PAGESIZE); + HRSTOP(hp->perpage, compress); + + /* Add csize+data to output block */ + ASSERT(csize > 0 && csize <= PAGESIZE); + dumpsys_lzjbrun(hp, csize, hp->lzbuf, csize); + } + + /* Done with input, flush any partial buffer */ + if (sh.stream_pagenum != (pfn_t)-1) { + dumpsys_lzjbrun(hp, 0, NULL, 0); + dumpsys_errmsg(hp, NULL); + } + + ASSERT(hp->cpin == NULL && hp->cpout == NULL && hp->cperr == NULL); + + /* Decrement main queue count, we are done */ + CQ_CLOSE(mainq); +} + +/* + * Dump helper called from panic_idle() to compress pages. CPUs in + * this path must not call most kernel services. + * + * During panic, all but one of the CPUs is idle. These CPUs are used + * as helpers working in parallel to copy and compress memory + * pages. During a panic, however, these processors cannot call any + * kernel services. This is because mutexes become no-ops during + * panic, and, cross-call interrupts are inhibited. Therefore, during + * panic dump the helper CPUs communicate with the panic CPU using + * memory variables. All memory mapping and I/O is performed by the + * panic CPU. + */ +void +dumpsys_helper() +{ + dumpsys_spinlock(&dumpcfg.helper_lock); + if (dumpcfg.helpers_wanted) { + helper_t *hp, *hpend = &dumpcfg.helper[dumpcfg.nhelper]; + + for (hp = dumpcfg.helper; hp != hpend; hp++) { + if (hp->helper == FREEHELPER) { + hp->helper = CPU->cpu_id; + BT_SET(dumpcfg.helpermap, CPU->cpu_seqid); + + dumpsys_spinunlock(&dumpcfg.helper_lock); + + if (dumpcfg.clevel < DUMP_CLEVEL_BZIP2) + dumpsys_lzjbcompress(hp); + else + dumpsys_bz2compress(hp); + + hp->helper = DONEHELPER; + return; + } + } + } + dumpsys_spinunlock(&dumpcfg.helper_lock); +} + +/* + * Dump helper for live dumps. + * These run as a system task. 
+ */ +static void +dumpsys_live_helper(void *arg) +{ + helper_t *hp = arg; + + BT_ATOMIC_SET(dumpcfg.helpermap, CPU->cpu_seqid); + if (dumpcfg.clevel < DUMP_CLEVEL_BZIP2) + dumpsys_lzjbcompress(hp); + else + dumpsys_bz2compress(hp); +} + +/* + * Compress one page with lzjb (single threaded case) + */ +static void +dumpsys_lzjb_page(helper_t *hp, cbuf_t *cp) +{ + dumpsync_t *ds = hp->ds; + uint32_t csize; + + hp->helper = MAINHELPER; + hp->in = 0; + hp->used = 0; + hp->cpin = cp; + while (hp->used < cp->used) { + HRSTART(hp->perpage, copy); + hp->in = dumpsys_copy_page(hp, hp->in); + hp->used += PAGESIZE; + HRSTOP(hp->perpage, copy); + + HRSTART(hp->perpage, compress); + csize = compress(hp->page, hp->lzbuf, PAGESIZE); + HRSTOP(hp->perpage, compress); + + HRSTART(hp->perpage, write); + dumpvp_write(&csize, sizeof (csize)); + dumpvp_write(hp->lzbuf, csize); + HRSTOP(hp->perpage, write); + } + CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP); + hp->cpin = NULL; +} + +/* + * Main task to dump pages. This is called on the dump CPU. + */ +static void +dumpsys_main_task(void *arg) +{ + dumpsync_t *ds = arg; + pgcnt_t pagenum = 0, bitnum = 0, hibitnum; + dumpmlw_t mlw; + cbuf_t *cp; + pgcnt_t baseoff, pfnoff; + pfn_t base, pfn; + int sec; + + dump_init_memlist_walker(&mlw); + + /* CONSTCOND */ + while (1) { + + if (ds->percent > ds->percent_done) { + ds->percent_done = ds->percent; + sec = (gethrtime() - ds->start) / 1000 / 1000 / 1000; + uprintf("^\r%2d:%02d %3d%% done", + sec / 60, sec % 60, ds->percent); + ds->neednl = 1; + } + + while (CQ_IS_EMPTY(mainq) && !CQ_IS_EMPTY(writerq)) { + + /* the writerq never blocks */ + cp = CQ_GET(writerq); + if (cp == NULL) + break; + + dump_timeleft = dump_timeout; + + HRSTART(ds->perpage, write); + dumpvp_write(cp->buf, cp->used); + HRSTOP(ds->perpage, write); + + CQ_PUT(freebufq, cp, CBUF_FREEBUF); + } + + /* + * Wait here for some buffers to process. Returns NULL + * when all helpers have terminated and all buffers + * have been processed. + */ + cp = CQ_GET(mainq); + + if (cp == NULL) { + + /* Drain the write queue. */ + if (!CQ_IS_EMPTY(writerq)) + continue; + + /* Main task exits here. */ + break; + } + + dump_timeleft = dump_timeout; + + switch (cp->state) { + + case CBUF_FREEMAP: + + /* + * Note that we drop CBUF_FREEMAP buffers on + * the floor (they will not be on any cqueue) + * when we no longer need them. + */ + if (bitnum >= dumpcfg.bitmapsize) + break; + + if (dump_ioerr) { + bitnum = dumpcfg.bitmapsize; + CQ_CLOSE(helperq); + break; + } + + HRSTART(ds->perpage, bitmap); + for (; bitnum < dumpcfg.bitmapsize; bitnum++) + if (BT_TEST(dumpcfg.bitmap, bitnum)) + break; + HRSTOP(ds->perpage, bitmap); + dump_timeleft = dump_timeout; + + if (bitnum >= dumpcfg.bitmapsize) { + CQ_CLOSE(helperq); + break; + } + + /* + * Try to map CBUF_MAPSIZE ranges. Can't + * assume that memory segment size is a + * multiple of CBUF_MAPSIZE. Can't assume that + * the segment starts on a CBUF_MAPSIZE + * boundary. 
+ */ + pfn = dump_bitnum_to_pfn(bitnum, &mlw); + ASSERT(pfn != PFN_INVALID); + ASSERT(bitnum + mlw.mpleft <= dumpcfg.bitmapsize); + + base = P2ALIGN(pfn, CBUF_MAPNP); + if (base < mlw.mpaddr) { + base = mlw.mpaddr; + baseoff = P2PHASE(base, CBUF_MAPNP); + } else { + baseoff = 0; + } + + pfnoff = pfn - base; + if (pfnoff + mlw.mpleft < CBUF_MAPNP) { + hibitnum = bitnum + mlw.mpleft; + cp->size = ptob(pfnoff + mlw.mpleft); + } else { + hibitnum = bitnum - pfnoff + CBUF_MAPNP - + baseoff; + cp->size = CBUF_MAPSIZE - ptob(baseoff); + } + + cp->pfn = pfn; + cp->bitnum = bitnum++; + cp->pagenum = pagenum++; + cp->off = ptob(pfnoff); + + for (; bitnum < hibitnum; bitnum++) + if (BT_TEST(dumpcfg.bitmap, bitnum)) + pagenum++; + + dump_timeleft = dump_timeout; + cp->used = ptob(pagenum - cp->pagenum); + + HRSTART(ds->perpage, map); + hat_devload(kas.a_hat, cp->buf, cp->size, base, + PROT_READ, HAT_LOAD_NOCONSIST); + HRSTOP(ds->perpage, map); + + ds->pages_mapped += btop(cp->size); + ds->pages_used += pagenum - cp->pagenum; + + CQ_OPEN(mainq); + + /* + * If there are no helpers the main task does + * non-streams lzjb compress. + */ + if (dumpcfg.clevel == 0) { + dumpsys_lzjb_page(dumpcfg.helper, cp); + break; + } + + /* pass mapped pages to a helper */ + CQ_PUT(helperq, cp, CBUF_INREADY); + + /* the last page was done */ + if (bitnum >= dumpcfg.bitmapsize) + CQ_CLOSE(helperq); + + break; + + case CBUF_USEDMAP: + + ds->npages += btop(cp->used); + + HRSTART(ds->perpage, unmap); + hat_unload(kas.a_hat, cp->buf, cp->size, HAT_UNLOAD); + HRSTOP(ds->perpage, unmap); + + if (bitnum < dumpcfg.bitmapsize) + CQ_PUT(mainq, cp, CBUF_FREEMAP); + CQ_CLOSE(mainq); + + ASSERT(ds->npages <= dumphdr->dump_npages); + ds->percent = ds->npages * 100LL / dumphdr->dump_npages; + break; + + case CBUF_WRITE: + + CQ_PUT(writerq, cp, CBUF_WRITE); + break; + + case CBUF_ERRMSG: + + if (cp->used > 0) { + cp->buf[cp->size - 2] = '\n'; + cp->buf[cp->size - 1] = '\0'; + if (ds->neednl) { + uprintf("\n%s", cp->buf); + ds->neednl = 0; + } else { + uprintf("%s", cp->buf); + } + } + CQ_PUT(freebufq, cp, CBUF_FREEBUF); + break; + + default: + uprintf("dump: unexpected buffer state %d, " + "buffer will be lost\n", cp->state); + break; + + } /* end switch */ + + } /* end while(1) */ +} + +#ifdef COLLECT_METRICS +size_t +dumpsys_metrics(dumpsync_t *ds, char *buf, size_t size) +{ + dumpcfg_t *cfg = &dumpcfg; + int myid = CPU->cpu_seqid; + int i, compress_ratio; + int sec, iorate; + helper_t *hp, *hpend = &cfg->helper[cfg->nhelper]; + char *e = buf + size; + char *p = buf; + + sec = ds->elapsed / (1000 * 1000 * 1000ULL); + if (sec < 1) + sec = 1; + + if (ds->iotime < 1) + ds->iotime = 1; + iorate = (ds->nwrite * 100000ULL) / ds->iotime; + + compress_ratio = 100LL * ds->npages / btopr(ds->nwrite + 1); + +#define P(...) (p += p < e ? 
snprintf(p, e - p, __VA_ARGS__) : 0) + + P("Master cpu_seqid,%d\n", CPU->cpu_seqid); + P("Master cpu_id,%d\n", CPU->cpu_id); + P("dump_flags,0x%x\n", dumphdr->dump_flags); + P("dump_ioerr,%d\n", dump_ioerr); + + P("Helpers:\n"); + for (i = 0; i < ncpus; i++) { + if ((i & 15) == 0) + P(",,%03d,", i); + if (i == myid) + P(" M"); + else if (BT_TEST(cfg->helpermap, i)) + P("%4d", cpu_seq[i]->cpu_id); + else + P(" *"); + if ((i & 15) == 15) + P("\n"); + } + + P("ncbuf_used,%d\n", cfg->ncbuf_used); + P("ncmap,%d\n", cfg->ncmap); + + P("Found %ldM ranges,%ld\n", (CBUF_MAPSIZE / DUMP_1MB), cfg->found4m); + P("Found small pages,%ld\n", cfg->foundsm); + + P("Compression level,%d\n", cfg->clevel); + P("Compression type,%s %s\n", cfg->clevel == 0 ? "serial" : "parallel", + cfg->clevel >= DUMP_CLEVEL_BZIP2 ? "bzip2" : "lzjb"); + P("Compression ratio,%d.%02d\n", compress_ratio / 100, compress_ratio % + 100); + P("nhelper_used,%d\n", cfg->nhelper_used); + + P("Dump I/O rate MBS,%d.%02d\n", iorate / 100, iorate % 100); + P("..total bytes,%lld\n", (u_longlong_t)ds->nwrite); + P("..total nsec,%lld\n", (u_longlong_t)ds->iotime); + P("dumpbuf.iosize,%ld\n", dumpbuf.iosize); + P("dumpbuf.size,%ld\n", dumpbuf.size); + + P("Dump pages/sec,%llu\n", (u_longlong_t)ds->npages / sec); + P("Dump pages,%llu\n", (u_longlong_t)ds->npages); + P("Dump time,%d\n", sec); + + if (ds->pages_mapped > 0) + P("per-cent map utilization,%d\n", (int)((100 * ds->pages_used) + / ds->pages_mapped)); + + P("\nPer-page metrics:\n"); + if (ds->npages > 0) { + for (hp = cfg->helper; hp != hpend; hp++) { +#define PERPAGE(x) ds->perpage.x += hp->perpage.x; + PERPAGES; +#undef PERPAGE + } +#define PERPAGE(x) \ + P("%s nsec/page,%d\n", #x, (int)(ds->perpage.x / ds->npages)); + PERPAGES; +#undef PERPAGE + P("freebufq.empty,%d\n", (int)(ds->freebufq.empty / + ds->npages)); + P("helperq.empty,%d\n", (int)(ds->helperq.empty / + ds->npages)); + P("writerq.empty,%d\n", (int)(ds->writerq.empty / + ds->npages)); + P("mainq.empty,%d\n", (int)(ds->mainq.empty / ds->npages)); + + P("I/O wait nsec/page,%llu\n", (u_longlong_t)(ds->iowait / + ds->npages)); + } +#undef P + if (p < e) + bzero(p, e - p); + return (p - buf); +} +#endif /* COLLECT_METRICS */ + +/* * Dump the system. */ void dumpsys(void) { + dumpsync_t *ds = &dumpsync; + taskq_t *livetaskq = NULL; pfn_t pfn; pgcnt_t bitnum; - int npages = 0; - int percent_done = 0; - uint32_t csize; - u_offset_t total_csize = 0; - int compress_ratio; proc_t *p; + helper_t *hp, *hpend = &dumpcfg.helper[dumpcfg.nhelper]; + cbuf_t *cp; pid_t npids, pidx; char *content; + int save_dump_clevel; + dumpmlw_t mlw; + dumpcsize_t datatag; + dumpdatahdr_t datahdr; if (dumpvp == NULL || dumphdr == NULL) { uprintf("skipping system dump - no dump device configured\n"); + if (panicstr) { + dumpcfg.helpers_wanted = 0; + dumpsys_spinunlock(&dumpcfg.helper_lock); + } return; } - dumpbuf_cur = dumpbuf_start; + dumpbuf.cur = dumpbuf.start; + + /* clear the sync variables */ + ASSERT(dumpcfg.nhelper > 0); + bzero(ds, sizeof (*ds)); + ds->dumpcpu = CPU->cpu_id; /* * Calculate the starting block for dump. 
If we're dumping on a @@ -637,11 +2500,11 @@ dumpsys(void) else dumphdr->dump_start = DUMP_OFFSET; - dumphdr->dump_flags = DF_VALID | DF_COMPLETE | DF_LIVE; + dumphdr->dump_flags = DF_VALID | DF_COMPLETE | DF_LIVE | DF_COMPRESSED; dumphdr->dump_crashtime = gethrestime_sec(); dumphdr->dump_npages = 0; dumphdr->dump_nvtop = 0; - bzero(dump_bitmap, BT_SIZEOFMAP(dump_bitmapsize)); + bzero(dumpcfg.bitmap, BT_SIZEOFMAP(dumpcfg.bitmapsize)); dump_timeleft = dump_timeout; if (panicstr) { @@ -650,6 +2513,7 @@ dumpsys(void) (void) VOP_DUMPCTL(dumpvp, DUMP_ALLOC, NULL, NULL); (void) vsnprintf(dumphdr->dump_panicstring, DUMP_PANICSIZE, panicstr, panicargs); + } if (dump_conflags & DUMP_ALL) @@ -661,17 +2525,45 @@ dumpsys(void) uprintf("dumping to %s, offset %lld, content: %s\n", dumppath, dumphdr->dump_start, content); + /* Make sure nodename is current */ + bcopy(utsname.nodename, dumphdr->dump_utsname.nodename, SYS_NMLN); + + /* + * If this is a live dump, try to open a VCHR vnode for better + * performance. We must take care to flush the buffer cache + * first. + */ + if (!panicstr) { + vnode_t *cdev_vp, *cmn_cdev_vp; + + ASSERT(dumpbuf.cdev_vp == NULL); + cdev_vp = makespecvp(VTOS(dumpvp)->s_dev, VCHR); + if (cdev_vp != NULL) { + cmn_cdev_vp = common_specvp(cdev_vp); + if (VOP_OPEN(&cmn_cdev_vp, FREAD | FWRITE, kcred, NULL) + == 0) { + if (vn_has_cached_data(dumpvp)) + (void) pvn_vplist_dirty(dumpvp, 0, NULL, + B_INVAL | B_TRUNC, kcred); + dumpbuf.cdev_vp = cmn_cdev_vp; + } else { + VN_RELE(cdev_vp); + } + } + } + /* * Leave room for the message and ereport save areas and terminal dump * header. */ - dumpvp_limit = dumpvp_size - DUMP_LOGSIZE - DUMP_OFFSET - DUMP_ERPTSIZE; + dumpbuf.vp_limit = dumpvp_size - DUMP_LOGSIZE - DUMP_OFFSET - + DUMP_ERPTSIZE; /* * Write out the symbol table. It's no longer compressed, * so its 'size' and 'csize' are equal. */ - dumpvp_off = dumphdr->dump_ksyms = dumphdr->dump_start + PAGESIZE; + dumpbuf.vp_off = dumphdr->dump_ksyms = dumphdr->dump_start + PAGESIZE; dumphdr->dump_ksyms_size = dumphdr->dump_ksyms_csize = ksyms_snapshot(dumpvp_ksyms_write, NULL, LONG_MAX); @@ -692,18 +2584,18 @@ dumpsys(void) mutex_enter(&pidlock); for (npids = 0, p = practive; p != NULL; p = p->p_next) - dump_pids[npids++] = p->p_pid; + dumpcfg.pids[npids++] = p->p_pid; mutex_exit(&pidlock); for (pidx = 0; pidx < npids; pidx++) - (void) dump_process(dump_pids[pidx]); + (void) dump_process(dumpcfg.pids[pidx]); - for (bitnum = 0; bitnum < dump_bitmapsize; bitnum++) { + for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum++) { dump_timeleft = dump_timeout; - BT_SET(dump_bitmap, bitnum); + BT_SET(dumpcfg.bitmap, bitnum); } - dumphdr->dump_npages = dump_bitmapsize; + dumphdr->dump_npages = dumpcfg.bitmapsize; dumphdr->dump_flags |= DF_ALL; } else if (dump_conflags & DUMP_CURPROC) { @@ -718,14 +2610,14 @@ dumpsys(void) if (panic_thread != NULL && panic_thread->t_procp != NULL && panic_thread->t_procp != &p0) { - dump_pids[npids++] = + dumpcfg.pids[npids++] = panic_thread->t_procp->p_pid; } } else { - dump_pids[npids++] = curthread->t_procp->p_pid; + dumpcfg.pids[npids++] = curthread->t_procp->p_pid; } - if (npids && dump_process(dump_pids[0]) == 0) + if (npids && dump_process(dumpcfg.pids[0]) == 0) dumphdr->dump_flags |= DF_CURPROC; else dumphdr->dump_flags |= DF_KERNEL; @@ -740,11 +2632,12 @@ dumpsys(void) * Write out the pfn table. 
*/ dumphdr->dump_pfn = dumpvp_flush(); - for (bitnum = 0; bitnum < dump_bitmapsize; bitnum++) { + dump_init_memlist_walker(&mlw); + for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum++) { dump_timeleft = dump_timeout; - if (!BT_TEST(dump_bitmap, bitnum)) + if (!BT_TEST(dumpcfg.bitmap, bitnum)) continue; - pfn = dump_bitnum_to_pfn(bitnum); + pfn = dump_bitnum_to_pfn(bitnum, &mlw); ASSERT(pfn != PFN_INVALID); dumpvp_write(&pfn, sizeof (pfn_t)); } @@ -752,67 +2645,144 @@ dumpsys(void) /* * Write out all the pages. + * Map pages, copy them handling UEs, compress, and write them out. + * Cooperate with any helpers running on CPUs in panic_idle(). */ dumphdr->dump_data = dumpvp_flush(); - for (bitnum = 0; bitnum < dump_bitmapsize; bitnum++) { - dump_timeleft = dump_timeout; - if (!BT_TEST(dump_bitmap, bitnum)) + + bzero(dumpcfg.helpermap, BT_SIZEOFMAP(NCPU)); + ds->live = dumpcfg.clevel > 0 && + (dumphdr->dump_flags & DF_LIVE) != 0; + + save_dump_clevel = dumpcfg.clevel; + if (panicstr) + dumpsys_get_maxmem(); + else if (dumpcfg.clevel >= DUMP_CLEVEL_BZIP2) + dumpcfg.clevel = DUMP_CLEVEL_LZJB; + + dumpcfg.nhelper_used = 0; + for (hp = dumpcfg.helper; hp != hpend; hp++) { + if (hp->page == NULL) { + hp->helper = DONEHELPER; continue; - pfn = dump_bitnum_to_pfn(bitnum); - ASSERT(pfn != PFN_INVALID); + } + ++dumpcfg.nhelper_used; + hp->helper = FREEHELPER; + hp->taskqid = NULL; + hp->ds = ds; + bzero(&hp->perpage, sizeof (hp->perpage)); + if (dumpcfg.clevel >= DUMP_CLEVEL_BZIP2) + (void) BZ2_bzCompressReset(&hp->bzstream); + } - /* - * Map in page frame 'pfn', scan it for UE's while copying - * the data to dump_uebuf, unmap it, compress dump_uebuf into - * dump_cbuf, and write out dump_cbuf. The UE check ensures - * that we don't lose the whole dump because of a latent UE. 
- */ - hat_devload(kas.a_hat, dump_cmap, PAGESIZE, pfn, PROT_READ, - HAT_LOAD_NOCONSIST); - dump_pagecopy(dump_cmap, dump_uebuf); - hat_unload(kas.a_hat, dump_cmap, PAGESIZE, HAT_UNLOAD); - csize = (uint32_t)compress(dump_uebuf, dump_cbuf, PAGESIZE); - dumpvp_write(&csize, sizeof (uint32_t)); - dumpvp_write(dump_cbuf, csize); - if (dump_ioerr) { - dumphdr->dump_flags &= ~DF_COMPLETE; - dumphdr->dump_npages = npages; - break; + CQ_OPEN(freebufq); + CQ_OPEN(helperq); + + dumpcfg.ncbuf_used = 0; + for (cp = dumpcfg.cbuf; cp != &dumpcfg.cbuf[dumpcfg.ncbuf]; cp++) { + if (cp->buf != NULL) { + CQ_PUT(freebufq, cp, CBUF_FREEBUF); + ++dumpcfg.ncbuf_used; } - total_csize += csize; - if (++npages * 100LL / dumphdr->dump_npages > percent_done) { - uprintf("^\r%3d%% done", ++percent_done); - if (!panicstr) - delay(1); /* let the output be sent */ + } + + for (cp = dumpcfg.cmap; cp != &dumpcfg.cmap[dumpcfg.ncmap]; cp++) + CQ_PUT(mainq, cp, CBUF_FREEMAP); + + ds->start = gethrtime(); + ds->iowaitts = ds->start; + + /* start helpers */ + if (ds->live) { + int n = dumpcfg.nhelper_used; + int pri = MINCLSYSPRI - 25; + + livetaskq = taskq_create("LiveDump", n, pri, n, n, + TASKQ_PREPOPULATE); + for (hp = dumpcfg.helper; hp != hpend; hp++) { + if (hp->page == NULL) + continue; + hp->helper = hp - dumpcfg.helper; + hp->taskqid = taskq_dispatch(livetaskq, + dumpsys_live_helper, (void *)hp, TQ_NOSLEEP); } + + } else { + dumpcfg.helpers_wanted = dumpcfg.clevel > 0; + dumpsys_spinunlock(&dumpcfg.helper_lock); } - dumphdr->dump_npages += dump_plat_data(dump_cbuf); - (void) dumpvp_flush(); + /* run main task */ + dumpsys_main_task(ds); + + ds->elapsed = gethrtime() - ds->start; + if (ds->elapsed < 1) + ds->elapsed = 1; + + if (livetaskq != NULL) + taskq_destroy(livetaskq); + + if (ds->neednl) { + uprintf("\n"); + ds->neednl = 0; + } + + /* record actual pages dumped */ + dumphdr->dump_npages = ds->npages; + + /* platform-specific data */ + dumphdr->dump_npages += dump_plat_data(dumpcfg.cbuf[0].buf); + + /* note any errors by clearing DF_COMPLETE */ + if (dump_ioerr || ds->npages < dumphdr->dump_npages) + dumphdr->dump_flags &= ~DF_COMPLETE; + + /* end of stream blocks */ + datatag = 0; + dumpvp_write(&datatag, sizeof (datatag)); + + /* compression info in data header */ + bzero(&datahdr, sizeof (datahdr)); + datahdr.dump_datahdr_magic = DUMP_DATAHDR_MAGIC; + datahdr.dump_datahdr_version = DUMP_DATAHDR_VERSION; + datahdr.dump_maxcsize = CBUF_SIZE; + datahdr.dump_maxrange = CBUF_MAPSIZE / PAGESIZE; + datahdr.dump_nstreams = dumpcfg.nhelper_used; + datahdr.dump_clevel = dumpcfg.clevel; +#ifdef COLLECT_METRICS + if (dump_metrics_on) + datahdr.dump_metrics = dumpsys_metrics(ds, dumpcfg.cbuf[0].buf, + MIN(dumpcfg.cbuf[0].size, DUMP_OFFSET - sizeof (dumphdr_t) - + sizeof (dumpdatahdr_t))); +#endif + datahdr.dump_data_csize = dumpvp_flush() - dumphdr->dump_data; /* * Write out the initial and terminal dump headers. 
*/ - dumpvp_off = dumphdr->dump_start; + dumpbuf.vp_off = dumphdr->dump_start; dumpvp_write(dumphdr, sizeof (dumphdr_t)); (void) dumpvp_flush(); - dumpvp_limit = dumpvp_size; - dumpvp_off = dumpvp_limit - DUMP_OFFSET; + dumpbuf.vp_limit = dumpvp_size; + dumpbuf.vp_off = dumpbuf.vp_limit - DUMP_OFFSET; dumpvp_write(dumphdr, sizeof (dumphdr_t)); - (void) dumpvp_flush(); + dumpvp_write(&datahdr, sizeof (dumpdatahdr_t)); + dumpvp_write(dumpcfg.cbuf[0].buf, datahdr.dump_metrics); - compress_ratio = (int)(100LL * npages / (btopr(total_csize + 1))); + (void) dumpvp_flush(); - uprintf("\r%3d%% done: %d pages dumped, compression ratio %d.%02d, ", - percent_done, npages, compress_ratio / 100, compress_ratio % 100); + uprintf("\r%3d%% done: %llu pages dumped, ", + ds->percent_done, (u_longlong_t)ds->npages); if (dump_ioerr == 0) { uprintf("dump succeeded\n"); } else { uprintf("dump failed: error %d\n", dump_ioerr); - if (panicstr && dumpfaildebug) +#ifdef DEBUG + if (panicstr) debug_enter("dump failed"); +#endif } /* @@ -827,6 +2797,19 @@ dumpsys(void) delay(2 * hz); /* let people see the 'done' message */ dump_timeleft = 0; dump_ioerr = 0; + + /* restore settings after live dump completes */ + if (!panicstr) { + dumpcfg.clevel = save_dump_clevel; + + /* release any VCHR open of the dump device */ + if (dumpbuf.cdev_vp != NULL) { + (void) VOP_CLOSE(dumpbuf.cdev_vp, FREAD | FWRITE, 1, 0, + kcred, NULL); + VN_RELE(dumpbuf.cdev_vp); + dumpbuf.cdev_vp = NULL; + } + } } /* @@ -839,6 +2822,7 @@ dump_resize() mutex_enter(&dump_lock); dumphdr_init(); dumpbuf_resize(); + dump_update_clevel(); mutex_exit(&dump_lock); } diff --git a/usr/src/uts/common/sys/dumphdr.h b/usr/src/uts/common/sys/dumphdr.h index 72c6e41c71..1dd57a0cec 100644 --- a/usr/src/uts/common/sys/dumphdr.h +++ b/usr/src/uts/common/sys/dumphdr.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_DUMPHDR_H #define _SYS_DUMPHDR_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/param.h> #include <sys/utsname.h> @@ -87,6 +85,7 @@ typedef struct dumphdr { #define DF_VALID 0x00000001 /* Dump is valid (savecore clears) */ #define DF_COMPLETE 0x00000002 /* All pages present as configured */ #define DF_LIVE 0x00000004 /* Dump was taken on a live system */ +#define DF_COMPRESSED 0x00000008 /* Dump is compressed */ #define DF_KERNEL 0x00010000 /* Contains kernel pages only */ #define DF_ALL 0x00020000 /* Contains all pages */ #define DF_CURPROC 0x00040000 /* Contains kernel + cur proc pages */ @@ -110,6 +109,58 @@ typedef struct dump_map { ((((uintptr_t)(as) >> 3) + ((va) >> (dhp)->dump_pageshift)) & \ (dhp)->dump_hashmask) +/* + * Encoding of the csize word used to provide meta information + * between dumpsys and savecore. 
+ * + * tag size + * 1-4095 1..dump_maxcsize stream block + * 0 1..pagesize one lzjb page + * 0 0 marks end of data + */ +typedef uint32_t dumpcsize_t; + +#define DUMP_MAX_TAG (0xfffU) +#define DUMP_MAX_CSIZE (0xfffffU) +#define DUMP_SET_TAG(w, v) (((w) & DUMP_MAX_CSIZE) | ((v) << 20)) +#define DUMP_GET_TAG(w) (((w) >> 20) & DUMP_MAX_TAG) +#define DUMP_SET_CSIZE(w, v) \ + (((w) & (DUMP_MAX_TAG << 20)) | ((v) & DUMP_MAX_CSIZE)) +#define DUMP_GET_CSIZE(w) ((w) & DUMP_MAX_CSIZE) + +typedef struct dumpstreamhdr { + char stream_magic[8]; /* "StrmHdr" */ + pgcnt_t stream_pagenum; /* starting pfn */ + pgcnt_t stream_npages; /* uncompressed size */ +} dumpstreamhdr_t; + +#define DUMP_STREAM_MAGIC "StrmHdr" + +/* The number of helpers is limited by the number of stream tags. */ +#define DUMP_MAX_NHELPER DUMP_MAX_TAG + +/* + * The dump data header is placed after the dumphdr in the compressed + * image. It is not needed after savecore runs and the data pages have + * been decompressed. + */ +typedef struct dumpdatahdr { + uint32_t dump_datahdr_magic; /* data header presence */ + uint32_t dump_datahdr_version; /* data header version */ + uint64_t dump_data_csize; /* compressed data size */ + uint32_t dump_maxcsize; /* compressed data max block size */ + uint32_t dump_maxrange; /* max number of pages per range */ + uint16_t dump_nstreams; /* number of compression streams */ + uint16_t dump_clevel; /* compression level (0-9) */ + uint32_t dump_metrics; /* size of metrics data */ +} dumpdatahdr_t; + +#define DUMP_DATAHDR_MAGIC ('d' << 24 | 'h' << 16 | 'd' << 8 | 'r') + +#define DUMP_DATAHDR_VERSION 1 +#define DUMP_CLEVEL_LZJB 1 /* parallel lzjb compression */ +#define DUMP_CLEVEL_BZIP2 2 /* parallel bzip2 level 1 */ + #ifdef _KERNEL extern kmutex_t dump_lock; @@ -131,6 +182,7 @@ extern void dump_resize(void); extern void dump_page(pfn_t); extern void dump_addpage(struct as *, void *, pfn_t); extern void dumpsys(void); +extern void dumpsys_helper(void); extern void dump_messages(void); extern void dump_ereports(void); extern void dumpvp_write(const void *, size_t); @@ -139,6 +191,29 @@ extern int dump_plat_addr(void); extern void dump_plat_pfn(void); extern int dump_plat_data(void *); +/* + * Define a CPU count threshold that determines when to employ + * bzip2. The values are defined per-platform in dump_plat_mincpu, and + * may be changed with /etc/system. The value 0 disables parallelism, + * and the old format dump is produced. + */ +extern uint_t dump_plat_mincpu; + +#define DUMP_PLAT_SUN4U_MINCPU 51 +#define DUMP_PLAT_SUN4U_OPL_MINCPU 8 +#define DUMP_PLAT_SUN4V_MINCPU 128 +#define DUMP_PLAT_X86_64_MINCPU 11 +#define DUMP_PLAT_X86_32_MINCPU 0 + +/* + * Pages may be stolen at dump time. Prevent the pages from ever being + * allocated while dump is running. + */ +#define IS_DUMP_PAGE(pp) (dump_check_used && dump_test_used((pp)->p_pagenum)) + +extern int dump_test_used(pfn_t); +extern int dump_check_used; + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/vm/hat.h b/usr/src/uts/common/vm/hat.h index b966acf7fc..bed40f63d9 100644 --- a/usr/src/uts/common/vm/hat.h +++ b/usr/src/uts/common/vm/hat.h @@ -200,6 +200,9 @@ void hat_thread_exit(kthread_t *); * given to the specified virtual protection. If vprot is ~PROT_WRITE, * then remove write permission, leaving the other permissions * unchanged. If vprot is ~PROT_USER, remove user permissions. + * + * void hat_flush_range(hat, addr, size) + * Invalidate a virtual address translation for the local CPU. 
*/ void hat_memload(struct hat *, caddr_t, struct page *, uint_t, uint_t); @@ -218,6 +221,7 @@ void hat_unlock_region(struct hat *, caddr_t, size_t, hat_region_cookie_t); void hat_unload(struct hat *, caddr_t, size_t, uint_t); void hat_unload_callback(struct hat *, caddr_t, size_t, uint_t, hat_callback_t *); +void hat_flush_range(struct hat *, caddr_t, size_t); void hat_sync(struct hat *, caddr_t, size_t, uint_t); void hat_map(struct hat *, caddr_t, size_t, uint_t); void hat_setattr(struct hat *, caddr_t, size_t, uint_t); diff --git a/usr/src/uts/common/vm/vm_page.c b/usr/src/uts/common/vm/vm_page.c index a6af733be8..43f153f19f 100644 --- a/usr/src/uts/common/vm/vm_page.c +++ b/usr/src/uts/common/vm/vm_page.c @@ -4267,6 +4267,8 @@ retry: return (pp); } +#define SYNC_PROGRESS_NPAGES 1000 + /* * Returns a count of dirty pages that are in the process * of being written out. If 'cleanit' is set, try to push the page. @@ -4277,12 +4279,22 @@ page_busy(int cleanit) page_t *page0 = page_first(); page_t *pp = page0; pgcnt_t nppbusy = 0; + int counter = 0; u_offset_t off; do { vnode_t *vp = pp->p_vnode; /* + * Reset the sync timeout. The page list is very long + * on large memory systems. + */ + if (++counter > SYNC_PROGRESS_NPAGES) { + counter = 0; + vfs_syncprogress(); + } + + /* * A page is a candidate for syncing if it is: * * (a) On neither the freelist nor the cachelist @@ -4299,7 +4311,6 @@ page_busy(int cleanit) hat_ismod(pp) && !IS_SWAPVP(vp) && vp->v_vfsp != NULL && vfs_can_sync(vp->v_vfsp)) { nppbusy++; - vfs_syncprogress(); if (!cleanit) continue; @@ -4322,6 +4333,7 @@ page_busy(int cleanit) } } while ((pp = page_next(pp)) != page0); + vfs_syncprogress(); return (nppbusy); } diff --git a/usr/src/uts/common/vm/vm_pagelist.c b/usr/src/uts/common/vm/vm_pagelist.c index d31ca1bda9..59b4e079c5 100644 --- a/usr/src/uts/common/vm/vm_pagelist.c +++ b/usr/src/uts/common/vm/vm_pagelist.c @@ -59,6 +59,7 @@ #include <sys/callb.h> #include <sys/mem_cage.h> #include <sys/sdt.h> +#include <sys/dumphdr.h> extern uint_t vac_colors; @@ -2951,7 +2952,8 @@ try_again: * page of each large page. */ first_pp = pp; - while (!page_trylock_cons(pp, SE_EXCL)) { + while (!page_trylock_cons(pp, SE_EXCL) || + IS_DUMP_PAGE(pp)) { if (szc == 0) { pp = pp->p_next; } else { |
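To make the dumpcsize_t encoding from the dumphdr.h hunk above concrete: each 32-bit word packs a 12-bit stream tag and a 20-bit compressed size. The macros below are copied from that hunk; the main() harness is a hypothetical user-space addition for illustration, not part of the patch:

#include <stdio.h>
#include <stdint.h>

typedef uint32_t dumpcsize_t;

#define	DUMP_MAX_TAG	(0xfffU)
#define	DUMP_MAX_CSIZE	(0xfffffU)
#define	DUMP_SET_TAG(w, v)	(((w) & DUMP_MAX_CSIZE) | ((v) << 20))
#define	DUMP_GET_TAG(w)		(((w) >> 20) & DUMP_MAX_TAG)
#define	DUMP_SET_CSIZE(w, v) \
	(((w) & (DUMP_MAX_TAG << 20)) | ((v) & DUMP_MAX_CSIZE))
#define	DUMP_GET_CSIZE(w)	((w) & DUMP_MAX_CSIZE)

int
main(void)
{
	dumpcsize_t w = 0;

	/* Helper stream 7 produced a 5000-byte compressed block. */
	w = DUMP_SET_TAG(w, 7);
	w = DUMP_SET_CSIZE(w, 5000);
	(void) printf("tag=%u csize=%u\n",
	    DUMP_GET_TAG(w), DUMP_GET_CSIZE(w));

	/* A word of all zeroes (tag 0, csize 0) marks end of data. */
	return (0);
}

Tags 1-4095 identify which helper's stream a block belongs to, tag 0 with a nonzero csize carries a single serially-compressed lzjb page, and the all-zero word terminates the data. That is what lets savecore demultiplex the parallel compression streams from one sequential read of the dump device.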

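One idiom from dumpsys_metrics() is worth isolating: the P() macro (its definition is split across two lines in the rendering above) appends formatted text to a fixed-size buffer and silently stops at the end rather than overflowing. A stand-alone sketch, with a made-up buffer size and made-up metric values:

#include <stdio.h>

int
main(void)
{
	char buf[64];
	char *p = buf;
	char *e = buf + sizeof (buf);

	/* Append only while p is still inside the buffer. */
#define	P(...) (p += p < e ? snprintf(p, e - p, __VA_ARGS__) : 0)
	P("Dump pages,%llu\n", 12345ULL);
	P("Dump time,%d\n", 42);
#undef	P

	(void) fputs(buf, stdout);
	return (0);
}

Because snprintf() returns the length it would have written, p can land past e after a truncated append; the p < e guard then turns every later P() into a no-op, which matches the final "if (p < e) bzero(p, e - p)" step in dumpsys_metrics().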