diff options
Diffstat (limited to 'src/cmd/prof/main.c')
-rw-r--r-- | src/cmd/prof/main.c | 895 |
1 files changed, 895 insertions, 0 deletions
diff --git a/src/cmd/prof/main.c b/src/cmd/prof/main.c new file mode 100644 index 000000000..f36759cd3 --- /dev/null +++ b/src/cmd/prof/main.c @@ -0,0 +1,895 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <u.h> +#include <time.h> +#include <libc.h> +#include <bio.h> +#include <ctype.h> + +#define Ureg Ureg_amd64 + #include <ureg_amd64.h> +#undef Ureg +#define Ureg Ureg_x86 + #include <ureg_x86.h> +#undef Ureg +#include <mach.h> + +char* file = "6.out"; +static Fhdr fhdr; +int have_syms; +int fd; +struct Ureg_amd64 ureg_amd64; +struct Ureg_x86 ureg_x86; +int total_sec = 0; +int delta_msec = 100; +int nsample; +int nsamplethread; + +// pprof data, stored as sequences of N followed by N PC values. +// See http://code.google.com/p/google-perftools . +uvlong *ppdata; // traces +Biobuf* pproffd; // file descriptor to write trace info +long ppstart; // start position of current trace +long nppdata; // length of data +long ppalloc; // size of allocated data +char ppmapdata[10*1024]; // the map information for the output file + +// output formats +int pprof; // print pprof output to named file +int functions; // print functions +int histograms; // print histograms +int linenums; // print file and line numbers rather than function names +int registers; // print registers +int stacks; // print stack traces + +int pid; // main process pid + +int nthread; // number of threads +int thread[32]; // thread pids +Map *map[32]; // thread maps + +void +Usage(void) +{ + fprint(2, "Usage: prof -p pid [-t total_secs] [-d delta_msec]\n"); + fprint(2, " prof [-t total_secs] [-d delta_msec] 6.out args ...\n"); + fprint(2, "\tformats (default -h):\n"); + fprint(2, "\t\t-P file.prof: write [c]pprof output to file.prof\n"); + fprint(2, "\t\t-h: histograms\n"); + fprint(2, "\t\t-f: dynamic functions\n"); + fprint(2, "\t\t-l: dynamic file and line numbers\n"); + fprint(2, "\t\t-r: dynamic registers\n"); + fprint(2, "\t\t-s: dynamic function stack traces\n"); + fprint(2, "\t\t-hs: include stack info in histograms\n"); + exit(2); +} + +typedef struct PC PC; +struct PC { + uvlong pc; + uvlong callerpc; + unsigned int count; + PC* next; +}; + +enum { + Ncounters = 256 +}; + +PC *counters[Ncounters]; + +// Set up by setarch() to make most of the code architecture-independent. +typedef struct Arch Arch; +struct Arch { + char* name; + void (*regprint)(void); + int (*getregs)(Map*); + int (*getPC)(Map*); + int (*getSP)(Map*); + uvlong (*uregPC)(void); + uvlong (*uregSP)(void); + void (*ppword)(uvlong w); +}; + +void +amd64_regprint(void) +{ + fprint(2, "ax\t0x%llux\n", ureg_amd64.ax); + fprint(2, "bx\t0x%llux\n", ureg_amd64.bx); + fprint(2, "cx\t0x%llux\n", ureg_amd64.cx); + fprint(2, "dx\t0x%llux\n", ureg_amd64.dx); + fprint(2, "si\t0x%llux\n", ureg_amd64.si); + fprint(2, "di\t0x%llux\n", ureg_amd64.di); + fprint(2, "bp\t0x%llux\n", ureg_amd64.bp); + fprint(2, "r8\t0x%llux\n", ureg_amd64.r8); + fprint(2, "r9\t0x%llux\n", ureg_amd64.r9); + fprint(2, "r10\t0x%llux\n", ureg_amd64.r10); + fprint(2, "r11\t0x%llux\n", ureg_amd64.r11); + fprint(2, "r12\t0x%llux\n", ureg_amd64.r12); + fprint(2, "r13\t0x%llux\n", ureg_amd64.r13); + fprint(2, "r14\t0x%llux\n", ureg_amd64.r14); + fprint(2, "r15\t0x%llux\n", ureg_amd64.r15); + fprint(2, "ds\t0x%llux\n", ureg_amd64.ds); + fprint(2, "es\t0x%llux\n", ureg_amd64.es); + fprint(2, "fs\t0x%llux\n", ureg_amd64.fs); + fprint(2, "gs\t0x%llux\n", ureg_amd64.gs); + fprint(2, "type\t0x%llux\n", ureg_amd64.type); + fprint(2, "error\t0x%llux\n", ureg_amd64.error); + fprint(2, "pc\t0x%llux\n", ureg_amd64.ip); + fprint(2, "cs\t0x%llux\n", ureg_amd64.cs); + fprint(2, "flags\t0x%llux\n", ureg_amd64.flags); + fprint(2, "sp\t0x%llux\n", ureg_amd64.sp); + fprint(2, "ss\t0x%llux\n", ureg_amd64.ss); +} + +int +amd64_getregs(Map *map) +{ + int i; + union { + uvlong regs[1]; + struct Ureg_amd64 ureg; + } u; + + for(i = 0; i < sizeof ureg_amd64; i+=8) { + if(get8(map, (uvlong)i, &u.regs[i/8]) < 0) + return -1; + } + ureg_amd64 = u.ureg; + return 0; +} + +int +amd64_getPC(Map *map) +{ + uvlong x; + int r; + + r = get8(map, offsetof(struct Ureg_amd64, ip), &x); + ureg_amd64.ip = x; + return r; +} + +int +amd64_getSP(Map *map) +{ + uvlong x; + int r; + + r = get8(map, offsetof(struct Ureg_amd64, sp), &x); + ureg_amd64.sp = x; + return r; +} + +uvlong +amd64_uregPC(void) +{ + return ureg_amd64.ip; +} + +uvlong +amd64_uregSP(void) { + return ureg_amd64.sp; +} + +void +amd64_ppword(uvlong w) +{ + uchar buf[8]; + + buf[0] = w; + buf[1] = w >> 8; + buf[2] = w >> 16; + buf[3] = w >> 24; + buf[4] = w >> 32; + buf[5] = w >> 40; + buf[6] = w >> 48; + buf[7] = w >> 56; + Bwrite(pproffd, buf, 8); +} + +void +x86_regprint(void) +{ + fprint(2, "ax\t0x%ux\n", ureg_x86.ax); + fprint(2, "bx\t0x%ux\n", ureg_x86.bx); + fprint(2, "cx\t0x%ux\n", ureg_x86.cx); + fprint(2, "dx\t0x%ux\n", ureg_x86.dx); + fprint(2, "si\t0x%ux\n", ureg_x86.si); + fprint(2, "di\t0x%ux\n", ureg_x86.di); + fprint(2, "bp\t0x%ux\n", ureg_x86.bp); + fprint(2, "ds\t0x%ux\n", ureg_x86.ds); + fprint(2, "es\t0x%ux\n", ureg_x86.es); + fprint(2, "fs\t0x%ux\n", ureg_x86.fs); + fprint(2, "gs\t0x%ux\n", ureg_x86.gs); + fprint(2, "cs\t0x%ux\n", ureg_x86.cs); + fprint(2, "flags\t0x%ux\n", ureg_x86.flags); + fprint(2, "pc\t0x%ux\n", ureg_x86.pc); + fprint(2, "sp\t0x%ux\n", ureg_x86.sp); + fprint(2, "ss\t0x%ux\n", ureg_x86.ss); +} + +int +x86_getregs(Map *map) +{ + int i; + + for(i = 0; i < sizeof ureg_x86; i+=4) { + if(get4(map, (uvlong)i, &((uint32*)&ureg_x86)[i/4]) < 0) + return -1; + } + return 0; +} + +int +x86_getPC(Map* map) +{ + return get4(map, offsetof(struct Ureg_x86, pc), &ureg_x86.pc); +} + +int +x86_getSP(Map* map) +{ + return get4(map, offsetof(struct Ureg_x86, sp), &ureg_x86.sp); +} + +uvlong +x86_uregPC(void) +{ + return (uvlong)ureg_x86.pc; +} + +uvlong +x86_uregSP(void) +{ + return (uvlong)ureg_x86.sp; +} + +void +x86_ppword(uvlong w) +{ + uchar buf[4]; + + buf[0] = w; + buf[1] = w >> 8; + buf[2] = w >> 16; + buf[3] = w >> 24; + Bwrite(pproffd, buf, 4); +} + +Arch archtab[] = { + { + "amd64", + amd64_regprint, + amd64_getregs, + amd64_getPC, + amd64_getSP, + amd64_uregPC, + amd64_uregSP, + amd64_ppword, + }, + { + "386", + x86_regprint, + x86_getregs, + x86_getPC, + x86_getSP, + x86_uregPC, + x86_uregSP, + x86_ppword, + }, + { + nil + } +}; + +Arch *arch; + +int +setarch(void) +{ + int i; + + if(mach != nil) { + for(i = 0; archtab[i].name != nil; i++) { + if (strcmp(mach->name, archtab[i].name) == 0) { + arch = &archtab[i]; + return 0; + } + } + } + return -1; +} + +int +getthreads(void) +{ + int i, j, curn, found; + Map *curmap[nelem(map)]; + int curthread[nelem(map)]; + static int complained = 0; + + curn = procthreadpids(pid, curthread, nelem(curthread)); + if(curn <= 0) + return curn; + + if(curn > nelem(map)) { + if(complained == 0) { + fprint(2, "prof: too many threads; limiting to %d\n", nthread, nelem(map)); + complained = 1; + } + curn = nelem(map); + } + if(curn == nthread && memcmp(thread, curthread, curn*sizeof(*thread)) == 0) + return curn; // no changes + + // Number of threads has changed (might be the init case). + // A bit expensive but rare enough not to bother being clever. + for(i = 0; i < curn; i++) { + found = 0; + for(j = 0; j < nthread; j++) { + if(curthread[i] == thread[j]) { + found = 1; + curmap[i] = map[j]; + map[j] = nil; + break; + } + } + if(found) + continue; + + // map new thread + curmap[i] = attachproc(curthread[i], &fhdr); + if(curmap[i] == nil) { + fprint(2, "prof: can't attach to %d: %r\n", curthread[i]); + return -1; + } + } + + for(j = 0; j < nthread; j++) + if(map[j] != nil) + detachproc(map[j]); + + nthread = curn; + memmove(thread, curthread, nthread*sizeof thread[0]); + memmove(map, curmap, sizeof map); + return nthread; +} + +int +sample(Map *map) +{ + static int n; + + n++; + if(registers) { + if(arch->getregs(map) < 0) + goto bad; + } else { + // we need only two registers + if(arch->getPC(map) < 0) + goto bad; + if(arch->getSP(map) < 0) + goto bad; + } + return 1; +bad: + if(n == 1) + fprint(2, "prof: can't read registers: %r\n"); + return 0; +} + +void +addtohistogram(uvlong pc, uvlong callerpc, uvlong sp) +{ + int h; + PC *x; + + h = (pc + callerpc*101) % Ncounters; + for(x = counters[h]; x != NULL; x = x->next) { + if(x->pc == pc && x->callerpc == callerpc) { + x->count++; + return; + } + } + x = malloc(sizeof(PC)); + x->pc = pc; + x->callerpc = callerpc; + x->count = 1; + x->next = counters[h]; + counters[h] = x; +} + +void +addppword(uvlong pc) +{ + if(pc == 0) { + return; + } + if(nppdata == ppalloc) { + ppalloc = (1000+nppdata)*2; + ppdata = realloc(ppdata, ppalloc * sizeof ppdata[0]); + if(ppdata == nil) { + fprint(2, "prof: realloc failed: %r\n"); + exit(2); + } + } + ppdata[nppdata++] = pc; +} + +void +startpptrace() +{ + ppstart = nppdata; + addppword(~0); +} + +void +endpptrace() +{ + ppdata[ppstart] = nppdata-ppstart-1; +} + +uvlong nextpc; + +void +xptrace(Map *map, uvlong pc, uvlong sp, Symbol *sym) +{ + char buf[1024]; + if(sym == nil){ + fprint(2, "syms\n"); + return; + } + if(histograms) + addtohistogram(nextpc, pc, sp); + if(!histograms || stacks > 1 || pprof) { + if(nextpc == 0) + nextpc = sym->value; + if(stacks){ + fprint(2, "%s(", sym->name); + fprint(2, ")"); + if(nextpc != sym->value) + fprint(2, "+%#llux ", nextpc - sym->value); + if(have_syms && linenums && fileline(buf, sizeof buf, pc)) { + fprint(2, " %s", buf); + } + fprint(2, "\n"); + } + if (pprof) { + addppword(nextpc); + } + } + nextpc = pc; +} + +void +stacktracepcsp(Map *map, uvlong pc, uvlong sp) +{ + nextpc = pc; + if(pprof){ + startpptrace(); + } + if(machdata->ctrace==nil) + fprint(2, "no machdata->ctrace\n"); + else if(machdata->ctrace(map, pc, sp, 0, xptrace) <= 0) + fprint(2, "no stack frame: pc=%#p sp=%#p\n", pc, sp); + else { + addtohistogram(nextpc, 0, sp); + if(stacks) + fprint(2, "\n"); + } + if(pprof){ + endpptrace(); + } +} + +void +printpc(Map *map, uvlong pc, uvlong sp) +{ + char buf[1024]; + if(registers) + arch->regprint(); + if(have_syms > 0 && linenums && fileline(buf, sizeof buf, pc)) + fprint(2, "%s\n", buf); + if(have_syms > 0 && functions) { + symoff(buf, sizeof(buf), pc, CANY); + fprint(2, "%s\n", buf); + } + if(stacks || pprof){ + stacktracepcsp(map, pc, sp); + } + else if(histograms){ + addtohistogram(pc, 0, sp); + } +} + +void +ppmaps(void) +{ + int fd, n; + char tmp[100]; + Seg *seg; + + // If it's Linux, the info is in /proc/$pid/maps + snprint(tmp, sizeof tmp, "/proc/%d/maps", pid); + fd = open(tmp, 0); + if(fd >= 0) { + n = read(fd, ppmapdata, sizeof ppmapdata - 1); + close(fd); + if(n < 0) { + fprint(2, "prof: can't read %s: %r\n", tmp); + exit(2); + } + ppmapdata[n] = 0; + return; + } + + // It's probably a mac. Synthesize an entry for the text file. + // The register segment may come first but it has a zero offset, so grab the first non-zero offset segment. + for(n = 0; n < 3; n++){ + seg = &map[0]->seg[n]; + if(seg->b == 0) { + continue; + } + snprint(ppmapdata, sizeof ppmapdata, + "%.16x-%.16x r-xp %d 00:00 34968549 %s\n", + seg->b, seg->e, seg->f, "/home/r/6.out" + ); + return; + } + fprint(2, "prof: no text segment in maps for %s\n", file); + exit(2); +} + +void +samples(void) +{ + int i, pid, msec; + struct timespec req; + int getmaps; + + req.tv_sec = delta_msec/1000; + req.tv_nsec = 1000000*(delta_msec % 1000); + getmaps = 0; + if(pprof) + getmaps= 1; + for(msec = 0; total_sec <= 0 || msec < 1000*total_sec; msec += delta_msec) { + nsample++; + nsamplethread += nthread; + for(i = 0; i < nthread; i++) { + pid = thread[i]; + if(ctlproc(pid, "stop") < 0) + return; + if(!sample(map[i])) { + ctlproc(pid, "start"); + return; + } + printpc(map[i], arch->uregPC(), arch->uregSP()); + ctlproc(pid, "start"); + } + nanosleep(&req, NULL); + getthreads(); + if(nthread == 0) + break; + if(getmaps) { + getmaps = 0; + ppmaps(); + } + } +} + +typedef struct Func Func; +struct Func +{ + Func *next; + Symbol s; + uint onstack; + uint leaf; +}; + +Func *func[257]; +int nfunc; + +Func* +findfunc(uvlong pc) +{ + Func *f; + uint h; + Symbol s; + + if(pc == 0) + return nil; + + if(!findsym(pc, CTEXT, &s)) + return nil; + + h = s.value % nelem(func); + for(f = func[h]; f != NULL; f = f->next) + if(f->s.value == s.value) + return f; + + f = malloc(sizeof *f); + memset(f, 0, sizeof *f); + f->s = s; + f->next = func[h]; + func[h] = f; + nfunc++; + return f; +} + +int +compareleaf(const void *va, const void *vb) +{ + Func *a, *b; + + a = *(Func**)va; + b = *(Func**)vb; + if(a->leaf != b->leaf) + return b->leaf - a->leaf; + if(a->onstack != b->onstack) + return b->onstack - a->onstack; + return strcmp(a->s.name, b->s.name); +} + +void +dumphistogram() +{ + int i, h, n; + PC *x; + Func *f, **ff; + + if(!histograms) + return; + + // assign counts to functions. + for(h = 0; h < Ncounters; h++) { + for(x = counters[h]; x != NULL; x = x->next) { + f = findfunc(x->pc); + if(f) { + f->onstack += x->count; + f->leaf += x->count; + } + f = findfunc(x->callerpc); + if(f) + f->leaf -= x->count; + } + } + + // build array + ff = malloc(nfunc*sizeof ff[0]); + n = 0; + for(h = 0; h < nelem(func); h++) + for(f = func[h]; f != NULL; f = f->next) + ff[n++] = f; + + // sort by leaf counts + qsort(ff, nfunc, sizeof ff[0], compareleaf); + + // print. + fprint(2, "%d samples (avg %.1g threads)\n", nsample, (double)nsamplethread/nsample); + for(i = 0; i < nfunc; i++) { + f = ff[i]; + fprint(2, "%6.2f%%\t", 100.0*(double)f->leaf/nsample); + if(stacks) + fprint(2, "%6.2f%%\t", 100.0*(double)f->onstack/nsample); + fprint(2, "%s\n", f->s.name); + } +} + +typedef struct Trace Trace; +struct Trace { + int count; + int npc; + uvlong *pc; + Trace *next; +}; + +void +dumppprof() +{ + uvlong i, n, *p, *e; + int ntrace; + Trace *trace, *tp, *up, *prev; + + if(!pprof) + return; + e = ppdata + nppdata; + // Create list of traces. First, count the traces + ntrace = 0; + for(p = ppdata; p < e;) { + n = *p++; + p += n; + if(n == 0) + continue; + ntrace++; + } + if(ntrace <= 0) + return; + // Allocate and link the traces together. + trace = malloc(ntrace * sizeof(Trace)); + tp = trace; + for(p = ppdata; p < e;) { + n = *p++; + if(n == 0) + continue; + tp->count = 1; + tp->npc = n; + tp->pc = p; + tp->next = tp+1; + tp++; + p += n; + } + trace[ntrace-1].next = nil; + // Eliminate duplicates. Lousy algorithm, although not as bad as it looks because + // the list collapses fast. + for(tp = trace; tp != nil; tp = tp->next) { + prev = tp; + for(up = tp->next; up != nil; up = up->next) { + if(up->npc == tp->npc && memcmp(up->pc, tp->pc, up->npc*sizeof up->pc[0]) == 0) { + tp->count++; + prev->next = up->next; + } else { + prev = up; + } + } + } + // Write file. + // See http://code.google.com/p/google-perftools/source/browse/trunk/doc/cpuprofile-fileformat.html + // 1) Header + arch->ppword(0); // must be zero + arch->ppword(3); // 3 words follow in header + arch->ppword(0); // must be zero + arch->ppword(delta_msec * 1000); // sampling period in microseconds + arch->ppword(0); // must be zero (padding) + // 2) One record for each trace. + for(tp = trace; tp != nil; tp = tp->next) { + arch->ppword(tp->count); + arch->ppword(tp->npc); + for(i = 0; i < tp->npc; i++) { + arch->ppword(tp->pc[i]); + } + } + // 3) Binary trailer + arch->ppword(0); // must be zero + arch->ppword(1); // must be one + arch->ppword(0); // must be zero + // 4) Mapped objects. + Bwrite(pproffd, ppmapdata, strlen(ppmapdata)); + // 5) That's it. + Bterm(pproffd); +} + +int +startprocess(char **argv) +{ + int pid; + + if((pid = fork()) == 0) { + pid = getpid(); + if(ctlproc(pid, "hang") < 0){ + fprint(2, "prof: child process could not hang\n"); + exits(0); + } + execv(argv[0], argv); + fprint(2, "prof: could not exec %s: %r\n", argv[0]); + exits(0); + } + + if(pid == -1) { + fprint(2, "prof: could not fork\n"); + exit(1); + } + if(ctlproc(pid, "attached") < 0 || ctlproc(pid, "waitstop") < 0) { + fprint(2, "prof: could not attach to child process: %r\n"); + exit(1); + } + return pid; +} + +void +detach(void) +{ + int i; + + for(i = 0; i < nthread; i++) + detachproc(map[i]); +} + +int +main(int argc, char *argv[]) +{ + int i; + char *ppfile; + + ARGBEGIN{ + case 'P': + pprof =1; + ppfile = EARGF(Usage()); + pproffd = Bopen(ppfile, OWRITE); + if(pproffd == nil) { + fprint(2, "prof: cannot open %s: %r\n", ppfile); + exit(2); + } + break; + case 'd': + delta_msec = atoi(EARGF(Usage())); + break; + case 't': + total_sec = atoi(EARGF(Usage())); + break; + case 'p': + pid = atoi(EARGF(Usage())); + break; + case 'f': + functions = 1; + break; + case 'h': + histograms = 1; + break; + case 'l': + linenums = 1; + break; + case 'r': + registers = 1; + break; + case 's': + stacks++; + break; + default: + Usage(); + }ARGEND + if(pid <= 0 && argc == 0) + Usage(); + if(functions+linenums+registers+stacks+pprof == 0) + histograms = 1; + if(!machbyname("amd64")) { + fprint(2, "prof: no amd64 support\n", pid); + exit(1); + } + if(argc > 0) + file = argv[0]; + else if(pid) { + file = proctextfile(pid); + if (file == NULL) { + fprint(2, "prof: can't find file for pid %d: %r\n", pid); + fprint(2, "prof: on Darwin, need to provide file name explicitly\n"); + exit(1); + } + } + fd = open(file, 0); + if(fd < 0) { + fprint(2, "prof: can't open %s: %r\n", file); + exit(1); + } + if(crackhdr(fd, &fhdr)) { + have_syms = syminit(fd, &fhdr); + if(!have_syms) { + fprint(2, "prof: no symbols for %s: %r\n", file); + } + } else { + fprint(2, "prof: crack header for %s: %r\n", file); + exit(1); + } + if(pid <= 0) + pid = startprocess(argv); + attachproc(pid, &fhdr); // initializes thread list + if(setarch() < 0) { + detach(); + fprint(2, "prof: can't identify binary architecture for pid %d\n", pid); + exit(1); + } + if(getthreads() <= 0) { + detach(); + fprint(2, "prof: can't find threads for pid %d\n", pid); + exit(1); + } + for(i = 0; i < nthread; i++) + ctlproc(thread[i], "start"); + samples(); + detach(); + dumphistogram(); + dumppprof(); + exit(0); +} |