// Copyright 2009 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. #include #include #include #include #include #define Ureg Ureg_amd64 #include #undef Ureg #define Ureg Ureg_x86 #include #undef Ureg #include char* file = "6.out"; static Fhdr fhdr; int have_syms; int fd; struct Ureg_amd64 ureg_amd64; struct Ureg_x86 ureg_x86; int total_sec = 0; int delta_msec = 100; int nsample; int nsamplethread; // pprof data, stored as sequences of N followed by N PC values. // See http://code.google.com/p/google-perftools . uvlong *ppdata; // traces Biobuf* pproffd; // file descriptor to write trace info long ppstart; // start position of current trace long nppdata; // length of data long ppalloc; // size of allocated data char ppmapdata[10*1024]; // the map information for the output file // output formats int pprof; // print pprof output to named file int functions; // print functions int histograms; // print histograms int linenums; // print file and line numbers rather than function names int registers; // print registers int stacks; // print stack traces int pid; // main process pid int nthread; // number of threads int thread[32]; // thread pids Map *map[32]; // thread maps void Usage(void) { fprint(2, "Usage: prof -p pid [-t total_secs] [-d delta_msec]\n"); fprint(2, " prof [-t total_secs] [-d delta_msec] 6.out args ...\n"); fprint(2, "\tformats (default -h):\n"); fprint(2, "\t\t-P file.prof: write [c]pprof output to file.prof\n"); fprint(2, "\t\t-h: histograms\n"); fprint(2, "\t\t-f: dynamic functions\n"); fprint(2, "\t\t-l: dynamic file and line numbers\n"); fprint(2, "\t\t-r: dynamic registers\n"); fprint(2, "\t\t-s: dynamic function stack traces\n"); fprint(2, "\t\t-hs: include stack info in histograms\n"); exit(2); } typedef struct PC PC; struct PC { uvlong pc; uvlong callerpc; unsigned int count; PC* next; }; enum { Ncounters = 256 }; PC *counters[Ncounters]; // Set up by setarch() to make most of the code architecture-independent. typedef struct Arch Arch; struct Arch { char* name; void (*regprint)(void); int (*getregs)(Map*); int (*getPC)(Map*); int (*getSP)(Map*); uvlong (*uregPC)(void); uvlong (*uregSP)(void); void (*ppword)(uvlong w); }; void amd64_regprint(void) { fprint(2, "ax\t0x%llux\n", ureg_amd64.ax); fprint(2, "bx\t0x%llux\n", ureg_amd64.bx); fprint(2, "cx\t0x%llux\n", ureg_amd64.cx); fprint(2, "dx\t0x%llux\n", ureg_amd64.dx); fprint(2, "si\t0x%llux\n", ureg_amd64.si); fprint(2, "di\t0x%llux\n", ureg_amd64.di); fprint(2, "bp\t0x%llux\n", ureg_amd64.bp); fprint(2, "r8\t0x%llux\n", ureg_amd64.r8); fprint(2, "r9\t0x%llux\n", ureg_amd64.r9); fprint(2, "r10\t0x%llux\n", ureg_amd64.r10); fprint(2, "r11\t0x%llux\n", ureg_amd64.r11); fprint(2, "r12\t0x%llux\n", ureg_amd64.r12); fprint(2, "r13\t0x%llux\n", ureg_amd64.r13); fprint(2, "r14\t0x%llux\n", ureg_amd64.r14); fprint(2, "r15\t0x%llux\n", ureg_amd64.r15); fprint(2, "ds\t0x%llux\n", ureg_amd64.ds); fprint(2, "es\t0x%llux\n", ureg_amd64.es); fprint(2, "fs\t0x%llux\n", ureg_amd64.fs); fprint(2, "gs\t0x%llux\n", ureg_amd64.gs); fprint(2, "type\t0x%llux\n", ureg_amd64.type); fprint(2, "error\t0x%llux\n", ureg_amd64.error); fprint(2, "pc\t0x%llux\n", ureg_amd64.ip); fprint(2, "cs\t0x%llux\n", ureg_amd64.cs); fprint(2, "flags\t0x%llux\n", ureg_amd64.flags); fprint(2, "sp\t0x%llux\n", ureg_amd64.sp); fprint(2, "ss\t0x%llux\n", ureg_amd64.ss); } int amd64_getregs(Map *map) { int i; union { uvlong regs[1]; struct Ureg_amd64 ureg; } u; for(i = 0; i < sizeof ureg_amd64; i+=8) { if(get8(map, (uvlong)i, &u.regs[i/8]) < 0) return -1; } ureg_amd64 = u.ureg; return 0; } int amd64_getPC(Map *map) { uvlong x; int r; r = get8(map, offsetof(struct Ureg_amd64, ip), &x); ureg_amd64.ip = x; return r; } int amd64_getSP(Map *map) { uvlong x; int r; r = get8(map, offsetof(struct Ureg_amd64, sp), &x); ureg_amd64.sp = x; return r; } uvlong amd64_uregPC(void) { return ureg_amd64.ip; } uvlong amd64_uregSP(void) { return ureg_amd64.sp; } void amd64_ppword(uvlong w) { uchar buf[8]; buf[0] = w; buf[1] = w >> 8; buf[2] = w >> 16; buf[3] = w >> 24; buf[4] = w >> 32; buf[5] = w >> 40; buf[6] = w >> 48; buf[7] = w >> 56; Bwrite(pproffd, buf, 8); } void x86_regprint(void) { fprint(2, "ax\t0x%ux\n", ureg_x86.ax); fprint(2, "bx\t0x%ux\n", ureg_x86.bx); fprint(2, "cx\t0x%ux\n", ureg_x86.cx); fprint(2, "dx\t0x%ux\n", ureg_x86.dx); fprint(2, "si\t0x%ux\n", ureg_x86.si); fprint(2, "di\t0x%ux\n", ureg_x86.di); fprint(2, "bp\t0x%ux\n", ureg_x86.bp); fprint(2, "ds\t0x%ux\n", ureg_x86.ds); fprint(2, "es\t0x%ux\n", ureg_x86.es); fprint(2, "fs\t0x%ux\n", ureg_x86.fs); fprint(2, "gs\t0x%ux\n", ureg_x86.gs); fprint(2, "cs\t0x%ux\n", ureg_x86.cs); fprint(2, "flags\t0x%ux\n", ureg_x86.flags); fprint(2, "pc\t0x%ux\n", ureg_x86.pc); fprint(2, "sp\t0x%ux\n", ureg_x86.sp); fprint(2, "ss\t0x%ux\n", ureg_x86.ss); } int x86_getregs(Map *map) { int i; for(i = 0; i < sizeof ureg_x86; i+=4) { if(get4(map, (uvlong)i, &((uint32*)&ureg_x86)[i/4]) < 0) return -1; } return 0; } int x86_getPC(Map* map) { return get4(map, offsetof(struct Ureg_x86, pc), &ureg_x86.pc); } int x86_getSP(Map* map) { return get4(map, offsetof(struct Ureg_x86, sp), &ureg_x86.sp); } uvlong x86_uregPC(void) { return (uvlong)ureg_x86.pc; } uvlong x86_uregSP(void) { return (uvlong)ureg_x86.sp; } void x86_ppword(uvlong w) { uchar buf[4]; buf[0] = w; buf[1] = w >> 8; buf[2] = w >> 16; buf[3] = w >> 24; Bwrite(pproffd, buf, 4); } Arch archtab[] = { { "amd64", amd64_regprint, amd64_getregs, amd64_getPC, amd64_getSP, amd64_uregPC, amd64_uregSP, amd64_ppword, }, { "386", x86_regprint, x86_getregs, x86_getPC, x86_getSP, x86_uregPC, x86_uregSP, x86_ppword, }, { nil } }; Arch *arch; int setarch(void) { int i; if(mach != nil) { for(i = 0; archtab[i].name != nil; i++) { if (strcmp(mach->name, archtab[i].name) == 0) { arch = &archtab[i]; return 0; } } } return -1; } int getthreads(void) { int i, j, curn, found; Map *curmap[nelem(map)]; int curthread[nelem(map)]; static int complained = 0; curn = procthreadpids(pid, curthread, nelem(curthread)); if(curn <= 0) return curn; if(curn > nelem(map)) { if(complained == 0) { fprint(2, "prof: too many threads; limiting to %d\n", nthread, nelem(map)); complained = 1; } curn = nelem(map); } if(curn == nthread && memcmp(thread, curthread, curn*sizeof(*thread)) == 0) return curn; // no changes // Number of threads has changed (might be the init case). // A bit expensive but rare enough not to bother being clever. for(i = 0; i < curn; i++) { found = 0; for(j = 0; j < nthread; j++) { if(curthread[i] == thread[j]) { found = 1; curmap[i] = map[j]; map[j] = nil; break; } } if(found) continue; // map new thread curmap[i] = attachproc(curthread[i], &fhdr); if(curmap[i] == nil) { fprint(2, "prof: can't attach to %d: %r\n", curthread[i]); return -1; } } for(j = 0; j < nthread; j++) if(map[j] != nil) detachproc(map[j]); nthread = curn; memmove(thread, curthread, nthread*sizeof thread[0]); memmove(map, curmap, sizeof map); return nthread; } int sample(Map *map) { static int n; n++; if(registers) { if(arch->getregs(map) < 0) goto bad; } else { // we need only two registers if(arch->getPC(map) < 0) goto bad; if(arch->getSP(map) < 0) goto bad; } return 1; bad: if(n == 1) fprint(2, "prof: can't read registers: %r\n"); return 0; } void addtohistogram(uvlong pc, uvlong callerpc, uvlong sp) { int h; PC *x; h = (pc + callerpc*101) % Ncounters; for(x = counters[h]; x != NULL; x = x->next) { if(x->pc == pc && x->callerpc == callerpc) { x->count++; return; } } x = malloc(sizeof(PC)); x->pc = pc; x->callerpc = callerpc; x->count = 1; x->next = counters[h]; counters[h] = x; } void addppword(uvlong pc) { if(pc == 0) { return; } if(nppdata == ppalloc) { ppalloc = (1000+nppdata)*2; ppdata = realloc(ppdata, ppalloc * sizeof ppdata[0]); if(ppdata == nil) { fprint(2, "prof: realloc failed: %r\n"); exit(2); } } ppdata[nppdata++] = pc; } void startpptrace() { ppstart = nppdata; addppword(~0); } void endpptrace() { ppdata[ppstart] = nppdata-ppstart-1; } uvlong nextpc; void xptrace(Map *map, uvlong pc, uvlong sp, Symbol *sym) { char buf[1024]; if(sym == nil){ fprint(2, "syms\n"); return; } if(histograms) addtohistogram(nextpc, pc, sp); if(!histograms || stacks > 1 || pprof) { if(nextpc == 0) nextpc = sym->value; if(stacks){ fprint(2, "%s(", sym->name); fprint(2, ")"); if(nextpc != sym->value) fprint(2, "+%#llux ", nextpc - sym->value); if(have_syms && linenums && fileline(buf, sizeof buf, pc)) { fprint(2, " %s", buf); } fprint(2, "\n"); } if (pprof) { addppword(nextpc); } } nextpc = pc; } void stacktracepcsp(Map *map, uvlong pc, uvlong sp) { nextpc = pc; if(pprof){ startpptrace(); } if(machdata->ctrace==nil) fprint(2, "no machdata->ctrace\n"); else if(machdata->ctrace(map, pc, sp, 0, xptrace) <= 0) fprint(2, "no stack frame: pc=%#p sp=%#p\n", pc, sp); else { addtohistogram(nextpc, 0, sp); if(stacks) fprint(2, "\n"); } if(pprof){ endpptrace(); } } void printpc(Map *map, uvlong pc, uvlong sp) { char buf[1024]; if(registers) arch->regprint(); if(have_syms > 0 && linenums && fileline(buf, sizeof buf, pc)) fprint(2, "%s\n", buf); if(have_syms > 0 && functions) { symoff(buf, sizeof(buf), pc, CANY); fprint(2, "%s\n", buf); } if(stacks || pprof){ stacktracepcsp(map, pc, sp); } else if(histograms){ addtohistogram(pc, 0, sp); } } void ppmaps(void) { int fd, n; char tmp[100]; Seg *seg; // If it's Linux, the info is in /proc/$pid/maps snprint(tmp, sizeof tmp, "/proc/%d/maps", pid); fd = open(tmp, 0); if(fd >= 0) { n = read(fd, ppmapdata, sizeof ppmapdata - 1); close(fd); if(n < 0) { fprint(2, "prof: can't read %s: %r\n", tmp); exit(2); } ppmapdata[n] = 0; return; } // It's probably a mac. Synthesize an entry for the text file. // The register segment may come first but it has a zero offset, so grab the first non-zero offset segment. for(n = 0; n < 3; n++){ seg = &map[0]->seg[n]; if(seg->b == 0) { continue; } snprint(ppmapdata, sizeof ppmapdata, "%.16x-%.16x r-xp %d 00:00 34968549 %s\n", seg->b, seg->e, seg->f, "/home/r/6.out" ); return; } fprint(2, "prof: no text segment in maps for %s\n", file); exit(2); } void samples(void) { int i, pid, msec; struct timespec req; int getmaps; req.tv_sec = delta_msec/1000; req.tv_nsec = 1000000*(delta_msec % 1000); getmaps = 0; if(pprof) getmaps= 1; for(msec = 0; total_sec <= 0 || msec < 1000*total_sec; msec += delta_msec) { nsample++; nsamplethread += nthread; for(i = 0; i < nthread; i++) { pid = thread[i]; if(ctlproc(pid, "stop") < 0) return; if(!sample(map[i])) { ctlproc(pid, "start"); return; } printpc(map[i], arch->uregPC(), arch->uregSP()); ctlproc(pid, "start"); } nanosleep(&req, NULL); getthreads(); if(nthread == 0) break; if(getmaps) { getmaps = 0; ppmaps(); } } } typedef struct Func Func; struct Func { Func *next; Symbol s; uint onstack; uint leaf; }; Func *func[257]; int nfunc; Func* findfunc(uvlong pc) { Func *f; uint h; Symbol s; if(pc == 0) return nil; if(!findsym(pc, CTEXT, &s)) return nil; h = s.value % nelem(func); for(f = func[h]; f != NULL; f = f->next) if(f->s.value == s.value) return f; f = malloc(sizeof *f); memset(f, 0, sizeof *f); f->s = s; f->next = func[h]; func[h] = f; nfunc++; return f; } int compareleaf(const void *va, const void *vb) { Func *a, *b; a = *(Func**)va; b = *(Func**)vb; if(a->leaf != b->leaf) return b->leaf - a->leaf; if(a->onstack != b->onstack) return b->onstack - a->onstack; return strcmp(a->s.name, b->s.name); } void dumphistogram() { int i, h, n; PC *x; Func *f, **ff; if(!histograms) return; // assign counts to functions. for(h = 0; h < Ncounters; h++) { for(x = counters[h]; x != NULL; x = x->next) { f = findfunc(x->pc); if(f) { f->onstack += x->count; f->leaf += x->count; } f = findfunc(x->callerpc); if(f) f->leaf -= x->count; } } // build array ff = malloc(nfunc*sizeof ff[0]); n = 0; for(h = 0; h < nelem(func); h++) for(f = func[h]; f != NULL; f = f->next) ff[n++] = f; // sort by leaf counts qsort(ff, nfunc, sizeof ff[0], compareleaf); // print. fprint(2, "%d samples (avg %.1g threads)\n", nsample, (double)nsamplethread/nsample); for(i = 0; i < nfunc; i++) { f = ff[i]; fprint(2, "%6.2f%%\t", 100.0*(double)f->leaf/nsample); if(stacks) fprint(2, "%6.2f%%\t", 100.0*(double)f->onstack/nsample); fprint(2, "%s\n", f->s.name); } } typedef struct Trace Trace; struct Trace { int count; int npc; uvlong *pc; Trace *next; }; void dumppprof() { uvlong i, n, *p, *e; int ntrace; Trace *trace, *tp, *up, *prev; if(!pprof) return; e = ppdata + nppdata; // Create list of traces. First, count the traces ntrace = 0; for(p = ppdata; p < e;) { n = *p++; p += n; if(n == 0) continue; ntrace++; } if(ntrace <= 0) return; // Allocate and link the traces together. trace = malloc(ntrace * sizeof(Trace)); tp = trace; for(p = ppdata; p < e;) { n = *p++; if(n == 0) continue; tp->count = 1; tp->npc = n; tp->pc = p; tp->next = tp+1; tp++; p += n; } trace[ntrace-1].next = nil; // Eliminate duplicates. Lousy algorithm, although not as bad as it looks because // the list collapses fast. for(tp = trace; tp != nil; tp = tp->next) { prev = tp; for(up = tp->next; up != nil; up = up->next) { if(up->npc == tp->npc && memcmp(up->pc, tp->pc, up->npc*sizeof up->pc[0]) == 0) { tp->count++; prev->next = up->next; } else { prev = up; } } } // Write file. // See http://code.google.com/p/google-perftools/source/browse/trunk/doc/cpuprofile-fileformat.html // 1) Header arch->ppword(0); // must be zero arch->ppword(3); // 3 words follow in header arch->ppword(0); // must be zero arch->ppword(delta_msec * 1000); // sampling period in microseconds arch->ppword(0); // must be zero (padding) // 2) One record for each trace. for(tp = trace; tp != nil; tp = tp->next) { arch->ppword(tp->count); arch->ppword(tp->npc); for(i = 0; i < tp->npc; i++) { arch->ppword(tp->pc[i]); } } // 3) Binary trailer arch->ppword(0); // must be zero arch->ppword(1); // must be one arch->ppword(0); // must be zero // 4) Mapped objects. Bwrite(pproffd, ppmapdata, strlen(ppmapdata)); // 5) That's it. Bterm(pproffd); } int startprocess(char **argv) { int pid; if((pid = fork()) == 0) { pid = getpid(); if(ctlproc(pid, "hang") < 0){ fprint(2, "prof: child process could not hang\n"); exits(0); } execv(argv[0], argv); fprint(2, "prof: could not exec %s: %r\n", argv[0]); exits(0); } if(pid == -1) { fprint(2, "prof: could not fork\n"); exit(1); } if(ctlproc(pid, "attached") < 0 || ctlproc(pid, "waitstop") < 0) { fprint(2, "prof: could not attach to child process: %r\n"); exit(1); } return pid; } void detach(void) { int i; for(i = 0; i < nthread; i++) detachproc(map[i]); } int main(int argc, char *argv[]) { int i; char *ppfile; ARGBEGIN{ case 'P': pprof =1; ppfile = EARGF(Usage()); pproffd = Bopen(ppfile, OWRITE); if(pproffd == nil) { fprint(2, "prof: cannot open %s: %r\n", ppfile); exit(2); } break; case 'd': delta_msec = atoi(EARGF(Usage())); break; case 't': total_sec = atoi(EARGF(Usage())); break; case 'p': pid = atoi(EARGF(Usage())); break; case 'f': functions = 1; break; case 'h': histograms = 1; break; case 'l': linenums = 1; break; case 'r': registers = 1; break; case 's': stacks++; break; default: Usage(); }ARGEND if(pid <= 0 && argc == 0) Usage(); if(functions+linenums+registers+stacks+pprof == 0) histograms = 1; if(!machbyname("amd64")) { fprint(2, "prof: no amd64 support\n", pid); exit(1); } if(argc > 0) file = argv[0]; else if(pid) { file = proctextfile(pid); if (file == NULL) { fprint(2, "prof: can't find file for pid %d: %r\n", pid); fprint(2, "prof: on Darwin, need to provide file name explicitly\n"); exit(1); } } fd = open(file, 0); if(fd < 0) { fprint(2, "prof: can't open %s: %r\n", file); exit(1); } if(crackhdr(fd, &fhdr)) { have_syms = syminit(fd, &fhdr); if(!have_syms) { fprint(2, "prof: no symbols for %s: %r\n", file); } } else { fprint(2, "prof: crack header for %s: %r\n", file); exit(1); } if(pid <= 0) pid = startprocess(argv); attachproc(pid, &fhdr); // initializes thread list if(setarch() < 0) { detach(); fprint(2, "prof: can't identify binary architecture for pid %d\n", pid); exit(1); } if(getthreads() <= 0) { detach(); fprint(2, "prof: can't find threads for pid %d\n", pid); exit(1); } for(i = 0; i < nthread; i++) ctlproc(thread[i], "start"); samples(); detach(); dumphistogram(); dumppprof(); exit(0); }