1 |
dpavlin |
1 |
/* |
2 |
|
|
* Contribution of Mtve. |
3 |
|
|
*/ |
4 |
|
|
|
5 |
|
|
#include <fcntl.h> |
6 |
|
|
#include <unistd.h> |
7 |
|
|
#include <stdio.h> |
8 |
|
|
#include <errno.h> |
9 |
|
|
#include <string.h> |
10 |
|
|
|
11 |
|
|
#define THREADED 1 |
12 |
|
|
|
13 |
|
|
#if THREADED |
14 |
|
|
#include <pthread.h> |
15 |
|
|
#endif |
16 |
|
|
|
17 |
|
|
/* |
18 |
|
|
* Call to profiling routine .mcount is automatically inserted by gcc -p. |
19 |
|
|
* |
20 |
|
|
* However, standard .mcount from (g)libc is not working well for me |
21 |
|
|
* with optimized (-O3 -fomit-frame-pointer) threaded code, |
22 |
|
|
* at least because it doesn't save all registers. |
23 |
|
|
* |
24 |
|
|
* So here is another square wheel. It works only on IA32 (i386). |
25 |
|
|
* |
26 |
|
|
* Theory: |
27 |
|
|
* .mcount is called like this |
28 |
|
|
* |
29 |
|
|
* 08048479 <some_func>: |
30 |
|
|
* 8048479: 55 push %ebp # can be |
31 |
|
|
* 804847a: 89 e5 movl %esp,%ebp # absent |
32 |
|
|
* 804847c: 83 ec 1c subl $28,%esp |
33 |
|
|
* 804847f: 55 pushl %ebp |
34 |
|
|
* 8048480: 57 pushl %edi |
35 |
|
|
* 8048481: 56 pushl %esi |
36 |
|
|
* 8048482: 53 pushl %ebx |
37 |
|
|
* 8048483: e8 94 fe ff ff call .mcount |
38 |
|
|
* 08048488 <some_func_x> |
39 |
|
|
* |
40 |
|
|
* So in the entrance of .mcount we have in stack |
41 |
|
|
* |
42 |
|
|
* %esp -> some_func_x (dword), where mcount should return |
43 |
|
|
* saved registers (4 dwords in example) |
44 |
|
|
* stack frame (28 bytes in example) |
45 |
|
|
* some_func_callee (dword) |
46 |
|
|
* |
47 |
|
|
* We will: |
48 |
|
|
* - check if the code of some_func matches this pattern |
49 |
|
|
* - find some_func address and depth of stack |
50 |
|
|
* - modify stack by replacing some_func_callee address to ours |
51 |
|
|
* - collect statistics |
52 |
|
|
*/ |
53 |
|
|
|
54 |
|
|
/* better to be a prime number */ |
55 |
|
|
#define FUNCSMAX 32749 |
56 |
|
|
|
57 |
|
|
#define CSTACKSIZE 256 |
58 |
|
|
|
59 |
|
|
#if THREADED |
60 |
|
|
/* better to be a prime number */ |
61 |
|
|
#define THREADSMAX 37 |
62 |
|
|
#else |
63 |
|
|
#define THREADSMAX 1 |
64 |
|
|
#endif |
65 |
|
|
|
66 |
|
|
/*
 * Per-function statistics table.
 *
 * Open-addressed hash table keyed by function address; findaddr() probes it
 * linearly and treats addr == 0 as a free entry.  The extra element at index
 * FUNCSMAX is the index findaddr() returns after the table-full core().
 */
static struct {
	int		addr;		/* function start address, 0 = unused */
	int		enters;		/* incremented by stat_enter() */
	int		exits;		/* incremented by stat_exit() */
	int		aways;		/* incremented by stat_away() */
	int		rets;		/* incremented by stat_ret() */
	long long	timetotal;	/* -= curtime() on enter, += on exit */
	long long	timeoutside;	/* -= curtime() on away, += on ret */
} arr[FUNCSMAX + 1];
75 |
|
|
|
76 |
|
|
static struct { |
77 |
|
|
#if THREADED |
78 |
|
|
pthread_t tid; |
79 |
|
|
#endif |
80 |
|
|
int depth; |
81 |
|
|
int ret[CSTACKSIZE]; |
82 |
|
|
int func[CSTACKSIZE]; |
83 |
|
|
} cstack[THREADSMAX]; |
84 |
|
|
|
85 |
|
|
/*
 * Deliberately crash the process (so a core dump can be examined) by storing
 * through a null pointer.  The pointer is volatile-qualified so that the
 * compiler has to emit the store; a store through a plain null pointer is
 * undefined behaviour and may be removed by the optimizer.
 */
#define core() (*(volatile char *)0 = 0)
86 |
|
|
|
87 |
|
|
#if __GNUC__ > 2

/* keeps gcc from instrumenting the profiler's own functions */
#define NOPROF __attribute__ ((no_instrument_function))

/*
 * Forward declaration of all functions, carrying the NOPROF attribute.
 * Full prototypes are used: an empty parameter list "()" means "(void)"
 * in C23 and would then conflict with the definitions that take arguments.
 */
static inline long long curtime(void) NOPROF;
static inline int findaddr(int addr) NOPROF;
static inline int findthread(void) NOPROF;
static void stat_enter(int slot) NOPROF;
static void stat_exit(int slot) NOPROF;
static void stat_away(int slot) NOPROF;
static void stat_ret(int slot) NOPROF;
static void mywrite(int fd, char *str) NOPROF;
void profiler__asm_enter_stub(void) NOPROF;
void profiler__asm_exit_stub(void) NOPROF;
void profiler__c_enter(int *sp_1) NOPROF;
void profiler__c_exit(int *sp) NOPROF;
void profiler_savestat(void) NOPROF;

#else
#warning be sure to compile profiler.c WITHOUT -p flag
#endif
108 |
|
|
|
109 |
|
|
/*
 * Return the current value of the CPU time-stamp counter (RDTSC) as a
 * 64-bit cycle count.
 *
 * The two 32-bit halves are read from %eax and %edx explicitly instead of
 * through the "A" constraint, which only denotes the %edx:%eax pair on
 * 32-bit targets.  __asm__ is used rather than asm so the code also builds
 * in strict ISO C modes.
 */
static inline long long curtime(void)
{
	unsigned int lo, hi;

	__asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));	/* RDTSC */
	return (long long)(((unsigned long long)hi << 32) | lo);
}
116 |
|
|
|
117 |
|
|
static inline int findaddr(int addr) |
118 |
|
|
{ |
119 |
|
|
int i,j; |
120 |
|
|
|
121 |
|
|
i = j = addr % FUNCSMAX; |
122 |
|
|
do { |
123 |
|
|
if (arr[i].addr == addr) { |
124 |
|
|
return i; |
125 |
|
|
} else if (arr[i].addr == 0) { |
126 |
|
|
arr[i].addr = addr; |
127 |
|
|
return i; |
128 |
|
|
} |
129 |
|
|
i = (i+1) % FUNCSMAX; |
130 |
|
|
} while (i != j); |
131 |
|
|
core(); /* increase FUNCSMAX */ |
132 |
|
|
return(FUNCSMAX); |
133 |
|
|
} |
134 |
|
|
|
135 |
|
|
/*
 * Return the cstack[] index belonging to the calling thread, claiming a free
 * slot (tid == 0) if the thread has none yet.  Without THREADED there is a
 * single slot and the result is always 0.
 *
 * The starting slot is computed from the unsigned value of the thread id:
 * casting it to int first could yield a negative remainder and so an index
 * in front of cstack[].
 *
 * If every slot is taken, core() is invoked.
 */
static inline int findthread(void)
{
#if THREADED
	int i, j;
	pthread_t k = pthread_self();

	i = j = (int)((unsigned long)k % THREADSMAX);
	do {
		if (cstack[i].tid == k) {
			return i;
		} else if (cstack[i].tid == 0) {
			cstack[i].tid = k;
			return i;
		}
		i = (i + 1) % THREADSMAX;
	} while (i != j);

	core();			/* increase THREADSMAX */
#endif
	return 0;
}
155 |
|
|
|
156 |
|
|
static void stat_enter(int slot) |
157 |
|
|
{ |
158 |
|
|
arr[slot].enters++; |
159 |
|
|
arr[slot].timetotal -= curtime(); |
160 |
|
|
} |
161 |
|
|
|
162 |
|
|
static void stat_exit(int slot) |
163 |
|
|
{ |
164 |
|
|
arr[slot].exits++; |
165 |
|
|
arr[slot].timetotal += curtime(); |
166 |
|
|
} |
167 |
|
|
|
168 |
|
|
static void stat_away(int slot) |
169 |
|
|
{ |
170 |
|
|
arr[slot].aways++; |
171 |
|
|
arr[slot].timeoutside -= curtime(); |
172 |
|
|
} |
173 |
|
|
|
174 |
|
|
static void stat_ret(int slot) |
175 |
|
|
{ |
176 |
|
|
arr[slot].rets++; |
177 |
|
|
arr[slot].timeoutside += curtime(); |
178 |
|
|
} |
179 |
|
|
|
180 |
|
|
/*
 * Assembly entry points.  Both names are labels defined inside the asm
 * stubs in this file; they are declared here so that C code can take
 * their addresses.
 */
extern void profiler__asm_enter(void);
extern void profiler__asm_exit(void);

/* shorthand used by the asm stubs */
#define A __asm__
184 |
|
|
|
185 |
|
|
/* |
186 |
|
|
* that's really weird but compatible with both gcc2 and gcc3 |
187 |
|
|
* |
188 |
|
|
* things i don't want to care of |
189 |
|
|
* - what size on stack pusha/pops use |
190 |
|
|
* - what current function framing is |
191 |
|
|
*/ |
192 |
|
|
/*
 * Container for the real entry point.  The C function itself is never meant
 * to be called; it only hosts the labels profiler__asm_enter and .mcount,
 * which is where instrumented code calls into.
 *
 * The stub saves %eax, passes the resulting stack pointer to
 * profiler__c_enter() (so that sp_1[0] is the saved %eax and sp_1[1] the
 * return address into the instrumented function), and restores every
 * register before returning.
 *
 * Everything is emitted as ONE asm statement: the compiler does not promise
 * to keep separate asm statements consecutive, and this sequence must not
 * be interleaved with anything else.
 */
void profiler__asm_enter_stub(void)
{
	__asm__(
	"	.globl .mcount			\n"
	"	.globl profiler__asm_enter	\n"
	"profiler__asm_enter:			\n"
	".mcount:				\n"
	"	pushl %eax			\n"	/* save %eax */
	"	movl %esp,%eax			\n"	/* %eax = old %esp - 4 */
	"	pusha				\n"	/* save all registers */
	"	push %eax			\n"	/* push parameter to stack */
	"	call profiler__c_enter		\n"	/* call C routine */
	"	pop %eax			\n"	/* clear parameter from stack */
	"	popa				\n"	/* restore all registers */
	"	pop %eax			\n"	/* restore %eax */
	"	ret				\n"	/* return */
	);
}
208 |
|
|
|
209 |
|
|
/*
 * Container for the exit trampoline.  profiler__c_enter() plants the address
 * of the label profiler__asm_exit in place of a profiled function's return
 * address, so the function "returns" here.
 *
 * The stub pushes a placeholder word, saves %eax and hands the stack pointer
 * to profiler__c_exit(), which overwrites the placeholder (sp[1]) with the
 * real return address; the final ret then jumps there with all registers
 * restored.
 *
 * Everything is emitted as ONE asm statement: the compiler does not promise
 * to keep separate asm statements consecutive, and this sequence must not
 * be interleaved with anything else.
 */
void profiler__asm_exit_stub(void)
{
	__asm__(
	"profiler__asm_exit:			\n"
	"	pushl $0xdeadbeaf		\n"	/* placeholder to return address */
	"	pushl %eax			\n"	/* save %eax */
	"	movl %esp,%eax			\n"	/* %eax = addr of placeholder - 4 */
	"	pusha				\n"	/* save all registers */
	"	pushl %eax			\n"	/* push parameter to stack */
	"	call profiler__c_exit		\n"	/* call C routine */
	"	popl %eax			\n"	/* clear parameter from stack */
	"	popa				\n"	/* restore all registers */
	"	popl %eax			\n"	/* restore %eax */
	"	ret				\n"	/* return */
	);
}
223 |
|
|
|
224 |
|
|
void profiler__c_enter(int *sp_1) |
225 |
|
|
{ |
226 |
|
|
unsigned char *pc; |
227 |
|
|
int stdepth = 2, i, thr, slot, gcc2 = 1; |
228 |
|
|
|
229 |
|
|
if (sizeof(int) != 4) |
230 |
|
|
core(); /* sizeof int != 4 */ |
231 |
|
|
if (sizeof(long long) != 8) |
232 |
|
|
core(); /* sizeof long long != 8 */ |
233 |
|
|
if (sizeof(void *) != 4) |
234 |
|
|
core(); /* sizeof pointer != 4 */ |
235 |
|
|
|
236 |
|
|
pc = (char *)(sp_1[1]); |
237 |
|
|
|
238 |
|
|
pc -= 5; |
239 |
|
|
if (*pc != 0xe8) /* call <relative> */ |
240 |
|
|
core(); /* called not by 0xe8 */ |
241 |
|
|
if ((int)pc + 5 + *(int *)(pc+1) != (int)profiler__asm_enter) |
242 |
|
|
core(); /* call points not to .mcount */ |
243 |
|
|
|
244 |
|
|
if (pc[-1] == 0x53) /* push %ebx */ |
245 |
|
|
pc--, stdepth++; |
246 |
|
|
if (pc[-1] == 0x56) /* push %esi */ |
247 |
|
|
pc--, stdepth++; |
248 |
|
|
if (pc[-1] == 0x57) /* push %edi */ |
249 |
|
|
pc--, stdepth++; |
250 |
|
|
if (pc[-1] == 0x55) /* push %ebp */ |
251 |
|
|
pc--, stdepth++; |
252 |
|
|
|
253 |
|
|
if (pc[-6]==0x81 && pc[-5]==0xec && pc[-2]==0 && pc[-1]==0) { |
254 |
|
|
/* sub <dword>,%esp */ |
255 |
|
|
stdepth += *(int *)(pc - 4)/4; |
256 |
|
|
pc -= 6; |
257 |
|
|
} else if (pc[-3]==0x83 && pc[-2]==0xec && pc[-1]%4==0) { |
258 |
|
|
/* sub <byte>,%esp */ |
259 |
|
|
stdepth += pc[-1]/4; |
260 |
|
|
pc -= 3; |
261 |
|
|
} else |
262 |
|
|
gcc2 = 0; |
263 |
|
|
|
264 |
|
|
while (pc[-1] >= 0x50 && pc[-1] <= 0x57) /* push %e[reg] */ |
265 |
|
|
pc--, stdepth++; |
266 |
|
|
|
267 |
|
|
/* "pushl %ebp; movl %esp,%ebp;" */ |
268 |
|
|
if (pc[-3]==0x55 && pc[-2]==0x89 && pc[-1]==0xe5) { |
269 |
|
|
stdepth++; |
270 |
|
|
pc -= 3; |
271 |
|
|
} else if(!gcc2) |
272 |
|
|
core(); /* unknown prologue, examine x/10i pc-10 */ |
273 |
|
|
|
274 |
|
|
/* |
275 |
|
|
* Now we know that it's standard prologue, so we modify the stack |
276 |
|
|
*/ |
277 |
|
|
thr = findthread(); |
278 |
|
|
slot = findaddr((int)pc); |
279 |
|
|
|
280 |
|
|
i = cstack[thr].depth++; |
281 |
|
|
if(i >= CSTACKSIZE) |
282 |
|
|
core(); /* call stack overflow */ |
283 |
|
|
|
284 |
|
|
cstack[thr].func[i] = slot; |
285 |
|
|
cstack[thr].ret[i] = sp_1[stdepth]; |
286 |
|
|
sp_1[stdepth] = (int)profiler__asm_exit; |
287 |
|
|
|
288 |
|
|
if (i > 0) |
289 |
|
|
stat_away(cstack[thr].func[i - 1]); |
290 |
|
|
stat_enter(slot); |
291 |
|
|
} |
292 |
|
|
|
293 |
|
|
void profiler__c_exit(int *sp) |
294 |
|
|
{ |
295 |
|
|
int i, thr; |
296 |
|
|
|
297 |
|
|
thr = findthread(); |
298 |
|
|
i = --cstack[thr].depth; |
299 |
|
|
if (i < 0) |
300 |
|
|
core(); /* call stack underflow */ |
301 |
|
|
sp[1] = cstack[thr].ret[i]; |
302 |
|
|
|
303 |
|
|
stat_exit(cstack[thr].func[i]); |
304 |
|
|
if (i > 0) |
305 |
|
|
stat_ret(cstack[thr].func[i - 1]); |
306 |
|
|
#if THREADED |
307 |
|
|
else |
308 |
|
|
cstack[thr].tid = 0; /* free this stack */ |
309 |
|
|
#endif |
310 |
|
|
} |
311 |
|
|
|
312 |
|
|
#ifndef PROFILE_FILE |
313 |
|
|
#error define PROFILE_FILE where to save statistic |
314 |
|
|
#endif |
315 |
|
|
|
316 |
|
|
/*
 * Write the NUL-terminated string `str` to descriptor `fd`, continuing after
 * short writes until everything has been written.
 *
 * Best effort: the function gives up silently on a write error.  A write
 * interrupted by a signal (EINTR) is retried, and a write that makes no
 * progress (returns 0) ends the loop instead of spinning forever.
 */
static void mywrite(int fd,char *str)
{
	size_t left = strlen(str);

	while (left > 0) {
		ssize_t n = write(fd, str, left);

		if (n < 0) {
			if (errno == EINTR)
				continue;	/* nothing written yet, try again */
			return;
		}
		if (n == 0)
			return;			/* no progress possible */

		str += n;
		left -= (size_t)n;
	}
}
324 |
|
|
|
325 |
|
|
void profiler_savestat(void) |
326 |
|
|
{ |
327 |
|
|
int i, fd; |
328 |
|
|
char buf[1024]; |
329 |
|
|
|
330 |
|
|
fd = open(PROFILE_FILE,O_CREAT | O_TRUNC | O_WRONLY,0666); |
331 |
|
|
if (fd < 0) { |
332 |
|
|
mywrite(2,"open " PROFILE_FILE " failed - "); |
333 |
|
|
mywrite(2,strerror(errno)); |
334 |
|
|
mywrite(2,"\n"); |
335 |
|
|
return; |
336 |
|
|
} |
337 |
|
|
|
338 |
|
|
snprintf(buf,sizeof(buf),"\nProfiling statistic %s at time %lld:\n" |
339 |
|
|
"\n%8s %10s %10s %10s %10s %20s %20s\n",PROFILE_FILE,curtime(), |
340 |
|
|
"Function","Enters","Exits","Aways","Returns", |
341 |
|
|
"Cycles_Total","Cycles_Inside"); |
342 |
|
|
mywrite(fd,buf); |
343 |
|
|
|
344 |
|
|
for (i = 0; i < FUNCSMAX; i++) |
345 |
|
|
if (arr[i].addr) { |
346 |
|
|
snprintf(buf,sizeof(buf),"%08x %10d %10d %10d %10d %20lld %20lld\n", |
347 |
|
|
arr[i].addr,arr[i].enters,arr[i].exits, |
348 |
|
|
arr[i].aways,arr[i].rets, |
349 |
|
|
arr[i].timetotal + (arr[i].enters-arr[i].exits) * curtime(), |
350 |
|
|
arr[i].timetotal - arr[i].timeoutside + curtime() * |
351 |
|
|
(arr[i].enters-arr[i].exits-arr[i].aways+arr[i].rets)); |
352 |
|
|
mywrite(fd,buf); |
353 |
|
|
} |
354 |
|
|
|
355 |
|
|
close(fd); |
356 |
|
|
} |