1 |
2005-05-31: Idea about reasonably fast emulation using Dynamic Translation, |
2 |
but _NOT_ binary translation. (So there is no need for individual assembly |
3 |
language backends.) |
4 |
|
5 |
I got the inspiration for this when geist (in #osdev on Freenode) said |
6 |
that he had a 10-instruction overhead per emulated instruction in his |
7 |
emulator. I went to sleep with that statement ringing in my mind, and woke |
8 |
up a few hours later. Depending on how you count, it seems to be possible |
9 |
to get down to as few as 5+n+1 instructions overhead on i386, per emulated |
10 |
instruction, where n is the number of instructions it takes to do the |
11 |
actual work (for example 7 for a simple "add"-like instruction). |
12 |
|
13 |
(On Alpha, it's about 8+n+1, or 7+n+1 if you skip an alignment-unop. |
14 |
Also, on i386, a variant with 6+n+1 instructions gives better performance |
15 |
than 5+n+1, so this is probably best to leave for the compiler to |
16 |
optimize.) |
17 |
|
18 |
------------------------------------------------------------------------------- |
19 |
|
20 |
Initial tests: a full page of 1024 add instructions followed by a return |
21 |
to the start of the page gives the following result: |
22 |
|
23 |
2.8 GHz Xeon: 168 MIPS (16.66 cycles per emulated instruction) |
24 |
[ 6 instrs in the main loop + 7 instrs for the add |
25 |
+ 1 instr for the return from the add function = 14. ] |
26 |
with gcc -O3 -fomit-frame-pointer |
27 |
|
28 |
533 MHz pca56: 14.6 MIPS (36.3 cycles per emulated instruction) |
29 |
[ 7 instrs in the main loop + 7 instrs for the add |
30 |
+ 1 instr for the return from the add function |
31 |
+ 1 unop for alignment in the main loop = 16 instrs. ] |
32 |
with ccc -fast -O4 |
33 |
|
34 |
(see new_test_1.c) |
35 |
|
36 |
------------------------------------------------------------------------------- |
37 |
|
38 |
run_one_instruction(struct cpu *cpu) |
39 |
{ |
40 |
/* |
41 |
* Get the instruction to execute, and advance the |
42 |
* program counter: |
43 |
* |
44 |
* Actually, the program counter itself isn't increased here. |
45 |
* cpu->next_instr_call can be seen as an offset into the |
46 |
* current "page". cpu->current_page can be a pointer to that |
47 |
* page. So by taking |
48 |
* |
49 |
* ((size_t)cpu->next_instr_call - (size_t)cpu->current_page |
50 |
* ) / sizeof(struct instr_call) |
51 |
* |
52 |
* we get the lowest bits of the program counter. This is |
53 |
* only necessary for jumps and at the end of a translated |
54 |
* page. |
55 |
*/ |
56 |
struct instr_call *ic = cpu->next_instr_call; |
57 |
cpu->next_instr_call ++; |
58 |
ic->f(cpu, ic); |
59 |
|
60 |
Pseudo-code for Alpha: cpu is in a0. |
61 |
move a0, s0 ; save away a0 |
62 |
lop: |
63 |
lq a1, next_instr_call(a0) ; a1 is ic |
64 |
addq a1, 64, t1 ; t1 = a1 + sizeof(struct instr_call) |
65 |
sq t1, next_instr_call(a0) ; cpu->next_instr_call ++; |
66 |
|
67 |
lq t2, f(a1) ; t2 = ic->f |
68 |
jsr ra,(t2),0 ; call ic->f(cpu, ic); |
69 |
|
70 |
move s0, a0 ; restore a0 |
71 |
+ some fuss about the global pointer |
72 |
(goto lop) |
73 |
|
74 |
On i386, perhaps: |
75 |
; assuming ebx is cpu |
76 |
mov esi, [ebx + next_instr_call] ; esi = ic = cpu->next_ic.. |
77 |
add [ebx + next_instr_call], 32 ; cpu->next_instr_call ++; |
78 |
push esi ; push ic |
79 |
push ebx ; push cpu |
80 |
call [esi + f] ; ic->f |
81 |
pop ebx ; restore cpu pointer |
82 |
pop eax ; nonsense |
83 |
loop... |
84 |
|
85 |
/* |
86 |
* If the program counter is changed because of a jump or so, |
87 |
* then cpu->next_instr_call should have been updated by |
88 |
* the 'f' function. |
89 |
* |
90 |
* If there was an exception, it could simply have been set |
91 |
* to something outside of the array. |
92 |
* |
93 |
* If we reach the end of a "translated" page, then there |
94 |
* could be a special function there as well. |
95 |
*/ |
96 |
} |
97 |
|
98 |
f could be something like: |
99 |
|
100 |
f_add(struct cpu *cpu, struct instr_call *ic) |
101 |
{ |
102 |
int32_t *a = (int32_t *) ic->arg[0]; |
103 |
int32_t *b = (int32_t *) ic->arg[1]; |
104 |
int32_t *c = (int32_t *) ic->arg[2]; |
105 |
|
106 |
*a = (*b) + (*c); |
107 |
|
108 |
In pseudo-alpha assembler: a0=cpu, a1=ic |
109 |
ld t0, 8(a1) |
110 |
ld t1, 16(a1) |
111 |
ld t2, 24(a1) |
112 |
ld t3, 0(t1) |
113 |
ld t4, 0(t2) |
114 |
addl t3,t4, t5 |
115 |
sd t5, 0(t0) |
116 |
ret |
117 |
} |
118 |
|
119 |
The arguments in the instr_call struct should be set up specifically for |
120 |
each function. An "add", as seen in the example above, usually needs two |
121 |
pointers to source values in memory, and a destination. |
122 |
|
123 |
------------------------------------------------------------------------------- |
124 |
|
125 |
Things to think about: |
126 |
|
127 |
x) Exceptions: |
128 |
need to be detected by individual functions, and when |
129 |
detected, change cpu->next_instr_call to something which |
130 |
breaks out of the main loop. |
131 |
|
132 |
x) Single-stepping |
133 |
One solution is to have multiple run-loops. One which is |
134 |
used with single-stepping, and one for fast runs. |
135 |
|
136 |
x) End of page? What is a good page size? (It must be equal or |
137 |
less than an emulated hardware page, so maybe 4KB or less.) |
138 |
|
139 |
x) Default page = filled with entries of "this needs to be |
140 |
translated" function. (An optimization is to try to translate |
141 |
a few at a time, not just one, to minimize the number of |
142 |
calls/returns from the translator function.) |
143 |
|
144 |
x) Writes to a translated page should either invalidate the entire |
145 |
page's translations, or at least those entries that are |
146 |
written to. |
147 |
|
148 |
x) Common "combinations" of instructions: |
149 |
o) Doesn't work at the end of a page. |
150 |
o) The second (and third etc) of the instructions still |
151 |
has to be translated, but still, common instructions |
152 |
can be combined. |
153 |
|
154 |
x) Keeping track of the number of executed instructions: |
155 |
Any instruction which changes the execution flow, or at |
156 |
the end of a page, or if an exception occurs, can check |
157 |
what the program counter is and compare it to the last |
158 |
value where the number of instructions was known. This |
159 |
works for fixed-size ISAs such as MIPS, anyway. |
160 |
|
161 |
------------------------------------------------------------------------------- |
162 |
|
163 |
A variant for non-fixed-size-ISAs: |
164 |
|
165 |
o) The instr_call struct can contain a field which says how many |
166 |
bytes long the instruction was. |
167 |
|
168 |
struct instr_call *ic = cpu->next_instr_call; |
169 |
cpu->next_instr_call ++; |
170 |
ic->f(cpu, ic); |
171 |
|
172 |
must then be changed into |
173 |
|
174 |
struct instr_call *ic = cpu->next_instr_call; |
175 |
cpu->next_instr_call += id->instruction_length; |
176 |
ic->f(cpu, ic); |
177 |
|
178 |
At the end of the page, there must be more than one "end of page" |
179 |
entry, to account for the various possible instruction lengths. |
180 |
|
181 |
o) There has to be one translation entry for each _byte_ of code, |
182 |
not just for each possible instruction (say, every fourth byte |
183 |
for MIPS). (Another example would be m68k, where there would |
184 |
have to be a translation entry for every other byte of code.) |
185 |
|
186 |
------------------------------------------------------------------------------- |
187 |
|
188 |
An alternative would be to have the main run-loop look like this: |
189 |
(see new_test_2.c) |
190 |
|
191 |
for (;;) { |
192 |
ic = cpu->next_instr_call++; |
193 |
ic->f(ic); |
194 |
|
195 |
ic = cpu->next_instr_call++; |
196 |
ic->f(ic); |
197 |
|
198 |
/* .. */ |
199 |
} |
200 |
|
201 |
if ic contains a pointer to the cpu struct (for those functions that need |
202 |
it; a simple "add" doesn't, for example). |
203 |
|
204 |
This results in just 5 (!) instructions overhead per emulated instruction, |
205 |
plus the code for the specific instruction (for example 8 for a simple |
206 |
"add"). objdump -d shows that the main run-loop looks like this on i386, |
207 |
if no cpu struct argument is used: |
208 |
|
209 |
080485ba <r>: |
210 |
80485ba: 53 push %ebx |
211 |
80485bb: 83 ec 08 sub $0x8,%esp |
212 |
80485be: 8b 5c 24 10 mov 0x10(%esp,1),%ebx |
213 |
80485c2: 8b 43 08 mov 0x8(%ebx),%eax ! 1 |
214 |
80485c5: 8d 48 14 lea 0x14(%eax),%ecx ! 2 |
215 |
80485c8: 89 4b 08 mov %ecx,0x8(%ebx) ! 3 |
216 |
80485cb: 89 04 24 mov %eax,(%esp,1) ! 4 |
217 |
80485ce: ff 10 call *(%eax) ! 5 |
218 |
|
219 |
where the last 5 lines are then repeated for each inlined instruction call. |
220 |
|
221 |
However, initial experiments on both Alpha and i386 hosts indicate that |
222 |
this is _slower_ in practice than ic->f(cpu, ic), even when cpu is not used. |
223 |
|
224 |
So, since passing along cpu produces faster code, and since cpu often |
225 |
_will_ be used, then the first choice is better. |
226 |
|
227 |
------------------------------------------------------------------------------- |
228 |
|