1 |
dpavlin |
42 |
/* |
2 |
|
|
Idea 2007-06-06 on how to use a C compiler + linker as a native code generation backend. |
3 |
|
|
|
4 |
|
|
gcc native_cc_ld_test.i -Wall -O3 -fomit-frame-pointer -fpeephole -fno-builtin -c |
5 |
|
|
ld native_cc_ld_test.o -o native_cc_ld_test -e f -Ttext 0x1234560 |
6 |
|
|
|
7 |
|
|
objdump -d native_cc_ld_test |
8 |
|
|
|
9 |
|
|
The text part of that binary should then be easy to just copy directly into the |
10 |
|
|
translation cache. (The -Ttext address, 0x1234560 in the example, is where I want the code
11 |
|
|
fragment to end up in the cache.) It might even be possible to only do the cc step, |
12 |
|
|
and skip the ld step, if the code is position-independent. |
13 |
|
|
|
14 |
|
|
|
15 |
|
|
A couple of tricks are used: |
16 |
|
|
|
17 |
|
|
o) Note that the cpu and ic structs only contain just enough to mimic the cpu and ic |
18 |
|
|
structs in the emulator itself. The dummy fillers are there to make sure that |
19 |
|
|
the interesting fields (reg, next_ic, and ninstrs) end up at the correct offsets. |
20 |
|
|
|
21 |
|
|
o) No #include directives are needed, if reasonable types are used (int, unsigned long long, |
22 |
|
|
etc). These will have to be detected before running the compiler. Also, this makes it |
23 |
|
|
possible to skip the preprocessor, i.e. output a .i file instead of a .c file. |
24 |
|
|
|
25 |
|
|
o) Values in the cpu struct that are used are first loaded into local variables, used, |
26 |
|
|
and then stored back before any kind of return path (e.g. in a generic load/store, |
27 |
|
|
or at the end of the function, or on a non-samepage branch). |
28 |
|
|
|
29 |
|
|
o) Delay slots are handled by setting a "condition", then executing the next instruction, |
30 |
|
|
then branching. TODO: How about instructions in delay slots which may cause exceptions? |
31 |
|
|
|
32 |
|
|
o) Samepage-branches can be implemented using C labels (goto). |
33 |
|
|
|
34 |
|
|
|
35 |
|
|
Good: |
36 |
|
|
|
37 |
|
|
o) Somewhat portable. The same mechanism could be used for amd64, Alpha, MIPS, and most likely |
38 |
|
|
several other host architectures. |
39 |
|
|
|
40 |
|
|
o) A good optimizing compiler will generate very good code, probably much better code than |
41 |
|
|
I would be able to generate manually. |
42 |
|
|
|
43 |
|
|
|
44 |
|
|
Bad: |
45 |
|
|
|
46 |
|
|
o) Very high overhead. Calling cc + ld on my laptop takes 1/30th of a second, which is quite high. |
47 |
|
|
On my older Alpha workstation, it takes about 1/10th of a second. This means that the |
48 |
|
|
mechanism which decides whether or not to actually natively translate a block of code must
49 |
|
|
take into account how much the overhead is vs how much time will be saved etc. |
50 |
|
|
|
51 |
|
|
*/ |
52 |
|
|
|
53 |
|
|
struct cpu;

/*
 *  Cut-down copy of the emulator's "ic" struct (see the notes at the top
 *  of this file): a handler function pointer followed by three long-sized
 *  arguments.
 */
struct ic {
	/*  Handler; receives the cpu and a pointer to an ic entry.  */
	void	(*f)(struct cpu *, struct ic *);

	/*  Handler arguments; not touched by the code in this file.  */
	long	arg[3];
};
59 |
|
|
|
60 |
|
|
/*
 *  Cut-down copy of the emulator's cpu struct. As described in the notes
 *  at the top of this file, the dummy members are pure padding whose only
 *  job is to place reg, next_ic and ninstrs at the offsets they have in
 *  the real struct; the padding sizes must therefore not be changed
 *  independently of the emulator.
 */
struct cpu {
	char		dummy[800];		/*  padding  */
	int		reg[32];		/*  guest registers  */

	char		dummy2[80];		/*  padding  */
	struct ic	*next_ic;		/*  next ic entry to run  */

	char		dummy3[120];		/*  padding  */
	int		ninstrs;		/*  instruction counter  */

	/*
	 *  Host page pointer tables with 2^20 entries each. f() indexes
	 *  host_store with (address >> 12) and treats a null entry as
	 *  "cannot store directly". host_load is not used in this file;
	 *  presumably it is the load-side counterpart (TODO confirm).
	 */
	void		*host_load[1 << 20];
	void		*host_store[1 << 20];
};
70 |
|
|
|
71 |
|
|
void f(struct cpu *cpu, struct ic *ic) |
72 |
|
|
{ |
73 |
|
|
int cond0; |
74 |
|
|
void (*g0)(struct cpu *, struct ic *) = (void (*)(struct cpu *, struct ic *)) 0x123801234560ULL; |
75 |
|
|
|
76 |
|
|
unsigned int r2 = cpu->reg[2]; |
77 |
|
|
unsigned int r3 = cpu->reg[3]; |
78 |
|
|
unsigned int r4 = cpu->reg[4]; |
79 |
|
|
unsigned int r9 = cpu->reg[9]; |
80 |
|
|
|
81 |
|
|
unsigned int addr0; |
82 |
|
|
unsigned char *page0; |
83 |
|
|
|
84 |
|
|
unsigned int ninstrs = cpu->ninstrs; |
85 |
|
|
|
86 |
|
|
ninstrs --; |
87 |
|
|
|
88 |
|
|
L0: |
89 |
|
|
|
90 |
|
|
/* st.b r3,r0,r2 */ |
91 |
|
|
addr0 = r2; |
92 |
|
|
page0 = (unsigned char *) cpu->host_store[addr0 >> 12]; |
93 |
|
|
if (page0 == (void *)0) { |
94 |
|
|
cpu->reg[2] = r2; |
95 |
|
|
cpu->reg[3] = r3; |
96 |
|
|
cpu->reg[4] = r4; |
97 |
|
|
cpu->reg[9] = r9; |
98 |
|
|
cpu->ninstrs = ninstrs; |
99 |
|
|
g0(cpu, ic + 0); |
100 |
|
|
return; |
101 |
|
|
} |
102 |
|
|
|
103 |
|
|
page0[addr0 & 0xfff] = r3; |
104 |
|
|
|
105 |
|
|
ninstrs ++; |
106 |
|
|
|
107 |
|
|
/* addu r2,r2,1 */ |
108 |
|
|
r2 = r2 + 1; |
109 |
|
|
ninstrs ++; |
110 |
|
|
|
111 |
|
|
/* or r9,r0,r4 */ |
112 |
|
|
r9 = r4; |
113 |
|
|
ninstrs ++; |
114 |
|
|
|
115 |
|
|
/* bcnd.n gt0,r9,L0 */ |
116 |
|
|
/* subu r4, r4, 1 */ |
117 |
|
|
cond0 = (int)r9 > 0; |
118 |
|
|
r4 = r4 - 1; |
119 |
|
|
ninstrs += 2; |
120 |
|
|
if (cond0) |
121 |
|
|
goto L0; |
122 |
|
|
|
123 |
|
|
cpu->reg[2] = r2; |
124 |
|
|
cpu->reg[3] = r3; |
125 |
|
|
cpu->reg[4] = r4; |
126 |
|
|
cpu->reg[9] = r9; |
127 |
|
|
cpu->ninstrs = ninstrs; |
128 |
|
|
|
129 |
|
|
cpu->next_ic = ic + 5; |
130 |
|
|
} |