1 |
dpavlin |
604 |
/* |
2 |
|
|
The Malete project - the Z39.2/Z39.50 database framework of OpenIsis. |
3 |
|
|
Version 0.9.x (patchlevel see file Version) |
4 |
|
|
Copyright (C) 2001-2003 by Erik Grziwotz, erik@openisis.org |
5 |
|
|
|
6 |
|
|
This library is free software; you can redistribute it and/or |
7 |
|
|
modify it under the terms of the GNU Lesser General Public |
8 |
|
|
License as published by the Free Software Foundation; either |
9 |
|
|
version 2.1 of the License, or (at your option) any later version. |
10 |
|
|
|
11 |
|
|
This library is distributed in the hope that it will be useful, |
12 |
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 |
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
14 |
|
|
See the GNU Lesser General Public License for more details. |
15 |
|
|
|
16 |
|
|
You should have received a copy of the GNU Lesser General Public |
17 |
|
|
License along with this library; if not, write to the Free Software |
18 |
|
|
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
19 |
|
|
|
20 |
|
|
see README for more information |
21 |
|
|
EOH */ |
22 |
|
|
|
23 |
|
|
/* |
24 |
|
|
$Id: cdx.c,v 1.9 2004/11/11 18:20:23 kripke Exp $ |
25 |
|
|
charset collation |
26 |
|
|
*/ |
27 |
|
|
|
28 |
|
|
#include "../core/core.h" |
29 |
|
|
|
30 |
|
|
/* |
31 |
|
|
We use |
32 |
|
|
- p codes 0,1,..p-1 for primaries (word and nonword) |
33 |
|
|
- v codes p..p+v-1 for secondary and tertiary level variants |
34 |
|
|
- m codes p+v..p+v+m-1 for maps (targets) |
35 |
|
|
|
36 |
|
|
We index code numbers to an array[p+v+m] of unsigned ints as code values. |
37 |
|
|
The highest bit is the nonword indicator. |
38 |
|
|
The lower nibble of the highest byte is the length of the code's byte sequence |
39 |
|
|
(for maps in #code values, else in # cleartext bytes). |
40 |
|
|
|
41 |
|
|
If the sequence does not fit into the lower 3 bytes, |
42 |
|
|
the value&0xffffff is the offset of the bytes. |
43 |
|
|
|
44 |
|
|
For primaries and variants, the bytes are cleartext. |
45 |
|
|
For maps, the bytes are code text (length given as #codes), |
46 |
|
|
where every code uses one byte, if p+v<=256, else two. |
47 |
|
|
|
48 |
|
|
With variants, we also have an array[p+v] of variant info. |
49 |
|
|
For a primary code 0..p-1, |
50 |
|
|
this is the number of secondary and tertiary variants, |
51 |
|
|
and the code of the first of the (sec+1)*(ter+1) variants - 1. |
52 |
|
|
|
53 |
|
|
For a variant code p..p+v-1, this is its secondary and tertiary weights
54 |
|
|
and the code of the associated primary. |
55 |
|
|
*/ |
56 |
|
|
|
57 |
|
|
/*
	A single code value, accessible as one word (u) or as its bytes (b).
	b.hi always aliases the most significant byte of u (nonword bit in
	0x80, sequence length in the low nibble); b.c holds up to three
	immediate bytes in memory order, so the member order depends on the
	CPU byte order.
*/
typedef union {
	unsigned u;               /* the whole code value */
	struct {
#ifdef CPU_BIG_ENDIAN
		unsigned char hi;     /* flags and length: first byte in memory */
		unsigned char c[3];   /* immediate bytes */
#else
		unsigned char c[3];   /* immediate bytes */
		unsigned char hi;     /* flags and length: last byte in memory */
#endif
	} b;
} Cv;
69 |
|
|
|
70 |
|
|
/* variant info, one entry per primary or variant code */
typedef struct Var {
	unsigned char  sec;  /* secondary variant */
	unsigned char  ter;  /* tertiary variant */
	unsigned short rel;  /* code of related variant */
} Var;
75 |
|
|
|
76 |
|
|
/*
	For the encoding, we map every byte to:
	- its code number
	- the "word" bit
	- the index of a table of possible following bytes

	To cover the BMP with UTF-8, we need 1072 tables:
	- 32 bytes 110* initiating a 2 byte sequence,
	  using one table each
	- 16 bytes 1110* initiating a 3 byte sequence,
	  each having a table with 64 second bytes 10*,
	  using 16+16*64 = 1040 tables
	  (- the 32 tables used for the UTF-16 surrogates D800-DFFF)
	CJK 3400-9FFF alone uses 432 tables (some 27.000 ideographs)
*/
typedef struct Bin { /* byte info */
	unsigned short cod;  /* byte's collation code, 0 if none assigned */
	unsigned short tab;  /* 1-based table of following bytes, 0 if none */
} Bin;

/* a full table: one Bin for each of the 256 possible byte values */
typedef Bin Bins [256];
97 |
|
|
|
98 |
|
|
/*
	table info: describes one compacted byte table, which stores
	entries for the byte values min..max only
*/
typedef struct Tab {
	unsigned       off;  /* bytes offset of Bin for min */
	unsigned char  min;  /* lowest byte value having an entry */
	unsigned char  max;  /* highest byte value having an entry */
	unsigned short unu;  /* unused */
} Tab;
104 |
|
|
|
105 |
|
|
|
106 |
|
|
/* flag bits and base types stored in Cdx.typ, plus the builtin table size */
enum {
	/* flag bits */
	BIT_VARIANTS = 0x20,
	BIT_FRENCH   = 0x10,
	/* base types, composed from the bits above */
	TYP_PLAIN    = 0,
	TYP_VARIANTS = BIT_VARIANTS,             /* TODO */
	TYP_FRENCH   = BIT_VARIANTS|BIT_FRENCH,  /* TODO */
	/* # primary codes for builtin lat1cdx */
	CVLAT1       = 104
};
114 |
|
|
|
115 |
|
|
/* header of a dumped/mapped cx */ |
116 |
|
|
struct Cdx { |
117 |
|
|
unsigned char mag[3]; /* magic MCX or mcx for mapped cdx */ |
118 |
|
|
unsigned char typ; /* base type | bits per (primary) code-1 */ |
119 |
|
|
/* currently only 8(7) and 16(15) bits supported */ |
120 |
|
|
unsigned short pri; /* p # primary codes incl. 0,1 */ |
121 |
|
|
unsigned short var; /* p + v # variants */ |
122 |
|
|
unsigned short map; /* p+v + m # maps */ |
123 |
|
|
unsigned short tab; /* # non-root tables 1..tab */ |
124 |
|
|
/* redundant offsets */ |
125 |
|
|
unsigned ovi; /* offset variant_infos */ |
126 |
|
|
unsigned otp; /* offset table_pointers - 4 */ |
127 |
|
|
unsigned siz; /* total size */ |
128 |
|
|
Bins bt0; /* root table, always full */ |
129 |
|
|
unsigned cv[CVLAT1]; /* actually Cv code_values[map=p+v+m] */ |
130 |
|
|
/* Var variant_infos[var=p+v], if BIT_VARIANTS */ |
131 |
|
|
/* Tab table_pointers[tab] */ |
132 |
|
|
/* Bin byte_tables ... */ |
133 |
|
|
/* unsigned char *more_bytes */ |
134 |
|
|
}; |
135 |
|
|
|
136 |
|
|
|
137 |
|
|
/*
	MAGIC tags a dumped cdx with the byte order it was written in.
	W(b) and N(b) build the code value of a one byte word / nonword
	sequence b, placing b where Cv.b.c[0] lives for this byte order.
*/
#ifdef CPU_BIG_ENDIAN
static const char MAGIC[3] = {'M','C','X'};
# define W(b) (0x01000000|(b)<<16)
# define N(b) (0x81000000|(b)<<16)
#else
static const char MAGIC[3] = {'m','c','x'};
# define W(b) (0x01000000|(b))
# define N(b) (0x81000000|(b))
#endif
147 |
|
|
|
148 |
|
|
#if 0
/*
	builtin Latin-1 collation (currently disabled).
	Every byte maps directly to a code (no follow-up tables);
	lower case letters and accented letters share the code of the
	upper case base letter.
	Codes: 0 unassigned, 1 tab, 2..34 blank and ASCII symbols/digits,
	35..60 A-Z, 61..70 remaining ASCII symbols, 71..101 Latin-1 symbols,
	102 multiplication sign, 103 division sign.
*/
const Cdx lat1cdx = {
	{'s','t','a'}, /* mark as static */
	TYP_PLAIN|7, CVLAT1, CVLAT1, CVLAT1, 0,0,0,0,
	{ /* byte infos */
#define B(b) {b,0}
		/* 32 C0 controls */
		B(0),B(0),B(0),B(0),B(0),B(0),B(0),B(0),
		B(0),B(0),B(0),B(0),B(0),B(0),B(0),B(0),
		B(0),B(0),B(0),B(0),B(0),B(0),B(0),B(0),
		B(0),B(0),B(0),B(0),B(0),B(0),B(0),B(0),
		/*     !     "     #     $     %     &     ' */
		B( 2),B( 3),B( 4),B( 5),B( 6),B( 7),B( 8),B( 9),
		/* (   )     *     +     ,     -     .     / */
		B(10),B(11),B(12),B(13),B(14),B(15),B(16),B(17),
		/* 0   1     2     3     4     5     6     7 */
		B(18),B(19),B(20),B(21),B(22),B(23),B(24),B(25),
		/* 8   9     :     ;     <     =     >     ? */
		B(26),B(27),B(28),B(29),B(30),B(31),B(32),B(33),
		/* @   A     B     C     D     E     F     G */
		B(34),B(35),B(36),B(37),B(38),B(39),B(40),B(41),
		B(42),B(43),B(44),B(45),B(46),B(47),B(48),B(49), /* H-O */
		B(50),B(51),B(52),B(53),B(54),B(55),B(56),B(57), /* P-W */
		/* X   Y     Z     [     \     ]     ^     _ */
		B(58),B(59),B(60),B(61),B(62),B(63),B(64),B(65),
		/* `   a     b     c     d     e     f     g */
		B(66),B(35),B(36),B(37),B(38),B(39),B(40),B(41),
		B(42),B(43),B(44),B(45),B(46),B(47),B(48),B(49), /* h-o */
		B(50),B(51),B(52),B(53),B(54),B(55),B(56),B(57), /* p-w */
		/* x   y     z     {     |     }     ~     DEL */
		B(58),B(59),B(60),B(67),B(68),B(69),B(70),B(0),
		/* 32 C1 controls */
		B(0),B(0),B(0),B(0),B(0),B(0),B(0),B(0),
		B(0),B(0),B(0),B(0),B(0),B(0),B(0),B(0),
		B(0),B(0),B(0),B(0),B(0),B(0),B(0),B(0),
		B(0),B(0),B(0),B(0),B(0),B(0),B(0),B(0),
		/* 32 mostly symbols */
		B( 2),B(71),B(72),B(73),B(74),B(75),B(76),B(77),
		B(78),B(79),B(80),B(81),B(82),B(83),B(84),B(85),
		B(86),B(87),B(88),B(89),B(90),B(91),B(92),B(93),
		B(94),B(95),B(96),B(97),B(98),B(99),B(100),B(101),
		/* 64 Latin alphas including 2 symbols */
		B(35),B(35),B(35),B(35),B(35),B(35),B(35),B(37), /* 7A,1C */
		B(39),B(39),B(39),B(39),B(43),B(43),B(43),B(43), /* 4E,4I */
		/* N tilde collates as N (48), not O as it did before */
		B(54),B(48),B(49),B(49),B(49),B(49),B(49),B(102), /* TN5O* */
		B(49),B(55),B(55),B(55),B(55),B(59),B(54),B(53), /* O4UYTS */
		B(35),B(35),B(35),B(35),B(35),B(35),B(35),B(37), /* 7a,1c */
		B(39),B(39),B(39),B(39),B(43),B(43),B(43),B(43), /* 4e,4i */
		B(54),B(48),B(49),B(49),B(49),B(49),B(49),B(103), /* tn5o% */
		B(49),B(55),B(55),B(55),B(55),B(59),B(54),B(59) /* o4uyty */
#undef B
	},
	{ /* code values, all using immediate bytes */
		0, N(9),
		N(32),N(33),N(34),N(35),N(36),N(37),N(38),N(39), /*  !"#$%&' */
		N(40),N(41),N(42),N(43),N(44),N(45),N(46),N(47), /* ()*+,-./ */
		W(48),W(49),W(50),W(51),W(52),W(53),W(54),W(55), /* 01234567 */
		W(56),W(57),N(58),N(59),N(60),N(61),N(62),N(63), /* 89:;<=>? */
		N(64),W(65),W(66),W(67),W(68),W(69),W(70),W(71), /* @ABCDEFG */
		W(72),W(73),W(74),W(75),W(76),W(77),W(78),W(79), /* HIJKLMNO */
		W(80),W(81),W(82),W(83),W(84),W(85),W(86),W(87), /* PQRSTUVW */
		W(88),W(89),W(90),N(91),N(92),N(93),N(94),N(95), /* XYZ[\]^_ */
		N(96), /*a-z*/ N(123),N(124),N(125),N(126), /* ` {|}~ */
		N(161),N(162),N(163),N(164),N(165),N(166),N(167),
		N(168),N(169),N(170),N(171),N(172),N(173),N(174),N(175),
		N(176),N(177),N(178),N(179),N(180),N(181),N(182),N(183),
		N(184),N(185),N(186),N(187),N(188),N(189),N(190),N(191),
		N(215), N(247)
	}
}; /* lat1cdx */
#endif
219 |
|
|
|
220 |
|
|
|
221 |
|
|
/* |
222 |
|
|
encode l byte in b to key. |
223 |
|
|
key->len is the max len on input, resulting len on return. |
224 |
|
|
stop if max key len is hit or on word boundary, if words. |
225 |
|
|
return #used bytes in b. |
226 |
|
|
*/ |
227 |
|
|
int cEnc ( const Cdx *cdx, Key *key, unsigned char *b, int l, int words ) |
228 |
|
|
{ |
229 |
|
|
const unsigned char * const base = (unsigned char *)cdx; |
230 |
|
|
const Bin * const bt0 = cdx->bt0; |
231 |
|
|
const Tab * const tp = (Tab*)(base + cdx->otp); |
232 |
|
|
const int bits = 1+(0xf&cdx->typ), mapcodeshift = 256<cdx->var ? 1 : 0; |
233 |
|
|
const unsigned char *e = b+l; |
234 |
|
|
int bitsleft = 8*key->len, pfxlen = 0; |
235 |
|
|
int unassigned = 0; |
236 |
|
|
Bin seq[CDX_MAXSEQ], *top; /* current sequence */ |
237 |
|
|
unsigned u, len; |
238 |
|
|
unsigned short code; |
239 |
|
|
Cv cv; |
240 |
|
|
|
241 |
|
|
key->len = 0; |
242 |
|
|
for ( ; b < e && bits <= bitsleft; b++ ) { |
243 |
|
|
eRr(LOG_TRACE, "byte %c of %d bits %d/%d", *b, e-b, bitsleft, bits); |
244 |
|
|
/* traverse byte info tables to find longest matching sequence */ |
245 |
|
|
for (*(top = seq) = bt0[*b]; top->tab && b < e; ) { |
246 |
|
|
const Tab * const t = tp + top->tab; |
247 |
|
|
unsigned char n = b[1]; |
248 |
|
|
LOG_DBG(LOG_DEBUG, "table %d check %d %d-%d off %d %d", |
249 |
|
|
top->tab, n, t->min, t->max, t->off, *(int*)(base+t->off)); |
250 |
|
|
if ( n < t->min || n > t->max /* out of bounds */ |
251 |
|
|
|| !(u = ((unsigned*)(base+t->off))[n - t->min]) /* unassigned */ |
252 |
|
|
) |
253 |
|
|
break; |
254 |
|
|
*++top = *(Bin*)&u; |
255 |
|
|
b++; |
256 |
|
|
} |
257 |
|
|
for ( ; top > seq && !top->cod; top-- ) /* no complete sequence */ |
258 |
|
|
b--; |
259 |
|
|
code = top->cod; |
260 |
|
|
eRr(LOG_TRACE, "code %d", code); |
261 |
|
|
if (words) { |
262 |
|
|
if ( 1<words /* leave 1st words-1 bytes alone */ |
263 |
|
|
&& b+l-e >= words /* ok, passed it for the 1st time */ |
264 |
|
|
) { |
265 |
|
|
pfxlen = key->len; |
266 |
|
|
words = 1; |
267 |
|
|
} else if (0x80000000&cdx->cv[code]) { /* hit nonword */ |
268 |
|
|
if (pfxlen == key->len) /* skip to word */ |
269 |
|
|
continue; |
270 |
|
|
break; /* had something after prefix */ |
271 |
|
|
} |
272 |
|
|
} |
273 |
|
|
if (cdx->pri > code) { |
274 |
|
|
if ( code ) |
275 |
|
|
unassigned = 0; |
276 |
|
|
else { /* unassigned */ |
277 |
|
|
if ( unassigned ) |
278 |
|
|
continue; |
279 |
|
|
unassigned = 1; |
280 |
|
|
code = 1; |
281 |
|
|
} |
282 |
|
|
switch (bits) { |
283 |
|
|
case 16: key->byt[key->len++] = code >> 8; /* always bigend */ |
284 |
|
|
case 8: key->byt[key->len++] = (char)code; |
285 |
|
|
} |
286 |
|
|
bitsleft -= bits; |
287 |
|
|
continue; |
288 |
|
|
} |
289 |
|
|
/* if (cdx->var > top->cod) variant */ |
290 |
|
|
/* else map: */ |
291 |
|
|
cv.u = cdx->cv[code]; |
292 |
|
|
if ((len = (0xf & cv.b.hi)<<mapcodeshift)) { /* len = #codes */ |
293 |
|
|
/* mapped variants TODO */ |
294 |
|
|
if ( 0 > (bitsleft -= (len << 3)) ) |
295 |
|
|
break; |
296 |
|
|
memcpy(key->byt+key->len, ~3&len ? base+(0xffffff&cv.u) : cv.b.c, len); |
297 |
|
|
key->len += len; |
298 |
|
|
} |
299 |
|
|
} |
300 |
|
|
if (/*words &&*/ key->len == pfxlen) /* if pfxlen: found no words */ |
301 |
|
|
key->len = 0; |
302 |
|
|
|
303 |
|
|
return b+l-e; |
304 |
|
|
} /* cEnc */ |
305 |
|
|
|
306 |
|
|
|
307 |
|
|
/*
	decode key to cleartext bytes in b, which has room for l bytes.
	A new code is only started while more than CDX_MAXSEQ bytes of
	room are left; decoding also stops at a code that has no entry in
	the code value table or whose bytes would not fit.
	return #bytes written to b.
*/
int cDec ( const Cdx *cdx, unsigned char *b, int l, Key *key )
{
	const unsigned char * const base = (unsigned char *)cdx;
	const unsigned char *k = key->byt;
	unsigned char * const start = b;
	const int bits = 1+(0xf&cdx->typ);
	/* computed on ints: no pointer before b is formed for small l */
	const int room = l - CDX_MAXSEQ;
	int bitsleft = 8*key->len;
	Cv cv;

	for ( ; b - start < room && bits <= bitsleft; bitsleft -= bits ) {
		unsigned short v = *k++;
		unsigned n;
		if ( 16 == bits )
			v = v<<8 | *k++; /* key codes are always bigend */
		if ( v >= cdx->map ) /* not a code of this collation */
			break;
		cv.u = cdx->cv[v];
		n = 0xf & cv.b.hi; /* discard word bit */
		if ( n > (unsigned)(l - (int)(b - start)) ) /* would not fit */
			break;
		/* up to 3 bytes are immediate, longer ones live at an offset */
		memcpy(b, 3 < n ? base+(cv.u&0xffffff) : cv.b.c, n);
		b += n;
	}
	return (int)(b - start);
} /* cDec */
330 |
|
|
|
331 |
|
|
|
332 |
|
|
|
333 |
|
|
typedef struct { |
334 |
|
|
Cdx cdx; |
335 |
|
|
unsigned cvb[0x10000-CVLAT1]; |
336 |
|
|
Var *vi, vib[0x10000]; |
337 |
|
|
Tab tp[1100]; |
338 |
|
|
Bins *bt, btb[1100]; |
339 |
|
|
unsigned char *p, b[200000]; |
340 |
|
|
} CdxMake; |
341 |
|
|
|
342 |
|
|
|
343 |
|
|
/* make byte table entries */ |
344 |
|
|
static int mapSeq (CdxMake *mk, unsigned char *p, int len, |
345 |
|
|
unsigned short code, int save) |
346 |
|
|
{ |
347 |
|
|
Bins *bt = &mk->cdx.bt0; |
348 |
|
|
LOG_DBG(LOG_DEBUG, "map '%.*s' -> %d", len, p, code); |
349 |
|
|
if (save) { |
350 |
|
|
Cv *cv = (Cv*)mk->cdx.cv+code; |
351 |
|
|
if (15 < len) { |
352 |
|
|
eRr(ERR_INVAL, "sequence '%.*s' too long %d", len, p, len); |
353 |
|
|
return 1; |
354 |
|
|
} |
355 |
|
|
cv->b.hi = (0x80&save) | len; |
356 |
|
|
if (!(~3 & len)) |
357 |
|
|
memcpy(cv->b.c, p, len); |
358 |
|
|
else { |
359 |
|
|
cv->u |= mk->p - mk->b; |
360 |
|
|
memcpy(mk->p, p, len); |
361 |
|
|
mk->p += len; |
362 |
|
|
} |
363 |
|
|
} |
364 |
|
|
for (;; p++) { |
365 |
|
|
Bin *b = *bt+*p; |
366 |
|
|
if (!--len) { /* last byte -- assign code */ |
367 |
|
|
if (b->cod) { |
368 |
|
|
eRr(ERR_INVAL, "attempt to reassign code %d for %d", code, b->cod); |
369 |
|
|
return 1; |
370 |
|
|
} |
371 |
|
|
b->cod = code; |
372 |
|
|
return 0; |
373 |
|
|
} |
374 |
|
|
if (b->tab) |
375 |
|
|
bt = mk->btb + (b->tab - 1); |
376 |
|
|
else { |
377 |
|
|
bt = mk->bt++; |
378 |
|
|
b->tab = mk->bt - mk->btb; |
379 |
|
|
LOG_DBG(LOG_DEBUG, "new table %d", b->tab); |
380 |
|
|
} |
381 |
|
|
} |
382 |
|
|
return 0; |
383 |
|
|
} |
384 |
|
|
|
385 |
|
|
/** compile collation src */ |
386 |
|
|
static Cdx *cMake ( const Fld *src ) |
387 |
|
|
{ |
388 |
|
|
const Fld *eof = REND(src), *f; |
389 |
|
|
CdxMake mk; |
390 |
|
|
int skipalias = 1, mapcodeshift; |
391 |
|
|
unsigned short firstcode = 1; /* of last row */ |
392 |
|
|
unsigned bins=0, off, u; |
393 |
|
|
Cdx *cdx; |
394 |
|
|
|
395 |
|
|
LOG_DBG(LOG_DEBUG, "cMake %d", sizeof mk); |
396 |
|
|
memset(&mk, 0, sizeof mk); |
397 |
|
|
mk.vi = mk.vib; |
398 |
|
|
mk.bt = mk.btb; |
399 |
|
|
mk.p = mk.b; |
400 |
|
|
mk.cdx.cv[0] = 0x80000000; /* 0: unassigned */ |
401 |
|
|
mk.cdx.cv[1] = 0x81000009; /* 1: tab */ |
402 |
|
|
mk.cdx.pri = 2; |
403 |
|
|
|
404 |
|
|
/* first pass: gather primaries and aliases */ |
405 |
|
|
for (f = src; ++f<eof; ) if (MET_COL == f->tag) { |
406 |
|
|
unsigned char *p = (unsigned char*)f->val, *e, *t, save; |
407 |
|
|
unsigned short code; |
408 |
|
|
|
409 |
|
|
if (3 > f->len || TAB != p[1]) continue; |
410 |
|
|
e = p + f->len; |
411 |
|
|
switch (*p) { |
412 |
|
|
case 'A': if (skipalias) continue; save = 0; goto alias; |
413 |
|
|
case 'W': save = 1; break; |
414 |
|
|
case 'N': save = 0x80; break; |
415 |
|
|
case 'S': |
416 |
|
|
case 'T': |
417 |
|
|
skipalias = 1; |
418 |
|
|
default: continue; |
419 |
|
|
} |
420 |
|
|
firstcode = mk.cdx.pri; |
421 |
|
|
skipalias = 0; |
422 |
|
|
alias: |
423 |
|
|
code = firstcode; |
424 |
|
|
for (p+=2; e > p; p = t+1) { |
425 |
|
|
if (!(t = memchr(p, TAB, e-p))) t = e; |
426 |
|
|
if (t > p && mapSeq(&mk, p, t-p, code, save)) return 0; |
427 |
|
|
if (save || code < mk.cdx.pri-1) code++; |
428 |
|
|
} |
429 |
|
|
if (save) mk.cdx.pri = code; |
430 |
|
|
} |
431 |
|
|
mk.cdx.typ = TYP_PLAIN|(256 < mk.cdx.pri ? 15 : 7); |
432 |
|
|
|
433 |
|
|
mk.cdx.var = mk.cdx.pri; /* second pass: gather variants */ |
434 |
|
|
for (f = src; ++f<eof; ) if (MET_COL == f->tag) { |
435 |
|
|
unsigned char *p = (unsigned char*)f->val; |
436 |
|
|
|
437 |
|
|
if (3 > f->len || TAB != p[1]) continue; |
438 |
|
|
switch (*p) { |
439 |
|
|
case 'S': |
440 |
|
|
case 'T': |
441 |
|
|
eRr(ERR_INVAL, "this version does not support multilevel"); |
442 |
|
|
return 0; |
443 |
|
|
} |
444 |
|
|
} |
445 |
|
|
if (mk.cdx.var > mk.cdx.pri) |
446 |
|
|
mk.cdx.typ |= BIT_VARIANTS; |
447 |
|
|
|
448 |
|
|
mk.cdx.map = mk.cdx.var; /* third pass: resolve maps */ |
449 |
|
|
mapcodeshift = 256<mk.cdx.var ? 1 : 0; |
450 |
|
|
for (f = src; ++f<eof; ) if (MET_COL == f->tag) { |
451 |
|
|
unsigned short code, codes[CDX_MAXSEQ]; |
452 |
|
|
unsigned char ccodes[CDX_MAXSEQ], *pcodes; |
453 |
|
|
unsigned char *p = (unsigned char*)f->val, *e, *t; |
454 |
|
|
unsigned short n = 0; |
455 |
|
|
Bin seq[CDX_MAXSEQ], *bin; /* current sequence */ |
456 |
|
|
Cv *cv; |
457 |
|
|
|
458 |
|
|
if (3 > f->len || 'M' != p[0] || TAB != p[1]) continue; |
459 |
|
|
e = p + f->len; |
460 |
|
|
p += 2; |
461 |
|
|
if (!(t = memchr(p, TAB, e-p))) continue; |
462 |
|
|
|
463 |
|
|
for ( ; p<t && n<CDX_MAXSEQ; p++ ) { /* get n codes for map */ |
464 |
|
|
for (*(bin = seq) = mk.cdx.bt0[*p]; bin->tab && p < t;) { |
465 |
|
|
/* slightly simpler than in cEnc, since we have full tables */ |
466 |
|
|
int *ent = (int*)mk.btb[bin->tab-1]; |
467 |
|
|
if (!ent[p[1]]) break; /* unassigned */ |
468 |
|
|
*++bin = *(Bin*)(ent + *++p); |
469 |
|
|
if (seq+CDX_MAXSEQ == bin) break; |
470 |
|
|
} |
471 |
|
|
for (; bin>seq && (!bin->cod || mk.cdx.var<=bin->cod); bin--) |
472 |
|
|
p--; |
473 |
|
|
if (mk.cdx.var>bin->cod) /* ignore map codes */ |
474 |
|
|
codes[n++] = bin->cod ? bin->cod : 1; |
475 |
|
|
} |
476 |
|
|
p = t+1; |
477 |
|
|
|
478 |
|
|
/* got n codes ... */ |
479 |
|
|
if (1 == n) /* single code ~ alias */ |
480 |
|
|
code = codes[0]; |
481 |
|
|
else { /* make entry cv[mk.cdx.map++] (even for n=0) */ |
482 |
|
|
if (mapcodeshift) { |
483 |
|
|
pcodes = (unsigned char*)codes; /* native */ |
484 |
|
|
#ifndef CPU_BIG_ENDIAN |
485 |
|
|
{ unsigned short *us = codes+n; |
486 |
|
|
while (us-- > codes) *us = (*us>>8) | (*us<<8); /* mk be */ |
487 |
|
|
} |
488 |
|
|
#endif |
489 |
|
|
} else { |
490 |
|
|
for (u=n; u--;) ccodes[u] = codes[u]; |
491 |
|
|
pcodes = ccodes; |
492 |
|
|
} |
493 |
|
|
cv = (Cv*)mk.cdx.cv + mk.cdx.map; |
494 |
|
|
cv->b.hi = n; |
495 |
|
|
if (n && 0x80000000&mk.cdx.cv[codes[0]]) /* inherit from 1st code */ |
496 |
|
|
cv->b.hi |= 0x80; |
497 |
|
|
n <<= mapcodeshift; |
498 |
|
|
if (!(~3 & n)) |
499 |
|
|
memcpy(cv->b.c, pcodes, n); |
500 |
|
|
else { |
501 |
|
|
cv->u |= mk.p - mk.b; |
502 |
|
|
memcpy(mk.p, pcodes, n); |
503 |
|
|
mk.p += n; |
504 |
|
|
} |
505 |
|
|
code = mk.cdx.map++; |
506 |
|
|
} |
507 |
|
|
for (; e > p; p = t+1) { /* map 'em all to mk.cdx.map */ |
508 |
|
|
if (!(t = memchr(p, TAB, e-p))) t = e; |
509 |
|
|
if (t > p && mapSeq(&mk, p, t-p, code, 0)) return 0; |
510 |
|
|
} |
511 |
|
|
} |
512 |
|
|
/* finish: compact */ |
513 |
|
|
mk.cdx.tab = mk.bt - mk.btb; |
514 |
|
|
mk.cdx.otp = |
515 |
|
|
mk.cdx.ovi = (((char *)&((Cdx*)0)->cv) - (char*)0) /* offsetoff(cv) */ |
516 |
|
|
+ mk.cdx.map*sizeof (unsigned); |
517 |
|
|
if (mk.cdx.var > mk.cdx.pri) |
518 |
|
|
mk.cdx.otp += mk.cdx.var * sizeof (Var); |
519 |
|
|
off = mk.cdx.otp + mk.cdx.tab * sizeof (Tab); |
520 |
|
|
for (u=0; u<mk.cdx.tab; u++) /* compact tables */ { |
521 |
|
|
int *base = (int *)(mk.btb + u), *pi = base; |
522 |
|
|
mk.tp[u].off = off; |
523 |
|
|
while (!*pi) pi++; |
524 |
|
|
LOG_DBG(LOG_DEBUG, "found code %x", *pi); |
525 |
|
|
mk.tp[u].min = pi - base; |
526 |
|
|
for (pi = base+256; !*--pi; ) ; |
527 |
|
|
mk.tp[u].max = pi - base; |
528 |
|
|
bins += 1+mk.tp[u].max-mk.tp[u].min; |
529 |
|
|
off += (1+mk.tp[u].max-mk.tp[u].min)*sizeof (Bin); |
530 |
|
|
LOG_DBG(LOG_DEBUG, "table %d %d-%d", u+1, mk.tp[u].min, mk.tp[u].max); |
531 |
|
|
} |
532 |
|
|
mk.cdx.siz = off + (mk.p - mk.b); |
533 |
|
|
eRr(LOG_INFO, |
534 |
|
|
"%d primaries %d variants %d maps %d tables %d bins, off %d size %d", |
535 |
|
|
mk.cdx.pri, mk.cdx.var-mk.cdx.pri, mk.cdx.map-mk.cdx.var, mk.cdx.tab, |
536 |
|
|
bins, off, mk.cdx.siz); |
537 |
|
|
/* adjust indirect CVs by off */ { |
538 |
|
|
unsigned i = 0, mask = 0x0c000000; |
539 |
|
|
for (; i < mk.cdx.var; i++) |
540 |
|
|
if (0x0c000000 & mk.cdx.cv[i]) |
541 |
|
|
mk.cdx.cv[i] += off; |
542 |
|
|
if (mapcodeshift) mask |= 0x02000000; /* only 1 code fits */ |
543 |
|
|
for (; i < mk.cdx.map; i++) |
544 |
|
|
if (mask & mk.cdx.cv[i]) |
545 |
|
|
mk.cdx.cv[i] += off; |
546 |
|
|
} |
547 |
|
|
cdx = (Cdx*)mAlloc(mk.cdx.siz); |
548 |
|
|
memcpy(cdx, &mk.cdx, mk.cdx.ovi); |
549 |
|
|
if (cdx->var > cdx->pri) |
550 |
|
|
memcpy((char*)cdx + cdx->ovi, mk.vib, mk.cdx.var*sizeof (Var)); |
551 |
|
|
if (cdx->tab) { |
552 |
|
|
memcpy((char*)cdx + cdx->otp, mk.tp, mk.cdx.tab*sizeof (Tab)); |
553 |
|
|
for (u=0; u<mk.cdx.tab; u++) |
554 |
|
|
memcpy((char*)cdx + mk.tp[u].off, |
555 |
|
|
mk.btb[u] + mk.tp[u].min, |
556 |
|
|
(1+mk.tp[u].max-mk.tp[u].min)*sizeof (Bin)); |
557 |
|
|
} |
558 |
|
|
memcpy((char*)cdx + off, mk.b, mk.p - mk.b); |
559 |
|
|
cdx->otp -= sizeof (Tab); /* adjust 0 based */ |
560 |
|
|
|
561 |
|
|
return cdx; |
562 |
|
|
} /* cMake */ |
563 |
|
|
|
564 |
|
|
|
565 |
|
|
|
566 |
|
|
/* |
567 |
|
|
list of open shared cdx |
568 |
|
|
*/ |
569 |
|
|
typedef struct Foo Foo; |
570 |
|
|
|
571 |
|
|
struct Foo { |
572 |
|
|
char nln; /* namelen */ |
573 |
|
|
char nam[31]; |
574 |
|
|
Foo *foo; |
575 |
|
|
int ref; |
576 |
|
|
Cdx *cdx; |
577 |
|
|
FMap fm; |
578 |
|
|
}; |
579 |
|
|
#if 0 |
580 |
|
|
static Foo lat1foo = { 7, "Latin-1", 0, 1, (Cdx*)&lat1cdx }; |
581 |
|
|
static Foo *fools = &lat1foo; |
582 |
|
|
#endif |
583 |
|
|
static Foo *fools = 0; |
584 |
|
|
|
585 |
|
|
|
586 |
|
|
|
587 |
|
|
/** open or compile collation src */ |
588 |
|
|
const Cdx *cOpen (const Fld *src) |
589 |
|
|
{ |
590 |
|
|
char *nam = 0, *p; |
591 |
|
|
int nln = 0, coldef = 0; |
592 |
|
|
Foo *foo = fools; |
593 |
|
|
Cdx *cdx = 0; |
594 |
|
|
const Fld *eof, *f; |
595 |
|
|
char fname[20]; |
596 |
|
|
FMap fm; |
597 |
|
|
|
598 |
|
|
if (src) |
599 |
|
|
for (f = src, eof = REND(src); ++f < eof; ) |
600 |
|
|
if (MET_COL == f->tag) { |
601 |
|
|
coldef = 1; |
602 |
|
|
if (2 < f->len && 'C' == f->val[0] |
603 |
|
|
&& TAB == f->val[1] && TAB != f->val[2] |
604 |
|
|
) { /* named */ |
605 |
|
|
nam = f->val+2; |
606 |
|
|
nln = f->len-2; |
607 |
|
|
if ((p = memchr(nam, TAB, nln))) nln = p-nam; |
608 |
|
|
if (nln > 15) nln = 15; |
609 |
|
|
LOG_DBG(LOG_DEBUG, "collation name '%.*s'", nln, nam); |
610 |
|
|
for (; foo; foo = foo->foo) |
611 |
|
|
if (nln == foo->nln && !memcmp(nam, foo->nam, nln)) { /* got it */ |
612 |
|
|
foo->ref++; /* ref it */ |
613 |
|
|
return foo->cdx; /* ret it */ |
614 |
|
|
} |
615 |
|
|
/* TODO: try to map -- check type */ |
616 |
|
|
memset(fname, 0, sizeof fname); |
617 |
|
|
memcpy(fname, nam, nln); |
618 |
|
|
memcpy(fname+nln, ".mcx", 5); |
619 |
|
|
memset(&fm, 0, sizeof fm); |
620 |
|
|
fm.fil = FIL_NONE; |
621 |
|
|
if (!fMOpen(&fm, fname, FIL_RD)) { |
622 |
|
|
int size = fSize(fm.fil); |
623 |
|
|
fm.lim = (size + env.psz-1)>>env.psh; |
624 |
|
|
if ( (int)fm.lim != fMap(&fm, fm.lim) |
625 |
|
|
|| memcmp(MAGIC, fm.map, 3) |
626 |
|
|
|| size != (int)((Cdx*)fm.map)->siz |
627 |
|
|
) { |
628 |
|
|
eRr(ERR_TRASH, "bad coll file '%s'", fname); |
629 |
|
|
fMClose(&fm); |
630 |
|
|
} else { |
631 |
|
|
cdx = (Cdx*)fm.map; |
632 |
|
|
eRr(LOG_VERBOSE, "mapped coll '%s' %d bytes", fname, size); |
633 |
|
|
#ifndef WIN32 |
634 |
|
|
fClose(&fm.fil); /* don't need the file open */ |
635 |
|
|
#endif |
636 |
|
|
} |
637 |
|
|
} |
638 |
|
|
} |
639 |
|
|
} |
640 |
|
|
if (!cdx && (!coldef || !(cdx = cMake(src)))) |
641 |
|
|
return 0; |
642 |
|
|
if (nam) { /* foo an mmaped or named made */ |
643 |
|
|
Foo *newfoo = mAlloc(sizeof *newfoo); |
644 |
|
|
memcpy(newfoo->nam, nam, newfoo->nln = nln); |
645 |
|
|
newfoo->foo = fools; |
646 |
|
|
newfoo->ref = 1; |
647 |
|
|
newfoo->cdx = cdx; |
648 |
|
|
if (fm.map) /* was mapped */ |
649 |
|
|
newfoo->fm = fm; |
650 |
|
|
else { /* try to write */ |
651 |
|
|
file fil = FIL_NONE; |
652 |
|
|
if (!fOpen(&fil, fname, FIL_WR|FIL_CREAT|FIL_TRUNC)) { |
653 |
|
|
memcpy(cdx->mag, MAGIC, 3); |
654 |
|
|
fWrite(&fil, cdx, cdx->siz); |
655 |
|
|
fClose(&fil); |
656 |
|
|
eRr(LOG_INFO, "saved coll '%s' %d bytes", fname, cdx->siz); |
657 |
|
|
} |
658 |
|
|
memcpy(cdx->mag, "nam", 3); /* yet we keep using our selfmade copy */ |
659 |
|
|
} |
660 |
|
|
fools = newfoo; |
661 |
|
|
} |
662 |
|
|
return cdx; |
663 |
|
|
} /* cOpen */ |
664 |
|
|
|
665 |
|
|
|
666 |
|
|
/*
	release a collation obtained from cOpen.
	The first magic byte tells how cdx is owned:
	0 anonymous (freed here), 's' static (left alone),
	'n' named or the mapped magic: shared through the list of open cdx,
	released when the last reference is dropped.
*/
void cClose (Cdx *cdx)
{
	const unsigned char kind = cdx->mag[0];
	Foo *cur, *prev = 0;

	if (0 == kind) { /* internal anonymous */
		mFree(cdx);
		return;
	}
	if ('s' == kind) /* internal static */
		return;
#ifdef CPU_BIG_ENDIAN
	if ('M' != kind && 'n' != kind)
#else
	if ('m' != kind && 'n' != kind) /* mapped (always named) or internal named */
#endif
		return;

	for (cur = fools; cur; prev = cur, cur = cur->foo) {
		if (cur->cdx != cdx)
			continue;
		if (0 == --cur->ref) { /* last reference gone */
			if ('n' == kind)
				mFree(cdx);
			else
				fMClose(&cur->fm);
			/* unlink the entry from the list */
			if (prev)
				prev->foo = cur->foo;
			else
				fools = cur->foo;
			mFree(cur);
		}
		return;
	}
	/* panic time ? */
} /* cClose */