/[hyperestraier]/upstream/0.5.2/estraier.c
This is a repository of my old source code, which is no longer updated. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /upstream/0.5.2/estraier.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 9 - (show annotations)
Wed Aug 3 15:21:15 2005 UTC (18 years, 10 months ago) by dpavlin
File MIME type: text/plain
File size: 132086 byte(s)
import upstream version 0.5.2

1 /*************************************************************************************************
2 * Implementation of the core API
3 * Copyright (C) 2004-2005 Mikio Hirabayashi
4 * This file is part of Hyper Estraier.
5 * Hyper Estraier is free software; you can redistribute it and/or modify it under the terms of
6 * the GNU Lesser General Public License as published by the Free Software Foundation; either
7 * version 2.1 of the License or any later version. Hyper Estraier is distributed in the hope
8 * that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
10 * License for more details.
11 * You should have received a copy of the GNU Lesser General Public License along with Hyper
12 * Estraier; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
13 * Boston, MA 02111-1307 USA.
14 *************************************************************************************************/
15
16
17 #include "estraier.h"
18 #include "myconf.h"
19
/* sizes of working buffers and general allocation parameters */
20 #define ESTNUMBUFSIZ 32 /* size of a buffer for a number */
21 #define ESTPATHBUFSIZ 4096 /* size of a buffer for a path */
22 #define ESTIOBUFSIZ 8192 /* size of a buffer for I/O */
23 #define ESTALLOCUNIT 1024 /* unit number of memory allocation */
24 #define ESTMINIBNUM 31 /* bucket number of map for attributes */
25
/* the meta database and the keys of the records stored in it */
26 #define ESTMETADBNAME "_meta" /* name of the meta database */
27 #define ESTKEYIDXNUM "_idxnum" /* key for the number of inverted indexes */
28 #define ESTKEYDSEQ "_dseq" /* key for the sequence for document IDs */
29 #define ESTKEYDNUM "_dnum" /* key for the number of documents */
30 #define ESTKEYAMODE "_amode" /* key for the mode of text analyzer */
31 #define ESTKEYMETA "_meta" /* key for meta data */
32
/* the inverted index: tuning for the writer and cache numbers for the reader */
33 #define ESTIDXDBNAME "_idx" /* name of the inverted index */
34 #define ESTIDXDBLRM 77 /* records in a leaf node of the inverted index */
35 #define ESTIDXDBNIM 160 /* records in a non-leaf node of the inverted index */
36 #define ESTIDXDBLCN 16 /* number of leaf cache of the inverted index */
37 #define ESTIDXDBNCN 16 /* number of non-leaf cache of the inverted index */
38 #define ESTIDXDBRLCN 128 /* number of leaf cache of the index reader */
39 #define ESTIDXDBRNCN 64 /* number of non-leaf cache of the index reader */
40
/* the database for forward matching */
41 #define ESTFWMDBNAME "_fwm" /* name of the database for forward matching */
42 #define ESTFWMDBLRM 111 /* records in a leaf node of forward matching DB */
43 #define ESTFWMDBNIM 110 /* records in a non-leaf node of forward matching DB */
44 #define ESTFWMDBLCN 32 /* number of leaf cache of forward matching DB */
45 #define ESTFWMDBNCN 16 /* number of non-leaf cache of forward matching DB */
46
/* the database for attributes */
47 #define ESTATTRDBNAME "_attr" /* name of the database for attributes */
48 #define ESTATTRDBBNUM 122869 /* bucket number of the database for attributes */
49 #define ESTATTRDBDNUM 3 /* division number of the database for attributes */
50 #define ESTATTRDBALN -5 /* alignment of the database for attributes */
51
/* the database of texts */
52 #define ESTTEXTDBNAME "_text" /* name of the database of texts */
53 #define ESTTEXTDBBNUM 30713 /* bucket number of the database for texts */
54 #define ESTTEXTDBDNUM 7 /* division number of the database for texts */
55 #define ESTTEXTDBALN -5 /* alignment of the database for texts */
56
/* the database of the document list */
57 #define ESTLISTDBNAME "_list" /* name of the database of document list */
58 #define ESTLISTDBLRM 99 /* records in a leaf node of document list DB */
59 #define ESTLISTDBNIM 200 /* records in a non-leaf node of document list DB */
60 #define ESTLISTDBLCN 32 /* number of leaf cache of document list DB */
61 #define ESTLISTDBNCN 16 /* number of non-leaf cache of document list DB */
62
/* in-memory caches held by a database object */
63 #define ESTIDXCCBNUM 524288 /* bucket number of cache for the inverted index */
64 #define ESTIDXCCMAX (1048576*64) /* max size of the cache */
65 #define ESTOUTCCBNUM 131072 /* bucket number of cache for deleted documents */
66 #define ESTKEYCCMNUM 65536 /* bucket number of cache for keys for TF-IDF */
67 #define ESTATTRCCMNUM 8192 /* number of cache for attributes */
68 #define ESTTEXTCCMNUM 1024 /* number of cache for texts */
69 #define ESTCCCBFREQ 10000 /* frequency of callback for flushing words */
70
/* miscellaneous parameters */
71 #define ESTDIRMODE 00755 /* permission of a creating directory */
72 #define ESTICCHECKSIZ 32768 /* size of checking character code */
73 #define ESTICMISSMAX 256 /* allowance number of missing characters */
74 #define ESTICALLWRAT 0.001 /* allowance ratio of missing characters */
75 #define ESTZCOMPLEVEL 5 /* level of compression of zlib */
76 #define ESTOCPOINT 10 /* point per occurrence */
77 #define ESTJHASHNUM 251 /* hash number for a junction */
78 #define ESTWORDMAXLEN 48 /* maximum length of a word */
79 #define ESTWORDAVGLEN 8 /* average length of a word */
80 #define ESTKEYSCALW 4 /* allowance ratio of TF-IDF for keywords */
81 #define ESTMEMIRATIO 1.1 /* incremental ratio of memory allocation */
82
/* parameters of similarity search */
83 #define ESTSMLRKNUM 16 /* number of keywords to get candidates */
84 #define ESTSMLRUNUM 1024 /* number of adopted documents for a keyword */
85 #define ESTSMLRNMIN 0.5 /* the minimum value for narrowing */
86
/* The result of `est_char_category' is compared against these values, for example when the
   snippet code counts an ESTEASTALPH character as two columns and any other as one. */
87 enum { /* enumeration for character categories */
88 ESTSPACECHR, /* space characters */
89 ESTDELIMCHR, /* delimiter characters */
90 ESTWESTALPH, /* west alphabets */
91 ESTEASTALPH /* east alphabets */
92 };
93
/* `est_db_open' starts from ESTAMNORMAL and selects ESTAMPERFNG when the meta database has no
   stored mode and ESTDBPERFNG is given in the open mode. */
94 enum { /* enumeration for text analyzer modes */
95 ESTAMNORMAL, /* normal */
96 ESTAMPERFNG /* perfect N-gram */
97 };
98
99 typedef struct { /* type of structure for a hitting object */
100 int id; /* ID of a document */
101 int score; /* score tuned by TF-IDF */
102 char *value; /* value of an attribute for sorting */
103 } ESTSCORE;
104
105 typedef struct { /* type of structure for a conditional attribute */
106 char *name; /* name */
107 int nsiz; /* size of the name */
108 char *oper; /* operator */
109 char *val; /* value */
110 int vsiz; /* size of the value */
111 const char *cop; /* canonical operator */
112 int sign; /* positive or negative */
113 char *sval; /* value of small cases (presumably the lower-cased value; confirm at its use) */
114 int ssiz; /* size of the small value */
115 time_t num; /* numeric value */
116 } ESTCATTR;
117
118 typedef struct { /* type of structure for a keyword and its score */
119 const char *word; /* face of keyword */
120 int wsiz; /* size of the keyword */
121 int pt; /* score tuned by TF-IDF */
122 } ESTKEYSC;
123
124
125 /* private function prototypes */
/* helpers working on raw text, characters and phrases */
126 static int est_enc_miss(const char *ptr, int size, const char *icode, const char *ocode);
127 static void est_normalize_text(unsigned char *utext, int size, int *sp);
128 static void est_canonicalize_text(unsigned char *utext, int size, int funcspc);
129 static int est_char_category(int c);
130 static int est_char_category_perfng(int c);
131 static char *est_phrase_from_thumb(const char *sphrase);
132 static void est_snippet_add_text(const unsigned char *rtext, const unsigned char *ctext,
133 int size, int awsiz, CBDATUM *res, const CBLIST *rwords);
134 static int est_str_fwmatch_wide(const unsigned char *haystack, int hsiz,
135 const unsigned char *needle, int nsiz);
/* operations on an inverted index handle (ESTIDX) */
136 static ESTIDX *est_idx_open(const char *name, int omode, int dnum);
137 static int est_idx_close(ESTIDX *idx);
138 static void est_idx_set_tuning(ESTIDX *idx, int lrecmax, int nidxmax, int lcnum, int ncnum);
139 static void est_idx_increment(ESTIDX *idx);
140 static int est_idx_add(ESTIDX *idx, const char *word, int wsiz, const char *vbuf, int vsiz);
141 static int est_idx_out(ESTIDX *idx, const char *word, int wsiz);
142 static char *est_idx_get(ESTIDX *idx, const char *word, int wsiz, int *sp);
143 static int est_idx_vsiz(ESTIDX *idx, const char *word, int wsiz);
144 static int est_idx_num(ESTIDX *idx);
145 static int est_idx_size(ESTIDX *idx);
146 static int est_idx_sync(ESTIDX *idx);
147 static int est_idx_optimize(ESTIDX *idx);
148 static void est_idx_set_current(ESTIDX *idx);
/* internal operations on a database object (ESTDB) */
149 static int est_db_write_meta(ESTDB *db);
150 static void est_db_inform(ESTDB *db, const char *info);
151 static int est_db_used_cache_size(ESTDB *db);
152 static void est_db_prepare_meta(ESTDB *db);
153 static CBLIST *est_phrase_terms(const char *phrase);
/* comparison functions with the (const void *, const void *) signature */
154 static int est_score_compare_by_id(const void *ap, const void *bp);
155 static int est_score_compare_by_score(const void *ap, const void *bp);
156 static int est_score_compare_by_str_asc(const void *ap, const void *bp);
157 static int est_score_compare_by_str_desc(const void *ap, const void *bp);
158 static int est_score_compare_by_num_asc(const void *ap, const void *bp);
159 static int est_score_compare_by_num_desc(const void *ap, const void *bp);
/* search routines producing or filtering arrays of ESTSCORE */
160 static ESTSCORE *est_search_uvset(ESTDB *db, int *nump, CBMAP *hints, int add);
161 static void est_expand_word(ESTDB *db, const char *word, CBLIST *list);
162 static ESTSCORE *est_search_union(ESTDB *db, const char *term, int gstep,
163 int *nump, CBMAP *hints, int add);
164 static int est_narrow_scores(ESTDB *db, const CBLIST *attrs, const char *order,
165 ESTSCORE *scores, int snum);
166 static int est_match_attr(const char *tval, int tsiz, const char *cop, int sign,
167 const char *oval, int osiz, const char *sval, int ssiz, int onum);
168 static int est_keysc_compare(const void *ap, const void *bp);
169 static ESTSCORE *est_search_similar(ESTDB *db, CBMAP *svmap, int *nump,
170 int knum, int unum, int tfidf, double nmin);
/* keyword vectors and arithmetic on integer vectors */
171 static CBMAP *est_phrase_vector(const char *phrase);
172 static CBMAP *est_get_tvmap(ESTDB *db, int id, int vnum, int tfidf);
173 static void est_set_svec(CBMAP *svmap, int *svec, int vnum);
174 static void est_set_tvec(CBMAP *svmap, CBMAP *tvmap, int *tvec, int vnum);
175 static double est_vec_abs(const int *vec, int vnum);
176 static double est_vec_iprod(const int *avec, const int *bvec, int vnum);
177 static double est_vec_cos(const int *avec, const int *bvec, int vnum);
178 static void est_random_fclose(void);
179
180
181
182 /*************************************************************************************************
183 * common settings
184 *************************************************************************************************/
185
186
187 /* version of Hyper Estraier */
188 const char *est_version = _EST_VERSION;
189
190
191
192 /*************************************************************************************************
193 * API for document
194 *************************************************************************************************/
195
196
197 /* Create a document object. */
198 ESTDOC *est_doc_new(void){
199 ESTDOC *doc;
200 CB_MALLOC(doc, sizeof(ESTDOC));
201 doc->id = -1;
202 doc->attrs = NULL;
203 doc->dtexts = NULL;
204 return doc;
205 }
206
207
208 /* Create a document object made from draft data. */
209 ESTDOC *est_doc_new_from_draft(const char *draft){
210 ESTDOC *doc;
211 CBLIST *lines;
212 const char *line;
213 char *pv;
214 int i;
215 assert(draft);
216 doc = est_doc_new();
217 lines = cbsplit(draft, -1, "\n");
218 for(i = 0; i < CB_LISTNUM(lines); i++){
219 line = CB_LISTVAL(lines, i, NULL);
220 while(*line > '\0' && *line <= ' '){
221 line++;
222 }
223 if(line[0] == '\0'){
224 i++;
225 break;
226 }
227 if((pv = strchr(line, '=')) != NULL){
228 *(pv++) = '\0';
229 est_doc_add_attr(doc, line, pv);
230 }
231 }
232 for(; i < CB_LISTNUM(lines); i++){
233 line = CB_LISTVAL(lines, i, NULL);
234 if(line[0] == '\t'){
235 est_doc_add_hidden_text(doc, line + 1);
236 } else {
237 est_doc_add_text(doc, line);
238 }
239 }
240 cblistclose(lines);
241 return doc;
242 }
243
244
245 /* Destroy a document object. */
246 void est_doc_delete(ESTDOC *doc){
247 assert(doc);
248 if(doc->dtexts) cblistclose(doc->dtexts);
249 if(doc->attrs) cbmapclose(doc->attrs);
250 free(doc);
251 }
252
253
254 /* Add an attribute to a document object. */
255 void est_doc_add_attr(ESTDOC *doc, const char *name, const char *value){
256 char *rbuf, *wp;
257 assert(doc && name);
258 if(name[0] == '\0') return;
259 if(!doc->attrs) doc->attrs = cbmapopenex(ESTMINIBNUM);
260 if(value){
261 rbuf = cbmemdup(value, -1);
262 for(wp = rbuf; *wp != '\0'; wp++){
263 if(*wp > 0 && *wp < ' ') *wp = ' ';
264 }
265 cbstrsqzspc(rbuf);
266 cbmapputvbuf(doc->attrs, name, strlen(name), rbuf, strlen(rbuf));
267 } else {
268 cbmapout(doc->attrs, name, -1);
269 }
270 }
271
272
273 /* Add a sentence of text to a document object. */
274 void est_doc_add_text(ESTDOC *doc, const char *text){
275 unsigned char *utext;
276 char *rtext, *wp;
277 int size;
278 assert(doc && text);
279 while(*text > '\0' && *text <= ' '){
280 text++;
281 }
282 if(text[0] == '\0') return;
283 if(!doc->dtexts) doc->dtexts = cblistopen();
284 utext = (unsigned char *)est_uconv_in(text, strlen(text), &size);
285 est_normalize_text(utext, size, &size);
286 rtext = est_uconv_out((char *)utext, size, NULL);
287 for(wp = rtext; *wp != '\0'; wp++){
288 if(*wp > 0 && *wp < ' ') *wp = ' ';
289 }
290 cbstrsqzspc(rtext);
291 if(rtext[0] != '\0'){
292 cblistpushbuf(doc->dtexts, rtext, strlen(rtext));
293 } else {
294 free(rtext);
295 }
296 free(utext);
297 }
298
299
300 /* Add a hidden sentence to a document object. */
301 void est_doc_add_hidden_text(ESTDOC *doc, const char *text){
302 unsigned char *utext;
303 char *rtext, *wp;
304 int size;
305 assert(doc && text);
306 while(*text > '\0' && *text <= ' '){
307 text++;
308 }
309 if(text[0] == '\0') return;
310 utext = (unsigned char *)est_uconv_in(text, strlen(text), &size);
311 est_normalize_text(utext, size, &size);
312 rtext = est_uconv_out((char *)utext, size, NULL);
313 for(wp = rtext; *wp != '\0'; wp++){
314 if(*wp > 0 && *wp < ' ') *wp = ' ';
315 }
316 cbstrsqzspc(rtext);
317 if(rtext[0] != '\0'){
318 if(!doc->attrs) doc->attrs = cbmapopenex(ESTMINIBNUM);
319 if(cbmapget(doc->attrs, "", 0, NULL)) cbmapputcat(doc->attrs, "", 0, " ", 1);
320 cbmapputcat(doc->attrs, "", 0, rtext, -1);
321 }
322 free(rtext);
323 free(utext);
324 }
325
326
327 /* Get the ID number of a document object. */
328 int est_doc_id(ESTDOC *doc){
329 assert(doc);
330 return doc->id;
331 }
332
333
334 /* Get a list of attribute names of a document object. */
335 CBLIST *est_doc_attr_names(ESTDOC *doc){
336 CBLIST *names;
337 const char *kbuf;
338 int ksiz;
339 assert(doc);
340 if(!doc->attrs) return cblistopen();
341 names = cblistopen();
342 cbmapiterinit(doc->attrs);
343 while((kbuf = cbmapiternext(doc->attrs, &ksiz)) != NULL){
344 if(ksiz > 0) cblistpush(names, kbuf, ksiz);
345 }
346 cblistsort(names);
347 return names;
348 }
349
350
351 /* Get the value of an attribute of a document object. */
352 const char *est_doc_attr(ESTDOC *doc, const char *name){
353 assert(doc && name);
354 if(!doc->attrs || name[0] == '\0') return NULL;
355 return cbmapget(doc->attrs, name, -1, NULL);
356 }
357
358
359 /* Get a list of sentences of the text of a document object. */
360 const CBLIST *est_doc_texts(ESTDOC *doc){
361 assert(doc);
362 if(!doc->dtexts) doc->dtexts = cblistopen();
363 return doc->dtexts;
364 }
365
366
367 /* Concatenate sentences of the text of a document object. */
368 char *est_doc_cat_texts(ESTDOC *doc){
369 CBDATUM *datum;
370 const char *elem;
371 int i, size;
372 if(!doc->dtexts) return cbmemdup("", 0);
373 datum = cbdatumopen("", 0);
374 for(i = 0; i < CB_LISTNUM(doc->dtexts); i++){
375 elem = CB_LISTVAL2(doc->dtexts, i, &size);
376 if(i > 0) cbdatumcat(datum, " ", 1);
377 cbdatumcat(datum, elem, size);
378 }
379 return cbdatumtomalloc(datum, NULL);
380 }
381
382
383 /* Dump draft data of a document object. */
384 char *est_doc_dump_draft(ESTDOC *doc){
385 CBLIST *list;
386 CBDATUM *datum;
387 const char *kbuf, *vbuf;
388 int i, ksiz, vsiz;
389 assert(doc);
390 datum = cbdatumopen("", 0);
391 if(doc->attrs){
392 list = est_doc_attr_names(doc);
393 for(i = 0; i < CB_LISTNUM(list); i++){
394 kbuf = CB_LISTVAL2(list, i, &ksiz);
395 vbuf = cbmapget(doc->attrs, kbuf, ksiz, &vsiz);
396 cbdatumcat(datum, kbuf, ksiz);
397 cbdatumcat(datum, "=", 1);
398 cbdatumcat(datum, vbuf, vsiz);
399 cbdatumcat(datum, "\n", 1);
400 }
401 cblistclose(list);
402 }
403 cbdatumcat(datum, "\n", 1);
404 if(doc->dtexts){
405 for(i = 0; i < CB_LISTNUM(doc->dtexts); i++){
406 kbuf = CB_LISTVAL2(doc->dtexts, i, &ksiz);
407 cbdatumcat(datum, kbuf, ksiz);
408 cbdatumcat(datum, "\n", 1);
409 }
410 }
411 if(doc->attrs && (vbuf = cbmapget(doc->attrs, "", 0, &vsiz)) != NULL){
412 cbdatumcat(datum, "\t", 1);
413 cbdatumcat(datum, vbuf, vsiz);
414 cbdatumcat(datum, "\n", 1);
415 }
416 return cbdatumtomalloc(datum, NULL);
417 }
418
419
420 /* Make a snippet of the body text of a document object.
   `doc' specifies a document object.
   `words' specifies a list of words to look for; empty elements and ESTOPUVSET are skipped.
   `wwidth' specifies the width budget of the whole result.
   `hwidth' specifies the width of the heading segment taken from the start of the text.
   `awidth' specifies the width of the text kept around each matched word.
   Widths are counted per character: 2 for an ESTEASTALPH character, 1 for any other.
   The return value is an allocated string built by `est_snippet_add_text', where every
   segment is followed by a line feed.  The caller frees it. */
421 char *est_doc_make_snippet(ESTDOC *doc, const CBLIST *words, int wwidth, int hwidth, int awidth){
422 CBDATUM *res, *sbuf;
423 CBMAP *counts;
424 CBLIST *rwords;
425 const char *text, *word, *cval;
426 const unsigned char *rword;
427 unsigned char *rtext, *ctext;
428 int i, j, k, bi, size, wsiz, rwsiz, mywidth, awsiz, csiz;
429 assert(doc && words && wwidth >= 0 && hwidth >= 0 && awidth >= 0);
430 if(!doc->dtexts) doc->dtexts = cblistopen();
431 res = cbdatumopen("", 0);
/* convert and canonicalize each search word; the converted buffers are handed to `rwords' */
432 rwords = cblistopen();
433 for(i = 0; i < CB_LISTNUM(words); i++){
434 word = CB_LISTVAL2(words, i, &wsiz);
435 if(wsiz < 1 || !strcmp(word, ESTOPUVSET)) continue;
436 rtext = (unsigned char *)est_uconv_in(word, wsiz, &size);
437 est_canonicalize_text(rtext, size, TRUE);
438 cblistpushbuf(rwords, (char *)rtext, size);
439 }
/* join all sentences with single spaces */
440 sbuf = cbdatumopen("", 0);
441 for(i = 0; i < CB_LISTNUM(doc->dtexts); i++){
442 text = CB_LISTVAL2(doc->dtexts, i, &size);
443 if(i > 0) cbdatumcat(sbuf, " ", 1);
444 cbdatumcat(sbuf, text, size);
445 }
/* `rtext' is the converted text as it is, `ctext' a canonicalized copy of the same size;
   both are walked two bytes per character below */
446 rtext = (unsigned char *)est_uconv_in(CB_DATUMPTR(sbuf), CB_DATUMSIZE(sbuf), &size);
447 ctext = (unsigned char *)cbmemdup((char *)rtext, size);
448 est_canonicalize_text(ctext, size, FALSE);
/* heading segment: `hwidth' columns, tripled when there is no word to look for,
   and never more than `wwidth' */
449 mywidth = hwidth;
450 if(CB_LISTNUM(rwords) < 1) mywidth *= 3;
451 if(mywidth > wwidth) mywidth = wwidth;
452 for(i = 0; i < size && mywidth > 0; i += 2){
453 mywidth -= est_char_category(rtext[i] * 0x100 + rtext[i+1]) == ESTEASTALPH ? 2 : 1;
454 }
/* `awsiz' is the number of bytes left after the heading, capped at ESTWORDMAXLEN */
455 awsiz = size - i;
456 if(awsiz > ESTWORDMAXLEN) awsiz = ESTWORDMAXLEN;
457 est_snippet_add_text(rtext, ctext, i, awsiz, res, rwords);
458 wwidth -= hwidth;
/* `bi' is the lowest index from which the next segment may start */
459 bi = i + 2;
460 cbdatumcat(res, "\n", 1);
/* NOTE(review): `hwidth' is not read again in this function after this assignment */
461 hwidth = 1000;
/* `counts' maps a word to a run of `*', one per segment already emitted for that word */
462 counts = cbmapopenex(ESTMINIBNUM);
463 for(i = bi; i < size && wwidth >= 0; i += 2){
464 for(j = 0; j < CB_LISTNUM(rwords); j++){
465 rword = (unsigned char *)CB_LISTVAL2(rwords, j, &rwsiz);
/* take a match only while the word has fewer than 2 segments so far (fewer than 1 once the
   remaining budget is no more than 1.2 times `awidth') */
466 if(est_str_fwmatch_wide(ctext + i, size - i, rword, rwsiz) > 0 &&
467 (!(cval = cbmapget(counts, (char *)rword, rwsiz, &csiz)) ||
468 csiz < (wwidth > awidth * 1.2 ? 2 : 1))){
469 cbmapputcat(counts, (char *)rword, rwsiz, "*", 1);
/* once every word has an entry, start counting afresh */
470 if(cbmaprnum(counts) >= CB_LISTNUM(rwords)){
471 cbmapclose(counts);
472 counts = cbmapopenex(ESTMINIBNUM);
473 }
/* walk backwards from the match for about half of `awidth', but not before `bi' */
474 mywidth = awidth / 2 + 1;
475 for(k = i - 2; k >= bi && mywidth >= 0; k -= 2){
476 mywidth -= est_char_category(rtext[k] * 0x100 + rtext[k+1]) == ESTEASTALPH ? 2 : 1;
477 }
478 bi = k;
/* walk forwards past the matched word for about half of `awidth', clamped to the text end */
479 mywidth = awidth / 2 + 1;
480 for(k = i + rwsiz + 2; k < size && mywidth >= 0; k += 2){
481 mywidth -= est_char_category(rtext[k] * 0x100 + rtext[k+1]) == ESTEASTALPH ? 2 : 1;
482 }
483 if(k > size) k = size;
484 est_snippet_add_text(rtext + bi, ctext + bi, k - bi, 0, res, rwords);
485 wwidth -= awidth + rwsiz / 2;
/* continue scanning after the emitted segment; the outer loop adds 2 to `i' again */
486 bi = k + 2;
487 i = bi - 2;
488 cbdatumcat(res, "\n", 1);
489 break;
490 }
491 }
492 }
493 cbmapclose(counts);
494 free(ctext);
495 free(rtext);
496 cbdatumclose(sbuf);
497 cblistclose(rwords);
498 return cbdatumtomalloc(res, NULL);
499 }
500
501
502 /* Check whether the text of a document object includes every specified words. */
503 int est_doc_scan_words(ESTDOC *doc, const CBLIST *words){
504 CBLIST *rwords;
505 const unsigned char *rp, *rword;
506 const char *vbuf;
507 unsigned char *rbuf;
508 int i, j, vsiz, rsiz, rwsiz, hit;
509 assert(doc && words);
510 rwords = cblistopen();
511 for(i = 0; i < CB_LISTNUM(words); i++){
512 vbuf = CB_LISTVAL2(words, i, &vsiz);
513 if(vsiz < 1 || !strcmp(vbuf, ESTOPUVSET)) continue;
514 rbuf = (unsigned char *)est_uconv_in(vbuf, vsiz, &rsiz);
515 est_canonicalize_text(rbuf, rsiz, TRUE);
516 cblistpushbuf(rwords, (char *)rbuf, rsiz);
517 }
518 if(doc->dtexts){
519 for(i = 0; i < CB_LISTNUM(doc->dtexts) && CB_LISTNUM(rwords) > 0; i++){
520 vbuf = CB_LISTVAL2(doc->dtexts, i, &vsiz);
521 rbuf = (unsigned char *)est_uconv_in(vbuf, vsiz, &rsiz);
522 est_canonicalize_text(rbuf, rsiz, TRUE);
523 for(rp = rbuf; rsiz >= 0; rp += 2, rsiz -= 2){
524 for(j = 0; j < CB_LISTNUM(rwords); j++){
525 rword = (unsigned char *)CB_LISTVAL2(rwords, j, &rwsiz);
526 if(est_str_fwmatch_wide(rp, rsiz, rword, rwsiz)){
527 free(cblistremove(rwords, j, NULL));
528 j--;
529 }
530 }
531 }
532 free(rbuf);
533 }
534 }
535 if(doc->attrs && (vbuf = cbmapget(doc->attrs, "", 0, &vsiz)) != NULL){
536 rbuf = (unsigned char *)est_uconv_in(vbuf, vsiz, &rsiz);
537 est_canonicalize_text(rbuf, rsiz, TRUE);
538 for(rp = rbuf; rsiz >= 0; rp += 2, rsiz -= 2){
539 for(i = 0; i < CB_LISTNUM(rwords); i++){
540 rword = (unsigned char *)CB_LISTVAL2(rwords, i, &rwsiz);
541 if(est_str_fwmatch_wide(rp, rsiz, rword, rwsiz)){
542 free(cblistremove(rwords, i, NULL));
543 i--;
544 }
545 }
546 }
547 free(rbuf);
548 }
549 hit = CB_LISTNUM(rwords) < 1;
550 cblistclose(rwords);
551 return hit;
552 }
553
554
555
556 /*************************************************************************************************
557 * API for search conditions
558 *************************************************************************************************/
559
560
561 /* Create a condition object. */
562 ESTCOND *est_cond_new(void){
563 ESTCOND *cond;
564 CB_MALLOC(cond, sizeof(ESTCOND));
565 cond->phrase = NULL;
566 cond->gstep = 2;
567 cond->tfidf = TRUE;
568 cond->simple = FALSE;
569 cond->attrs = NULL;
570 cond->order = NULL;
571 cond->max = -1;
572 cond->scfb = FALSE;
573 cond->scores = NULL;
574 cond->snum = 0;
575 cond->opts = 0;
576 return cond;
577 }
578
579
580 /* Destroy a condition object. */
581 void est_cond_delete(ESTCOND *cond){
582 assert(cond);
583 if(cond->scores) free(cond->scores);
584 if(cond->order) free(cond->order);
585 if(cond->attrs) cblistclose(cond->attrs);
586 if(cond->phrase) free(cond->phrase);
587 free(cond);
588 }
589
590
591 /* Set a search phrase to a condition object. */
592 void est_cond_set_phrase(ESTCOND *cond, const char *phrase){
593 assert(cond && phrase);
594 if(cond->phrase) free(cond->phrase);
595 while(*phrase > '\0' && *phrase <= ' '){
596 phrase++;
597 }
598 cond->phrase = cbmemdup(phrase, -1);
599 }
600
601
602 /* Add a condition of an attribute fo a condition object. */
603 void est_cond_add_attr(ESTCOND *cond, const char *expr){
604 assert(cond && expr);
605 while(*expr > '\0' && *expr <= ' '){
606 expr++;
607 }
608 if(*expr == '\0') return;
609 if(!cond->attrs) cond->attrs = cblistopen();
610 cblistpush(cond->attrs, expr, -1);
611 }
612
613
614 /* Set the order of a condition object. */
615 void est_cond_set_order(ESTCOND *cond, const char *expr){
616 assert(cond && expr);
617 while(*expr > '\0' && *expr <= ' '){
618 expr++;
619 }
620 if(*expr == '\0') return;
621 if(cond->order) free(cond->order);
622 cond->order = cbmemdup(expr, -1);
623 }
624
625
626 /* Set the maximum number of retrieval of a condition object. */
627 void est_cond_set_max(ESTCOND *cond, int max){
628 assert(cond && max >= 0);
629 cond->max = max;
630 }
631
632
633 /* Set options of retrieval of a condition object. */
634 void est_cond_set_options(ESTCOND *cond, int options){
635 assert(cond);
636 if(options & ESTCONDSURE) cond->gstep = 1;
637 if(options & ESTCONDUSU) cond->gstep = 2;
638 if(options & ESTCONDFAST) cond->gstep = 3;
639 if(options & ESTCONDAGIT) cond->gstep = 4;
640 if(options & ESTCONDNOIDF) cond->tfidf = FALSE;
641 if(options & ESTCONDSIMPLE) cond->simple = TRUE;
642 if(options & ESTCONDSCFB) cond->scfb = TRUE;
643 cond->opts |= options;
644 }
645
646
647
648 /*************************************************************************************************
649 * API for database
650 *************************************************************************************************/
651
652
653 /* Get the string of an error code. */
654 const char *est_err_msg(int ecode){
655 switch(ecode){
656 case ESTENOERR: return "no error";
657 case ESTEINVAL: return "invalid argument";
658 case ESTEACCES: return "access forbidden";
659 case ESTELOCK: return "lock failure";
660 case ESTEDB: return "database problem";
661 case ESTEIO: return "I/O problem";
662 case ESTENOITEM: return "no such item";
663 default: break;
664 }
665 return "miscellaneous";
666 }
667
668
669 /* Open a database. */
670 ESTDB *est_db_open(const char *name, int omode, int *ecp){
671 ESTDB *db;
672 DEPOT *metadb;
673 ESTIDX *idxdb;
674 CURIA *attrdb, *textdb;
675 VILLA *fwmdb, *listdb;
676 char path[ESTPATHBUFSIZ], vbuf[ESTNUMBUFSIZ];
677 int domode, comode, vomode, idxnum, dseq, dnum, amode, vsiz;
678 assert(name && ecp);
679 *ecp = ESTENOERR;
680 if((omode & ESTDBWRITER) && (omode & ESTDBCREAT) && !est_mkdir(name)){
681 switch(errno){
682 case EACCES:
683 *ecp = ESTEACCES;
684 return NULL;
685 case EEXIST:
686 break;
687 default:
688 *ecp = ESTEIO;
689 return NULL;
690 }
691 }
692 domode = DP_OREADER;
693 comode = CR_OREADER;
694 vomode = VL_OREADER;
695 if(omode & ESTDBWRITER){
696 domode = DP_OWRITER;
697 comode = CR_OWRITER;
698 vomode = VL_OWRITER | VL_OZCOMP;
699 if(omode & ESTDBCREAT){
700 domode |= DP_OCREAT;
701 comode |= CR_OCREAT;
702 vomode |= VL_OCREAT;
703 }
704 if(omode & ESTDBTRUNC){
705 domode |= DP_OTRUNC;
706 comode |= CR_OTRUNC;
707 vomode |= VL_OTRUNC;
708 }
709 }
710 if(omode & ESTDBNOLCK){
711 domode |= DP_ONOLCK;
712 comode |= CR_ONOLCK;
713 vomode |= VL_ONOLCK;
714 }
715 if(omode & ESTDBLCKNB){
716 domode |= DP_OLCKNB;
717 comode |= CR_OLCKNB;
718 vomode |= VL_OLCKNB;
719 }
720 idxnum = 0;
721 dseq = 0;
722 dnum = 0;
723 amode = ESTAMNORMAL;
724 sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTMETADBNAME);
725 if((metadb = dpopen(path, domode, ESTMINIBNUM)) != NULL){
726 if((vsiz = dpgetwb(metadb, ESTKEYIDXNUM, -1, 0, ESTNUMBUFSIZ - 1, vbuf)) > 0){
727 vbuf[vsiz] = '\0';
728 idxnum = atoi(vbuf);
729 }
730 if((vsiz = dpgetwb(metadb, ESTKEYDSEQ, -1, 0, ESTNUMBUFSIZ - 1, vbuf)) > 0){
731 vbuf[vsiz] = '\0';
732 dseq = atoi(vbuf);
733 }
734 if((vsiz = dpgetwb(metadb, ESTKEYDNUM, -1, 0, ESTNUMBUFSIZ - 1, vbuf)) > 0){
735 vbuf[vsiz] = '\0';
736 dnum = atoi(vbuf);
737 }
738 if((vsiz = dpgetwb(metadb, ESTKEYAMODE, -1, 0, ESTNUMBUFSIZ - 1, vbuf)) > 0){
739 vbuf[vsiz] = '\0';
740 amode = atoi(vbuf);
741 } else if(omode & ESTDBPERFNG){
742 amode = ESTAMPERFNG;
743 }
744 }
745 if(!metadb){
746 *ecp = (dpecode == DP_ELOCK) ? ESTELOCK : ESTEDB;
747 return NULL;
748 }
749 if(idxnum < 1) idxnum = 1;
750 if(dseq < 0) dseq = 0;
751 if(dnum < 0) dnum = 0;
752 sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTIDXDBNAME);
753 idxdb = est_idx_open(path, vomode, idxnum);
754 sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTFWMDBNAME);
755 fwmdb = vlopen(path, vomode, VL_CMPLEX);
756 sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTATTRDBNAME);
757 attrdb = cropen(path, comode, ESTATTRDBBNUM, ESTATTRDBDNUM);
758 sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTTEXTDBNAME);
759 textdb = cropen(path, comode, ESTTEXTDBBNUM, ESTTEXTDBDNUM);
760 sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTLISTDBNAME);
761 listdb = vlopen(path, vomode, VL_CMPLEX);
762 if(!metadb || !idxdb || !fwmdb || !attrdb ||!textdb || !listdb){
763 if(listdb) vlclose(listdb);
764 if(textdb) crclose(textdb);
765 if(attrdb) crclose(attrdb);
766 if(fwmdb) vlclose(fwmdb);
767 if(idxdb) est_idx_close(idxdb);
768 dpclose(metadb);
769 *ecp = ESTEDB;
770 return NULL;
771 }
772 if(omode & ESTDBWRITER){
773 crsetalign(attrdb, ESTATTRDBALN);
774 crsetalign(textdb, ESTTEXTDBALN);
775 est_idx_set_tuning(idxdb, ESTIDXDBLRM, ESTIDXDBNIM, ESTIDXDBLCN, ESTIDXDBNCN);
776 est_idx_set_current(idxdb);
777 vlsettuning(fwmdb, ESTFWMDBLRM, ESTFWMDBNIM, ESTFWMDBLCN, ESTFWMDBNCN);
778 vlsettuning(listdb, ESTLISTDBLRM, ESTLISTDBNIM, ESTLISTDBLCN, ESTLISTDBNCN);
779 } else {
780 est_idx_set_tuning(idxdb, -1, -1, ESTIDXDBRLCN, ESTIDXDBRNCN);
781 vlsettuning(fwmdb, -1, -1, ESTFWMDBLCN, ESTFWMDBNCN);
782 vlsettuning(listdb, -1, -1, ESTLISTDBLCN, ESTLISTDBNCN);
783 }
784 CB_MALLOC(db, sizeof(ESTDB));
785 db->name = cbmemdup(name, -1);
786 db->metadb = metadb;
787 db->idxdb = idxdb;
788 db->fwmdb = fwmdb;
789 db->attrdb = attrdb;
790 db->textdb = textdb;
791 db->listdb = listdb;
792 db->ecode = ESTENOERR;
793 db->fatal = FALSE;
794 db->dseq = dseq;
795 db->dnum = dnum;
796 db->amode = amode;
797 if(omode & ESTDBWRITER){
798 db->idxcc = cbmapopenex(ESTIDXCCBNUM);
799 db->icsiz = 0;
800 db->icmax = ESTIDXCCMAX;
801 db->outcc = cbmapopenex(ESTOUTCCBNUM);
802 } else {
803 db->idxcc = cbmapopenex(1);
804 db->icsiz = 0;
805 db->icmax = 0;
806 db->outcc = cbmapopenex(1);
807 }
808 db->keycc = cbmapopenex(ESTKEYCCMNUM + 1);
809 db->kcmnum = ESTKEYCCMNUM;
810 db->attrcc = cbmapopenex(ESTATTRCCMNUM + 1);
811 db->acmnum = ESTATTRCCMNUM;
812 db->textcc = cbmapopenex(ESTTEXTCCMNUM + 1);
813 db->tcmnum = ESTTEXTCCMNUM;
814 db->spacc = NULL;
815 db->scmnum = 0;
816 db->scname = NULL;
817 db->cbinfo = NULL;
818 db->cbvec = NULL;
819 db->vecdata = NULL;
820 db->metacc = NULL;
821 return db;
822 }
823
824
825 /* Close a database. */
826 int est_db_close(ESTDB *db, int *ecp){
827 int err;
828 assert(db && ecp);
829 *ecp = ESTENOERR;
830 err = FALSE;
831 if(dpwritable(db->metadb)){
832 if(!est_db_flush(db, -1) || !est_db_write_meta(db)) err = TRUE;
833 }
834 est_db_inform(db, "closing");
835 if(db->metacc) cbmapclose(db->metacc);
836 if(db->spacc){
837 free(db->scname);
838 cbmapclose(db->spacc);
839 }
840 cbmapclose(db->textcc);
841 cbmapclose(db->attrcc);
842 cbmapclose(db->keycc);
843 cbmapclose(db->outcc);
844 cbmapclose(db->idxcc);
845 if(!vlclose(db->listdb)) err = TRUE;
846 if(!crclose(db->textdb)) err = TRUE;
847 if(!crclose(db->attrdb)) err = TRUE;
848 if(!vlclose(db->fwmdb)) err = TRUE;
849 if(!est_idx_close(db->idxdb)) err = TRUE;
850 if(!dpclose(db->metadb)) err = TRUE;
851 free(db->name);
852 if(db->fatal){
853 *ecp = db->ecode;
854 err = TRUE;
855 } else if(err){
856 *ecp = ESTEDB;
857 }
858 free(db);
859 return err ? FALSE : TRUE;
860 }
861
862
863 /* Get the last happended error code of a database. */
864 int est_db_error(ESTDB *db){
865 assert(db);
866 return db->ecode;
867 }
868
869
870 /* Check whether a database has a fatal error. */
871 int est_db_fatal(ESTDB *db){
872 assert(db);
873 return db->fatal;
874 }
875
876
/* Flush index words in the cache of a database.
   `db' specifies a database object.  If its meta database is not writable, `ecode' is set to
   ESTEACCES and FALSE is returned.
   `max' specifies the maximum number of cached words to write out.  If it is not more than 0,
   every cached word is written.  The cache of removed documents (`outcc') is processed only
   when `max' is negative.
   The return value is TRUE on success.  On failure `ecode' is set to ESTEDB, the `fatal' flag
   is raised and FALSE is returned. */
int est_db_flush(ESTDB *db, int max){
  CBMAP *ids;
  CBLIST *keys;
  CBDATUM *nval;
  const char *kbuf, *vbuf, *rp, *pv;
  char *tbuf;
  int i, err, ksiz, vsiz, rnum, id, tsiz;
  assert(db);
  if(!dpwritable(db->metadb)){
    db->ecode = ESTEACCES;
    return FALSE;
  }
  /* nothing is pending in either cache */
  if(cbmaprnum(db->idxcc) < 1 && cbmaprnum(db->outcc) < 1) return TRUE;
  err = FALSE;
  /* collect the cached words and handle them in sorted order */
  keys = cblistopen();
  cbmapiterinit(db->idxcc);
  while((kbuf = cbmapiternext(db->idxcc, &ksiz)) != NULL){
    cblistpush(keys, kbuf, ksiz);
  }
  /* remember the full number of cached words; it sizes the replacement cache below */
  rnum = CB_LISTNUM(keys);
  cblistsort(keys);
  if(max > 0){
    /* keep only the first `max' words of the sorted list */
    while(CB_LISTNUM(keys) > max){
      free(cblistpop(keys, NULL));
    }
  }
  /* move each selected word from the cache into the inverted index */
  for(i = 0; i < CB_LISTNUM(keys); i++){
    kbuf = CB_LISTVAL2(keys, i, &ksiz);
    vbuf = cbmapget(db->idxcc, kbuf, ksiz, &vsiz);
    if(!est_idx_add(db->idxdb, kbuf, ksiz, vbuf, vsiz)){
      err = TRUE;
      break;
    }
    cbmapout(db->idxcc, kbuf, ksiz);
    db->icsiz -= vsiz;
    if(i % ESTCCCBFREQ == 0) est_db_inform(db, "flushing index words");
  }
  /* register the same words in the database for forward matching; an already existing key
     (DP_EKEEP) is not an error */
  for(i = 0; i < CB_LISTNUM(keys); i++){
    kbuf = CB_LISTVAL2(keys, i, &ksiz);
    if(!vlput(db->fwmdb, kbuf, ksiz, "", 0, VL_DKEEP) && dpecode != DP_EKEEP){
      err = TRUE;
      break;
    }
    if(i % ESTCCCBFREQ == 0) est_db_inform(db, "flushing fwm keys");
  }
  cblistclose(keys);
  /* once the cache is empty, replace it with a fresh map sized from the last load */
  if(cbmaprnum(db->idxcc) < 1){
    cbmapclose(db->idxcc);
    db->idxcc = cbmapopenex(rnum > ESTIDXCCBNUM ? rnum * 1.5 : ESTIDXCCBNUM);
  }
  if(max < 0 && cbmaprnum(db->outcc) > 0){
    /* `outcc' holds two kinds of keys: a tab followed by a decimal document ID, and plain
       words whose index records may refer to those documents */
    ids = cbmapopen();
    keys = cblistopen();
    cbmapiterinit(db->outcc);
    while((kbuf = cbmapiternext(db->outcc, &ksiz)) != NULL){
      if(*kbuf == '\t'){
        id = atoi(kbuf + 1);
        cbmapput(ids, (char *)&id, sizeof(int), "", 0, FALSE);
      } else {
        cblistpush(keys, kbuf, ksiz);
      }
    }
    cblistsort(keys);
    for(i = 0; i < CB_LISTNUM(keys); i++){
      if(i % (ESTIDXDBLRM * 4) == 0) est_idx_set_current(db->idxdb);
      kbuf = CB_LISTVAL2(keys, i, &ksiz);
      if((tbuf = est_idx_get(db->idxdb, kbuf, ksiz, &tsiz)) != NULL){
        /* rebuild the record of the word without the entries of removed documents */
        nval = cbdatumopen("", 0);
        rp = tbuf;
        while(rp < tbuf + tsiz){
          pv = rp;
          /* skip the document ID and the score byte (5 bytes, as written by
             est_db_put_doc when sizeof(int) is 4) */
          rp += 5;
          /* skip the two-byte junction entries up to the terminating zero byte */
          while(*rp != 0x0){
            rp += 2;
          }
          rp++;
          if(!cbmapget(ids, pv, sizeof(int), NULL)) cbdatumcat(nval, pv, rp - pv);
        }
        if(!est_idx_out(db->idxdb, kbuf, ksiz) && dpecode != DP_ENOITEM) err = TRUE;
        if(CB_DATUMSIZE(nval) > 0){
          if(!est_idx_add(db->idxdb, kbuf, ksiz, CB_DATUMPTR(nval), CB_DATUMSIZE(nval)))
            err = TRUE;
        } else {
          /* no document uses the word any more, so drop it from forward matching too */
          if(!vlout(db->fwmdb, kbuf, ksiz) && dpecode != DP_ENOITEM) err = TRUE;
        }
        cbdatumclose(nval);
        free(tbuf);
      }
      cbmapout(db->outcc, kbuf, ksiz);
      if(i % ESTCCCBFREQ == 0) est_db_inform(db, "cleaning dispensable keys");
    }
    /* NOTE(review): this value of `rnum' is not read again in this function */
    rnum = cbmaprnum(ids);
    cblistclose(keys);
    cbmapclose(ids);
    cbmapclose(db->outcc);
    db->outcc = cbmapopenex(ESTOUTCCBNUM);
  }
  /* the cache of keyword sizes is always discarded and reset to its default capacity */
  cbmapclose(db->keycc);
  db->keycc = cbmapopenex(ESTKEYCCMNUM + 1);
  db->kcmnum = ESTKEYCCMNUM;
  if(err){
    db->ecode = ESTEDB;
    db->fatal = TRUE;
    return FALSE;
  }
  return TRUE;
}
985
986
987 /* Synchronize updating contents of a database. */
988 int est_db_sync(ESTDB *db){
989 int err;
990 assert(db);
991 if(!dpwritable(db->metadb)){
992 db->ecode = ESTEACCES;
993 return FALSE;
994 }
995 err = FALSE;
996 if(!est_db_flush(db, -1) || !est_db_write_meta(db)) err = TRUE;
997 est_db_inform(db, "synchronizing the database for meta information");
998 if(!dpsync(db->metadb)) err = TRUE;
999 est_db_inform(db, "synchronizing the inverted index");
1000 if(!est_idx_sync(db->idxdb)) err = TRUE;
1001 est_db_inform(db, "synchronizing the database for forward matching");
1002 if(!vlsync(db->fwmdb)) err = TRUE;
1003 est_db_inform(db, "synchronizing the database for attrutes");
1004 if(!crsync(db->attrdb)) err = TRUE;
1005 est_db_inform(db, "synchronizing the database for texts");
1006 if(!crsync(db->textdb)) err = TRUE;
1007 est_db_inform(db, "synchronizing the database for document list");
1008 if(!vlsync(db->listdb)) err = TRUE;
1009 if(err){
1010 db->ecode = ESTEDB;
1011 db->fatal = TRUE;
1012 }
1013 return err ? FALSE : TRUE;
1014 }
1015
1016
/* Optimize a database.
   `db' specifies a database object.  If its meta database is not writable, `ecode' is set to
   ESTEACCES and FALSE is returned.
   `options' is a bit set.  Unless ESTOPTNOPURGE is given, index entries of documents that are
   no longer in the document list are purged.  Unless ESTOPTNODBOPT is given, each underlying
   database is optimized.
   The return value is TRUE on success.  If the initial flush fails, FALSE is returned as
   est_db_flush left the error state.  For any later failure `ecode' is set to ESTEDB, the
   `fatal' flag is raised and FALSE is returned. */
int est_db_optimize(ESTDB *db, int options){
  CBMAP *dmap;
  CBLIST *words;
  CBDATUM *nval;
  const char *word, *rp, *pv;
  char *kbuf, *vbuf;
  int i, err, id, ksiz, vsiz, wsiz;
  assert(db);
  if(!dpwritable(db->metadb)){
    db->ecode = ESTEACCES;
    return FALSE;
  }
  if(!est_db_flush(db, -1)) return FALSE;
  err = FALSE;
  if(!(options & ESTOPTNOPURGE)){
    /* build the set of IDs of the documents that are still registered */
    dmap = cbmapopenex(vlrnum(db->listdb) + 1);
    vlcurfirst(db->listdb);
    while((vbuf = vlcurval(db->listdb, NULL)) != NULL){
      id = atoi(vbuf);
      cbmapput(dmap, (char *)&id, sizeof(int), "", 0, FALSE);
      free(vbuf);
      vlcurnext(db->listdb);
    }
    /* collect every word known to the database for forward matching; each key buffer is
       handed over to the list */
    words = cblistopen();
    vlcurfirst(db->fwmdb);
    while((kbuf = vlcurkey(db->fwmdb, &ksiz)) != NULL){
      cblistpushbuf(words, kbuf, ksiz);
      vlcurnext(db->fwmdb);
    }
    for(i = 0; i < CB_LISTNUM(words); i++){
      if(i % (ESTIDXDBLRM * 4) == 0) est_idx_set_current(db->idxdb);
      word = CB_LISTVAL2(words, i, &wsiz);
      if((vbuf = est_idx_get(db->idxdb, word, wsiz, &vsiz)) != NULL){
        /* rebuild the record of the word, keeping only entries of live documents */
        nval = cbdatumopen("", 0);
        rp = vbuf;
        while(rp < vbuf + vsiz){
          pv = rp;
          /* skip the document ID and the score byte (5 bytes, as written by
             est_db_put_doc when sizeof(int) is 4) */
          rp += 5;
          /* skip the two-byte junction entries up to the terminating zero byte */
          while(*rp != 0x0){
            rp += 2;
          }
          rp++;
          if(cbmapget(dmap, pv, sizeof(int), NULL)) cbdatumcat(nval, pv, rp - pv);
        }
        if(!est_idx_out(db->idxdb, word, wsiz)) err = TRUE;
        if(CB_DATUMSIZE(nval) > 0){
          if(!est_idx_add(db->idxdb, word, wsiz, CB_DATUMPTR(nval), CB_DATUMSIZE(nval)))
            err = TRUE;
        } else {
          /* no live document uses the word, so drop it from forward matching too */
          if(!vlout(db->fwmdb, word, wsiz)) err = TRUE;
        }
        cbdatumclose(nval);
        free(vbuf);
      } else {
        /* a word without an index record counts as an error */
        err = TRUE;
      }
      /* `kbuf' is NULL here, because the collecting loop above ends only when vlcurkey
         returns NULL; this call therefore releases nothing */
      free(kbuf);
      if(i % ESTCCCBFREQ == 0) est_db_inform(db, "cleaning dispensable keys");
    }
    cblistclose(words);
    cbmapclose(dmap);
  }
  if(!(options & ESTOPTNODBOPT)){
    est_db_inform(db, "optimizing the inverted index");
    if(!est_idx_optimize(db->idxdb)) err = TRUE;
    est_db_inform(db, "optimizing the database for forward matching");
    if(!vloptimize(db->fwmdb)) err = TRUE;
    est_db_inform(db, "optimizing the database for attrutes");
    if(!croptimize(db->attrdb, -1)) err = TRUE;
    est_db_inform(db, "optimizing the database for texts");
    if(!croptimize(db->textdb, -1)) err = TRUE;
    est_db_inform(db, "optimizing the database for document list");
    if(!vloptimize(db->listdb)) err = TRUE;
  }
  if(err){
    db->ecode = ESTEDB;
    db->fatal = TRUE;
  }
  return err ? FALSE : TRUE;
}
1098
1099
/* Add a document to a database.
   `db' specifies a database object.  If its meta database is not writable, `ecode' is set to
   ESTEACCES and FALSE is returned.
   `doc' specifies a document object.  It must have the URI attribute, otherwise `ecode' is
   set to ESTEINVAL and FALSE is returned.  On the way the object is modified: it receives a
   new ID, the ID attribute, and an empty text list if it had none.
   `options' is a bit set.  If a document with the same URI exists it is removed first; with
   ESTPDCLEAN that removal is done with ESTODCLEAN.
   The return value is TRUE on success, else FALSE.  Storage failures set `ecode' and raise
   the `fatal' flag. */
int est_db_put_doc(ESTDB *db, ESTDOC *doc, int options){
  CBMAP *ocmap, *fmap, *qmap;
  CBLIST *words;
  CBDATUM *ocbuf;
  const char *uri, *text, *word, *fnext, *snext, *kbuf, *vbuf;
  unsigned char junc[2], c;
  char wbuf[ESTWORDMAXLEN+3], *sbuf, *zbuf, nbuf[ESTNUMBUFSIZ];
  int i, j, id, err, wnum, wsiz, fnsiz, snsiz, *np, num, ksiz, vsiz, ssiz, zsiz;
  double tune;
  assert(db && doc);
  if(!dpwritable(db->metadb)){
    db->ecode = ESTEACCES;
    return FALSE;
  }
  if(!doc->attrs || !(uri = cbmapget(doc->attrs, ESTDATTRURI, -1, NULL))){
    db->ecode = ESTEINVAL;
    return FALSE;
  }
  /* replace an existing document of the same URI */
  if((id = est_db_uri_to_id(db, uri)) > 0 &&
     !est_db_out_doc(db, id, (options & ESTPDCLEAN) ? ESTODCLEAN : 0)) return FALSE;
  if(!doc->dtexts) doc->dtexts = cblistopen();
  /* assign the next sequence number as the ID and expose it as an attribute */
  doc->id = ++(db->dseq);
  sprintf(nbuf, "%d", doc->id);
  cbmapput(doc->attrs, ESTDATTRID, -1, nbuf, -1, TRUE);
  /* ocmap: word -> concatenated junction pairs; fmap: word -> accumulated points;
     qmap: set of word+junction combinations already seen */
  ocmap = cbmapopen();
  fmap = cbmapopen();
  qmap = cbmapopen();
  wnum = 0;
  /* index -1 stands for the text stored in the attribute whose name is the empty string;
     the following indexes walk the text list */
  for(i = -1; i < CB_LISTNUM(doc->dtexts); i++){
    if(i < 0){
      if(!(text = cbmapget(doc->attrs, "", 0, NULL))) continue;
    } else {
      text = CB_LISTVAL(doc->dtexts, i, NULL);
    }
    words = cblistopen();
    switch(db->amode){
    case ESTAMPERFNG:
      est_break_text_perfng(text, words, FALSE, TRUE);
      break;
    default:
      est_break_text(text, words, FALSE, TRUE);
      break;
    }
    wnum += CB_LISTNUM(words);
    for(j = 0; j < CB_LISTNUM(words); j++){
      word = CB_LISTVAL2(words, j, &wsiz);
      /* over-long words are not indexed; this also keeps the copy into `wbuf' in bounds */
      if(wsiz > ESTWORDMAXLEN) continue;
      /* the junction is a pair of one-byte hashes of the next two words; each is in the
         range 1..ESTJHASHNUM, or 0xff when there is no such word, so it is never zero */
      fnext = cblistval(words, j + 1, &fnsiz);
      snext = cblistval(words, j + 2, &snsiz);
      junc[0] = fnext ? dpinnerhash(fnext, fnsiz) % ESTJHASHNUM + 1: 0xff;
      junc[1] = snext ? dpouterhash(snext, snsiz) % ESTJHASHNUM + 1: 0xff;
      memcpy(wbuf, word, wsiz);
      memcpy(wbuf + wsiz, "\t", 1);
      memcpy(wbuf + wsiz + 1, junc, 2);
      /* every occurrence adds ESTOCPOINT to the points of the word */
      np = (int *)cbmapget(fmap, word, wsiz, NULL);
      num = np ? *(int *)np : 0;
      num += ESTOCPOINT;
      cbmapput(fmap, word, wsiz, (char *)&num, sizeof(int), TRUE);
      /* record each distinct word+junction combination only once */
      if(cbmapput(qmap, wbuf, wsiz + 3, "", 0, FALSE))
        cbmapputcat(ocmap, word, wsiz, (char *)junc, 2);
    }
    cblistclose(words);
  }
  /* build one index entry per word: ID, score byte, junction pairs, zero terminator */
  cbmapiterinit(ocmap);
  while((kbuf = cbmapiternext(ocmap, &ksiz)) != NULL){
    vbuf = cbmapget(ocmap, kbuf, ksiz, &vsiz);
    ocbuf = cbdatumopen("", 0);
    cbdatumcat(ocbuf, (char *)&(doc->id), sizeof(int));
    num = *(int *)cbmapget(fmap, kbuf, ksiz, NULL);
    /* scale the points down by a factor that grows with the number of words of the
       document, but never by less than 4 */
    tune = log(wnum + 3);
    tune = (tune * tune) / 10.0;
    num /= tune > 4.0 ? tune : 4.0;
    /* compress the part above 0x80, then the part above 0xc0, to a quarter each, and
       clamp the result so that it fits into one byte */
    if(num >= 0x80) num += (0x80 - num) * 0.75;
    if(num >= 0xc0) num += (0xc0 - num) * 0.75;
    c = num < 0xff ? num : 0xff;
    cbdatumcat(ocbuf, (char *)&c, 1);
    cbdatumcat(ocbuf, vbuf, vsiz);
    c = 0x00;
    cbdatumcat(ocbuf, (char *)&c, 1);
    /* the entry goes into the index cache; est_db_flush writes it to the index later */
    cbmapputcat(db->idxcc, kbuf, ksiz, CB_DATUMPTR(ocbuf), CB_DATUMSIZE(ocbuf));
    db->icsiz += CB_DATUMSIZE(ocbuf);
    cbdatumclose(ocbuf);
  }
  cbmapclose(qmap);
  cbmapclose(fmap);
  cbmapclose(ocmap);
  err = FALSE;
  /* store the serialized attributes */
  sbuf = cbmapdump(doc->attrs, &ssiz);
  if(!crput(db->attrdb, (char *)&(doc->id), sizeof(int), sbuf, ssiz, CR_DKEEP)){
    db->ecode = ESTEDB;
    db->fatal = TRUE;
    err = TRUE;
  }
  free(sbuf);
  /* store the serialized texts in deflated form; if deflation fails an empty record is
     stored instead and the failure is reported */
  sbuf = cblistdump(doc->dtexts, &ssiz);
  if(!(zbuf = est_deflate(sbuf, ssiz, &zsiz))){
    CB_MALLOC(zbuf, 1);
    zsiz = 0;
    db->ecode = ESTEMISC;
    db->fatal = TRUE;
    err = TRUE;
  }
  if(!crput(db->textdb, (char *)&(doc->id), sizeof(int), zbuf, zsiz, CR_DKEEP)){
    db->ecode = ESTEDB;
    db->fatal = TRUE;
    err = TRUE;
  }
  free(sbuf);
  free(zbuf);
  /* register the URI with the ID in the document list */
  sprintf(nbuf, "%d", doc->id);
  if(!vlput(db->listdb, uri, -1, nbuf, -1, VL_DKEEP)){
    db->ecode = ESTEDB;
    db->fatal = TRUE;
    err = TRUE;
  }
  /* the document counter is incremented whether or not an error has been recorded */
  db->dnum++;
  /* flush the caches when they have outgrown the configured maximum */
  if(est_db_used_cache_size(db) > db->icmax){
    if(!est_db_flush(db, -1)) err = TRUE;
    est_idx_increment(db->idxdb);
  }
  return err ? FALSE : TRUE;
}
1223
1224
1225 /* Remove a document from a database. */
1226 int est_db_out_doc(ESTDB *db, int id, int options){
1227 ESTDOC *doc;
1228 CBLIST *words;
1229 const char *uri, *text, *word;
1230 char numbuf[ESTNUMBUFSIZ];
1231 int i, j, len, wsiz;
1232 assert(db && id > 0);
1233 if(!dpwritable(db->metadb)){
1234 db->ecode = ESTEACCES;
1235 return FALSE;
1236 }
1237 if(!(doc = est_db_get_doc(db, id, 0))) return FALSE;
1238 if(!doc->attrs || !(uri = cbmapget(doc->attrs, ESTDATTRURI, -1, NULL))){
1239 est_doc_delete(doc);
1240 db->ecode = ESTEDB;
1241 db->fatal = TRUE;
1242 return FALSE;
1243 }
1244 if(!crout(db->attrdb, (char *)&id, sizeof(int)) ||
1245 !crout(db->textdb, (char *)&id, sizeof(int)) || !vlout(db->listdb, uri, -1)){
1246 est_doc_delete(doc);
1247 db->ecode = ESTEDB;
1248 db->fatal = TRUE;
1249 return FALSE;
1250 }
1251 cbmapout(db->attrcc, (char *)&id, sizeof(int));
1252 cbmapout(db->textcc, (char *)&id, sizeof(int));
1253 if(db->spacc) cbmapout(db->spacc, (char *)&id, sizeof(int));
1254 if((options & ESTODCLEAN) && doc->dtexts){
1255 len = sprintf(numbuf, "\t%d", doc->id);
1256 cbmapput(db->outcc, numbuf, len, "", 0, FALSE);
1257 for(i = -1; i < CB_LISTNUM(doc->dtexts); i++){
1258 if(i < 0){
1259 if(!(text = cbmapget(doc->attrs, "", 0, NULL))) continue;
1260 } else {
1261 text = CB_LISTVAL(doc->dtexts, i, NULL);
1262 }
1263 words = cblistopen();
1264 switch(db->amode){
1265 case ESTAMPERFNG:
1266 est_break_text_perfng(text, words, FALSE, TRUE);
1267 break;
1268 default:
1269 est_break_text(text, words, FALSE, TRUE);
1270 break;
1271 }
1272 for(j = 0; j < CB_LISTNUM(words); j++){
1273 word = CB_LISTVAL2(words, j, &wsiz);
1274 cbmapput(db->outcc, word, wsiz, "", 0, FALSE);
1275 }
1276 cblistclose(words);
1277 }
1278 }
1279 est_doc_delete(doc);
1280 db->dnum--;
1281 return TRUE;
1282 }
1283
1284
1285 /* Retrieve a document in a database. */
1286 ESTDOC *est_db_get_doc(ESTDB *db, int id, int options){
1287 ESTDOC *doc;
1288 const char *cbuf;
1289 char *vbuf, *zbuf;
1290 int i, csiz, vsiz, zsiz, num;
1291 assert(db && id > 0);
1292 cbuf = NULL;
1293 if(options & ESTGDNOATTR){
1294 if(!crvsiz(db->attrdb, (char *)&id, sizeof(int))){
1295 if(dpecode == DP_ENOITEM){
1296 db->ecode = ESTENOITEM;
1297 return NULL;
1298 } else {
1299 db->ecode = ESTEDB;
1300 db->fatal = TRUE;
1301 return NULL;
1302 }
1303 }
1304 vbuf = NULL;
1305 } else if((cbuf = cbmapget(db->attrcc, (char *)&id, sizeof(int), &csiz)) != NULL){
1306 cbmapmove(db->attrcc, (char *)&id, sizeof(int), FALSE);
1307 vbuf = NULL;
1308 } else if(!(vbuf = crget(db->attrdb, (char *)&id, sizeof(int), 0, -1, &vsiz))){
1309 if(dpecode == DP_ENOITEM){
1310 db->ecode = ESTENOITEM;
1311 return NULL;
1312 } else {
1313 db->ecode = ESTEDB;
1314 db->fatal = TRUE;
1315 return NULL;
1316 }
1317 }
1318 doc = est_doc_new();
1319 doc->id = id;
1320 if(cbuf){
1321 doc->attrs = cbmapload(cbuf, csiz);
1322 } else if(vbuf){
1323 doc->attrs = cbmapload(vbuf, vsiz);
1324 cbmapputvbuf(db->attrcc, (char *)&id, sizeof(int), vbuf, vsiz);
1325 if(cbmaprnum(db->attrcc) > db->acmnum){
1326 num = cbmaprnum(db->attrcc) * 0.1 + 1;
1327 cbmapiterinit(db->attrcc);
1328 for(i = 0; i < num && (cbuf = cbmapiternext(db->attrcc, NULL)) != NULL; i++){
1329 cbmapout(db->attrcc, cbuf, sizeof(int));
1330 }
1331 }
1332 } else {
1333 doc->attrs = NULL;
1334 }
1335 if(!(options & ESTGDNOTEXT)){
1336 if((cbuf = cbmapget(db->textcc, (char *)&id, sizeof(int), &csiz)) != NULL){
1337 cbmapmove(db->textcc, (char *)&id, sizeof(int), FALSE);
1338 doc->dtexts = cblistload(cbuf, csiz);
1339 } else {
1340 if(!(zbuf = crget(db->textdb, (char *)&id, sizeof(int), 0, -1, &zsiz))){
1341 db->ecode = ESTEDB;
1342 db->fatal = TRUE;
1343 est_doc_delete(doc);
1344 return NULL;
1345 }
1346 if(!(vbuf = est_inflate(zbuf, zsiz, &vsiz))){
1347 db->ecode = ESTEDB;
1348 db->fatal = TRUE;
1349 free(zbuf);
1350 est_doc_delete(doc);
1351 return NULL;
1352 }
1353 doc->dtexts = cblistload(vbuf, vsiz);
1354 cbmapputvbuf(db->textcc, (char *)&id, sizeof(int), vbuf, vsiz);
1355 if(cbmaprnum(db->textcc) > db->tcmnum){
1356 num = cbmaprnum(db->textcc) * 0.1 + 1;
1357 cbmapiterinit(db->textcc);
1358 for(i = 0; i < num &&(cbuf = cbmapiternext(db->textcc, NULL)) != NULL; i++){
1359 cbmapout(db->textcc, cbuf, sizeof(int));
1360 }
1361 }
1362 free(zbuf);
1363 }
1364 }
1365 return doc;
1366 }
1367
1368
1369 /* Retrieve the value of an attribute of a document in a database. */
1370 char *est_db_get_doc_attr(ESTDB *db, int id, const char *name){
1371 const char *cbuf;
1372 char *mbuf, *vbuf;
1373 int cb, csiz, msiz, vsiz;
1374 assert(db && id > 0 && name);
1375 cb = db->spacc && !strcmp(name, db->scname);
1376 if(cb && (cbuf = cbmapget(db->spacc, (char *)&id, sizeof(int), &csiz)) != NULL){
1377 cbmapmove(db->spacc, (char *)&id, sizeof(int), FALSE);
1378 return cbmemdup(cbuf, csiz);
1379 }
1380 if(!(mbuf = crget(db->attrdb, (char *)&id, sizeof(int), 0, -1, &msiz))){
1381 db->ecode = dpecode == DP_ENOITEM ? ESTENOITEM : ESTEDB;
1382 return NULL;
1383 }
1384 if(!(vbuf = cbmaploadone(mbuf, msiz, name, -1, &vsiz))){
1385 db->ecode = ESTENOITEM;
1386 free(mbuf);
1387 return NULL;
1388 }
1389 if(cb) cbmapput(db->spacc, (char *)&id, sizeof(int), vbuf, vsiz, FALSE);
1390 free(mbuf);
1391 return vbuf;
1392 }
1393
1394
1395 /* Get the ID of a document spacified by URI. */
1396 int est_db_uri_to_id(ESTDB *db, const char *uri){
1397 char *vbuf;
1398 int id;
1399 assert(db && uri);
1400 if(!(vbuf = vlget(db->listdb, uri, -1, NULL))){
1401 db->ecode = ESTENOITEM;
1402 return -1;
1403 }
1404 id = atoi(vbuf);
1405 free(vbuf);
1406 return id;
1407 }
1408
1409
1410 /* Extract keywords of a document object. */
1411 CBMAP *est_db_etch_doc(ESTDB *db, ESTDOC *doc, int max){
1412 ESTKEYSC *scores;
1413 CBMAP *keys, *umap;
1414 CBLIST *words;
1415 const char *text, *word, *vbuf;
1416 char numbuf[ESTNUMBUFSIZ];
1417 int i, wsiz, num, smax, snum, vsiz;
1418 assert(doc && max >= 0);
1419 if(!doc->dtexts) return cbmapopenex(1);
1420 keys = cbmapopenex(max * 1.5);
1421 words = cblistopen();
1422 for(i = -1; i < CB_LISTNUM(doc->dtexts); i++){
1423 if(i < 0){
1424 if(!doc->attrs || !(text = cbmapget(doc->attrs, "", 0, NULL))) continue;
1425 } else {
1426 text = CB_LISTVAL(doc->dtexts, i, NULL);
1427 }
1428 if(db){
1429 switch(db->amode){
1430 case ESTAMPERFNG:
1431 est_break_text_perfng(text, words, FALSE, TRUE);
1432 break;
1433 default:
1434 est_break_text(text, words, FALSE, TRUE);
1435 break;
1436 }
1437 } else {
1438 est_break_text(text, words, FALSE, TRUE);
1439 }
1440 }
1441 umap = cbmapopenex(CB_LISTNUM(words) + 1);
1442 for(i = 0; i < CB_LISTNUM(words); i++){
1443 word = CB_LISTVAL2(words, i, &wsiz);
1444 if(wsiz > ESTWORDMAXLEN) continue;
1445 num = (vbuf = cbmapget(umap, word, wsiz, NULL)) ? *(int *)vbuf + 1 : 1;
1446 cbmapput(umap, word, wsiz, (char *)&num, sizeof(int), TRUE);
1447 }
1448 smax = max * (db ? ESTKEYSCALW : 1) + 1;
1449 CB_MALLOC(scores, cbmaprnum(umap) * sizeof(ESTKEYSC) + 1);
1450 snum = 0;
1451 cbmapiterinit(umap);
1452 for(i = 0; i < smax && (word = cbmapiternext(umap, &wsiz)) != NULL; i++){
1453 scores[snum].word = word;
1454 scores[snum].wsiz = wsiz;
1455 scores[snum].pt = (vbuf = cbmapget(umap, word, wsiz, NULL)) ? *(int *)vbuf : 0;
1456 snum++;
1457 }
1458 qsort(scores, snum, sizeof(ESTKEYSC), est_keysc_compare);
1459 if(db){
1460 for(i = 0; i < snum; i++){
1461 if((vbuf = cbmapget(db->keycc, scores[i].word, scores[i].wsiz, NULL)) != NULL){
1462 cbmapmove(db->keycc, scores[i].word, scores[i].wsiz, FALSE);
1463 vsiz = *(int*)vbuf;
1464 } else {
1465 vsiz = est_idx_vsiz(db->idxdb, scores[i].word, scores[i].wsiz);
1466 cbmapput(db->keycc, scores[i].word, scores[i].wsiz, (char *)&vsiz, sizeof(int), FALSE);
1467 }
1468 scores[i].pt *= 400000.0 / (vsiz + 64);
1469 }
1470 if(db->kcmnum >= 0 && cbmaprnum(db->keycc) > db->kcmnum){
1471 num = db->kcmnum * 0.1 + 1;
1472 cbmapiterinit(db->keycc);
1473 for(i = 0; i < num && (word = cbmapiternext(db->keycc, &wsiz)) != NULL; i++){
1474 cbmapout(db->keycc, word, wsiz);
1475 }
1476 }
1477 qsort(scores, snum, sizeof(ESTKEYSC), est_keysc_compare);
1478 }
1479 for(i = 0; i < snum && i < max; i++){
1480 vsiz = sprintf(numbuf, "%d", scores[i].pt);
1481 cbmapput(keys, scores[i].word, scores[i].wsiz, numbuf, vsiz, FALSE);
1482 }
1483 free(scores);
1484 cbmapclose(umap);
1485 cblistclose(words);
1486 return keys;
1487 }
1488
1489
1490 /* Initialize the iterator of a database. */
1491 int est_db_iter_init(ESTDB *db){
1492 assert(db);
1493 return vlcurfirst(db->listdb);
1494 }
1495
1496
1497 /* Get the next ID of the iterator of a database. */
1498 int est_db_iter_next(ESTDB *db){
1499 char *vbuf;
1500 int id;
1501 assert(db);
1502 if(!(vbuf = vlcurval(db->listdb, NULL))){
1503 if(dpecode == DP_ENOITEM){
1504 db->ecode = ESTENOITEM;
1505 return 0;
1506 } else {
1507 db->ecode = ESTEDB;
1508 db->fatal = TRUE;
1509 return -1;
1510 }
1511 }
1512 id = atoi(vbuf);
1513 free(vbuf);
1514 vlcurnext(db->listdb);
1515 return id;
1516 }
1517
1518
1519 /* Get the name of a database. */
1520 const char *est_db_name(ESTDB *db){
1521 assert(db);
1522 return db->name;
1523 }
1524
1525
1526 /* Get the number of documents in a database. */
1527 int est_db_doc_num(ESTDB *db){
1528 assert(db);
1529 return db->dnum;
1530 }
1531
1532
1533 /* Get the number of words in a database. */
1534 int est_db_word_num(ESTDB *db){
1535 assert(db);
1536 return vlrnum(db->fwmdb);
1537 }
1538
1539
1540 /* Get the size of a database. */
1541 double est_db_size(ESTDB *db){
1542 assert(db);
1543 return dpfsiz(db->metadb) + est_idx_size(db->idxdb) + vlfsiz(db->fwmdb) +
1544 crfsizd(db->attrdb) + crfsizd(db->textdb) + vlfsiz(db->listdb);
1545 }
1546
1547
/* Search documents corresponding a condition for a database.
   `db' specifies a database object.
   `cond' specifies a condition object.  If its `scfb' member is true, its `scores' array is
   reallocated and filled with the scores of the result, and `snum' with their number.
   `nump' specifies the pointer to a variable to which the number of elements of the return
   value is assigned.
   `hints' specifies a map object into which hints are stored.  The number of matching
   documents, before the limit of `cond->max' is applied, is stored under the empty key.  It
   may be NULL.
   The return value is a newly allocated array of document IDs, which the caller releases
   with free.  It is never NULL; if there is no result, `ecode' is set to ESTENOITEM. */
int *est_db_search(ESTDB *db, ESTCOND *cond, int *nump, CBMAP *hints){
  ESTSCORE *scores, *tscores;
  CBMAP *svmap;
  CBLIST *terms;
  const char *term, *rp;
  char *tmp, numbuf[ESTNUMBUFSIZ];
  int i, j, snum, pcnum, ncnum, tsnum, add, nnum, id, score, hnum, len, *rval;
  double tune;
  assert(db && cond && nump);
  scores = NULL;
  snum = 0;
  if(cond->phrase && cond->phrase[0] == ESTOPSIMILAR[0] &&
     cbstrfwmatch(cond->phrase, ESTOPSIMILAR)){
    /* similarity search: the rest of the phrase, after the operator and any following
       control or space characters, is parsed as a vector */
    rp = cond->phrase;
    rp += strlen(ESTOPSIMILAR);
    while(*rp > '\0' && *rp <= ' '){
      rp++;
    }
    svmap = est_phrase_vector(rp);
    scores = est_search_similar(db, svmap, &snum, ESTSMLRKNUM, ESTSMLRUNUM, cond->tfidf,
                                cond->order ? ESTSMLRNMIN : 0.0);
    cbmapclose(svmap);
  } else if(cond->phrase){
    /* phrase search: split the phrase into terms and operators */
    if(cond->simple){
      tmp = est_phrase_from_thumb(cond->phrase);
      terms = est_phrase_terms(tmp);
      free(tmp);
    } else {
      terms = est_phrase_terms(cond->phrase);
    }
    /* pcnum and ncnum count the terms to be intersected and to be excluded; `add' tells
       which kind the next term is */
    pcnum = 0;
    ncnum = 0;
    add = TRUE;
    for(i = 0; i < CB_LISTNUM(terms); i++){
      term = CB_LISTVAL(terms, i, NULL);
      if(!strcmp(term, ESTOPISECT)){
        add = TRUE;
      } else if(!strcmp(term, ESTOPDIFF)){
        add = FALSE;
      } else {
        if(!strcmp(term, ESTOPUVSET)){
          tscores = est_search_uvset(db, &tsnum, hints, add);
        } else {
          tscores = est_search_union(db, term, cond->gstep, &tsnum, hints, add);
        }
        if(add){
          if(cond->tfidf){
            /* scale the scores down as the number of hits of the term grows */
            tune = log(tsnum + 3);
            tune = tune * tune * tune;
            if(tune < 8.0) tune = 8.0;
            for(j = 0; j < tsnum; j++){
              tscores[j].score *= 10000 / tune;
            }
          }
          pcnum++;
        } else {
          ncnum++;
        }
        if(scores){
          /* append the result of the term; hits of an excluded term get the score -1 */
          CB_REALLOC(scores, (snum + tsnum) * sizeof(ESTSCORE) + 1);
          for(j = 0; j < tsnum; j++){
            scores[snum+j].id = tscores[j].id;
            scores[snum+j].score = add ? tscores[j].score : -1;
          }
          snum += tsnum;
          free(tscores);
        } else {
          /* the result of the first term is taken over as it is */
          scores = tscores;
          snum = tsnum;
        }
      }
    }
    if(scores){
      if(pcnum > 1 || ncnum > 0){
        /* merge the entries of the same document: the scores are summed, and the document
           survives only if no entry is negative and it was hit at least `pcnum' times */
        qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_id);
        nnum = 0;
        for(i = 0; i < snum; i++){
          id = scores[i].id;
          score = scores[i].score;
          hnum = score >= 0 ? 1 : 0;
          for(j = i + 1; j < snum && scores[j].id == id; j++){
            if(score >= 0 && scores[j].score >= 0){
              score += scores[j].score;
              hnum++;
            } else {
              score = -1;
            }
          }
          if(score >= 0 && hnum >= pcnum){
            scores[nnum].id = id;
            scores[nnum].score = score;
            nnum++;
          }
          /* continue after the run of entries of this document */
          i = j - 1;
        }
        snum = nnum;
      }
    } else {
      CB_MALLOC(scores, 1);
      snum = 0;
    }
    cblistclose(terms);
  } else if(cond->attrs){
    /* no phrase but attribute conditions: start from the universal set */
    scores = est_search_uvset(db, &snum, hints, TRUE);
  } else {
    CB_MALLOC(scores, 1);
    snum = 0;
  }
  /* hide documents whose IDs are in the cache of removed documents, where they are keyed
     by a tab followed by the decimal ID */
  if(cbmaprnum(db->outcc) > 0){
    tsnum = 0;
    for(i = 0; i < snum; i++){
      len = sprintf(numbuf, "\t%d", scores[i].id);
      if(cbmapget(db->outcc, numbuf, len, NULL)) continue;
      scores[tsnum++] = scores[i];
    }
    snum = tsnum;
  }
  if(cond->attrs || cond->order)
    snum = est_narrow_scores(db, cond->attrs, cond->order, scores, snum);
  /* without an explicit order the result is sorted by score */
  if(!cond->order) qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_score);
  if(hints){
    sprintf(numbuf, "%d", snum);
    cbmapput(hints, "", 0, numbuf, -1, FALSE);
  }
  if(cond->max >= 0 && cond->max < snum) snum = cond->max;
  CB_MALLOC(rval, snum * sizeof(int) + 1);
  for(i = 0; i < snum; i++){
    rval[i] = scores[i].id;
  }
  if(cond->scfb){
    CB_REALLOC(cond->scores, snum * sizeof(int) + 1);
    for(i = 0; i < snum; i++){
      cond->scores[i] = scores[i].score;
    }
    cond->snum = snum;
  }
  *nump = snum;
  if(*nump < 1) db->ecode = ESTENOITEM;
  free(scores);
  return rval;
}
1690
1691
1692 /* Set the maximum size of the cache memory of a database. */
1693 void est_db_set_cache_size(ESTDB *db, size_t size, int anum, int tnum){
1694 assert(db);
1695 if(dpwritable(db->metadb) && size > 0) db->icmax = size;
1696 if(anum > 0) db->acmnum = anum;
1697 if(tnum > 0) db->tcmnum = tnum;
1698 }
1699
1700
1701 /* Set the special cache for narrowing and sorting with document attributes. */
1702 void est_db_set_special_cache(ESTDB *db, const char *name, int num){
1703 assert(db && name && num >= 0);
1704 if(db->spacc){
1705 free(db->scname);
1706 cbmapclose(db->spacc);
1707 }
1708 db->spacc = cbmapopenex(num + 1);
1709 db->scmnum = num;
1710 db->scname = cbmemdup(name, -1);
1711 }
1712
1713
1714
1715 /*************************************************************************************************
1716 * features for experts
1717 *************************************************************************************************/
1718
1719
/* Handle to the file of random number generator.
   It has external linkage and starts out as NULL.
   NOTE(review): nothing in this part of the file shows where the handle is opened or
   closed; confirm against the code that uses it. */
FILE *est_random_ifp = NULL;
1722
1723
/* Break a sentence of text and extract words.
   `text' specifies the string of a sentence of text.
   `list' specifies a list object to which the extracted words are appended.  Each word is
   the result of est_uconv_out (presumably the reverse of est_uconv_in; confirm against its
   definition) and is handed over to the list.
   `norm' specifies whether to normalize the text with est_normalize_text first.
   `tail' specifies whether to keep a single east-alphabet character at the very end of the
   word list.  If it is false, such a trailing word is dropped. */
void est_break_text(const char *text, CBLIST *list, int norm, int tail){
  CBLIST *words;
  const unsigned char *word, *next;
  unsigned char *utext;
  char *tmp;
  int i, j, k, size, cc, wsiz, nsiz, tsiz;
  assert(text);
  /* the text is handled as a sequence of two-byte units, high byte first */
  utext = (unsigned char *)est_uconv_in(text, strlen(text), &size);
  if(norm) est_normalize_text(utext, size, &size);
  est_canonicalize_text(utext, size, FALSE);
  words = cblistopen();
  /* first pass: cut the text into runs of characters of the same category */
  for(i = 0; i < size; i += 2){
    cc = est_char_category(utext[i] * 0x100 + utext[i+1]);
    for(j = i + 2; j < size; j += 2){
      if(est_char_category(utext[j] * 0x100 + utext[j+1]) != cc) break;
    }
    switch(cc){
    case ESTDELIMCHR:
    case ESTWESTALPH:
      /* a run of delimiters or of west alphabets is one word */
      cblistpush(words, (char *)(utext + i), j - i);
      break;
    case ESTEASTALPH:
      /* a run of east alphabets yields overlapping pairs of characters, and the last
         character of the run alone */
      for(k = i; k < j; k += 2){
        if(j - k >= 4){
          cblistpush(words, (char *)(utext + k), 4);
        } else {
          cblistpush(words, (char *)(utext + k), 2);
        }
      }
      break;
    default:
      /* runs of any other category are dropped */
      break;
    }
    i = j - 2;
  }
  /* second pass: a single east-alphabet character that is not the last word is extended
     with the beginning of the next word: one character if the next word starts with an east
     alphabet, otherwise up to two */
  for(i = 0; i < CB_LISTNUM(words); i++){
    word = (unsigned char *)CB_LISTVAL2(words, i, &wsiz);
    if(est_char_category(word[0] * 0x100 + word[1]) == ESTEASTALPH && wsiz == 2 &&
       i < CB_LISTNUM(words) - 1){
      next = (unsigned char *)cblistval(words, i + 1, &nsiz);
      if(nsiz > 4) nsiz = 4;
      if(est_char_category(next[0] * 0x100 + next[1]) == ESTEASTALPH && nsiz > 2) nsiz = 2;
      CB_MALLOC(tmp, wsiz + nsiz + 1);
      memcpy(tmp, word, wsiz);
      memcpy(tmp + wsiz, next, nsiz);
      cblistover(words, i, tmp, wsiz + nsiz);
      free(tmp);
    }
  }
  /* third pass: convert each word and hand it over to the result list */
  for(i = 0; i < CB_LISTNUM(words); i++){
    word = (unsigned char *)CB_LISTVAL2(words, i, &wsiz);
    if(!tail && wsiz == 2 && i == CB_LISTNUM(words) - 1){
      if(est_char_category(word[0] * 0x100 + word[1]) == ESTEASTALPH) continue;
    }
    tmp = est_uconv_out((char *)word, wsiz, &tsiz);
    cblistpushbuf(list, tmp, tsiz);
  }
  cblistclose(words);
  free(utext);
}
1785
1786
/* Break a sentence of text and extract words using perfect N-gram analyzer.
   `text' specifies the string of a sentence of text.
   `list' specifies a list object to which the extracted words are appended.  Each word is
   the result of est_uconv_out (presumably the reverse of est_uconv_in; confirm against its
   definition) and is handed over to the list.
   `norm' specifies whether to normalize the text with est_normalize_text first.
   `tail' specifies whether to keep a single character at the very end of the word list.  If
   it is false, such a trailing word is dropped.
   Characters are classified with est_char_category_perfng, and only runs classified as
   ESTEASTALPH produce words. */
void est_break_text_perfng(const char *text, CBLIST *list, int norm, int tail){
  CBLIST *words;
  const unsigned char *word, *next;
  unsigned char *utext;
  char *tmp;
  int i, j, k, size, cc, wsiz, nsiz, tsiz;
  assert(text);
  /* the text is handled as a sequence of two-byte units, high byte first */
  utext = (unsigned char *)est_uconv_in(text, strlen(text), &size);
  if(norm) est_normalize_text(utext, size, &size);
  est_canonicalize_text(utext, size, FALSE);
  words = cblistopen();
  /* first pass: cut the text into runs of characters of the same category */
  for(i = 0; i < size; i += 2){
    cc = est_char_category_perfng(utext[i] * 0x100 + utext[i+1]);
    for(j = i + 2; j < size; j += 2){
      if(est_char_category_perfng(utext[j] * 0x100 + utext[j+1]) != cc) break;
    }
    switch(cc){
    case ESTEASTALPH:
      /* a run yields overlapping pairs of characters, and its last character alone */
      for(k = i; k < j; k += 2){
        if(j - k >= 4){
          cblistpush(words, (char *)(utext + k), 4);
        } else {
          cblistpush(words, (char *)(utext + k), 2);
        }
      }
      break;
    default:
      /* runs of any other category are dropped */
      break;
    }
    i = j - 2;
  }
  /* second pass: a single character that is not the last word is extended with the
     beginning of the next word: one character if the next word starts with an ESTEASTALPH
     character, otherwise up to two */
  for(i = 0; i < CB_LISTNUM(words); i++){
    word = (unsigned char *)CB_LISTVAL2(words, i, &wsiz);
    if(est_char_category_perfng(word[0] * 0x100 + word[1]) == ESTEASTALPH && wsiz == 2 &&
       i < CB_LISTNUM(words) - 1){
      next = (unsigned char *)cblistval(words, i + 1, &nsiz);
      if(nsiz > 4) nsiz = 4;
      if(est_char_category_perfng(next[0] * 0x100 + next[1]) == ESTEASTALPH && nsiz > 2) nsiz = 2;
      CB_MALLOC(tmp, wsiz + nsiz + 1);
      memcpy(tmp, word, wsiz);
      memcpy(tmp + wsiz, next, nsiz);
      cblistover(words, i, tmp, wsiz + nsiz);
      free(tmp);
    }
  }
  /* third pass: convert each word and hand it over to the result list */
  for(i = 0; i < CB_LISTNUM(words); i++){
    word = (unsigned char *)CB_LISTVAL2(words, i, &wsiz);
    if(!tail && wsiz == 2 && i == CB_LISTNUM(words) - 1){
      if(est_char_category_perfng(word[0] * 0x100 + word[1]) == ESTEASTALPH) continue;
    }
    tmp = est_uconv_out((char *)word, wsiz, &tsiz);
    cblistpushbuf(list, tmp, tsiz);
  }
  cblistclose(words);
  free(utext);
}
1844
1845
/* Convert the character encoding of a string.
   `ptr' specifies the pointer to a region.
   `size' specifies the size of the region.  If it is negative, the size is taken with
   strlen.
   `icode' specifies the name of the encoding of the input.
   `ocode' specifies the name of the encoding of the output.
   For both names the aliases "x-sjis", "x-ujis", "x-euc-jp" and "windows-31j" are replaced
   by "Shift_JIS", "EUC-JP", "EUC-JP" and "CP932".
   `sp' specifies the pointer to a variable to which the size of the result is assigned.  It
   may be NULL.
   `mp' specifies the pointer to a variable to which the number of skipped input bytes is
   assigned.  It may be NULL.
   The return value is a newly allocated region holding the result followed by one zero
   byte, which the caller releases with free, or NULL if the converter cannot be opened or
   closed.  The output buffer is five times the input size; if it runs out, or on any
   other unexpected error, the conversion stops and what has been converted is returned. */
char *est_iconv(const char *ptr, int size,
                const char *icode, const char *ocode, int *sp, int *mp){
  iconv_t ic;
  char *obuf, *wp, *rp;
  size_t isiz, osiz;
  int miss;
  assert(ptr && icode && ocode);
  if(size < 0) size = strlen(ptr);
  if(icode[0] == 'x' && icode[1] == '-'){
    if(!cbstricmp(icode, "x-sjis")){
      icode = "Shift_JIS";
    } else if(!cbstricmp(icode, "x-ujis")){
      icode = "EUC-JP";
    } else if(!cbstricmp(icode, "x-euc-jp")){
      icode = "EUC-JP";
    }
  } else if(icode[0] == 'w' || icode[0] == 'W'){
    if(!cbstricmp(icode, "windows-31j")){
      icode = "CP932";
    }
  }
  if(ocode[0] == 'x' && ocode[1] == '-'){
    if(!cbstricmp(ocode, "x-sjis")){
      ocode = "Shift_JIS";
    } else if(!cbstricmp(ocode, "x-ujis")){
      ocode = "EUC-JP";
    } else if(!cbstricmp(ocode, "x-euc-jp")){
      ocode = "EUC-JP";
    }
  } else if(ocode[0] == 'w' || ocode[0] == 'W'){
    if(!cbstricmp(ocode, "windows-31j")){
      ocode = "CP932";
    }
  }
  if((ic = iconv_open(ocode, icode)) == (iconv_t)-1) return NULL;
  isiz = size;
  osiz = isiz * 5;
  CB_MALLOC(obuf, osiz + 1);
  wp = obuf;
  rp = (char *)ptr;
  miss = 0;
  while(isiz > 0){
    /* the cast of the input pointer accepts both the `char **' and the `const char **'
       flavor of the prototype of iconv */
    if(iconv(ic, (void *)&rp, &isiz, &wp, &osiz) == (size_t)-1){
      if(errno == EILSEQ && (*rp == 0x5c || *rp == 0x7e)){
        /* a backslash or a tilde that the converter rejects is copied as it is; the copy
           consumes output space, so it is refused when none is left and is accounted for
           in `osiz' to keep the space told to iconv in step with `wp' */
        if(osiz < 1) break;
        *wp = *rp;
        wp++;
        osiz--;
        rp++;
        isiz--;
      } else if(errno == EILSEQ || errno == EINVAL){
        /* skip one byte of an invalid or incomplete sequence and count it */
        rp++;
        isiz--;
        miss++;
      } else {
        break;
      }
    }
  }
  *wp = '\0';
  if(sp) *sp = wp - obuf;
  if(mp) *mp = miss;
  if(iconv_close(ic) == -1){
    free(obuf);
    return NULL;
  }
  return obuf;
}
1913
1914
1915 /* Detect the encoding of a string automatically. */
1916 const char *est_enc_name(const char *ptr, int size, int plang){
1917 const char *hypo;
1918 int i, miss, cr;
1919 assert(ptr);
1920 if(size < 0) size = strlen(ptr);
1921 if(size > ESTICCHECKSIZ) size = ESTICCHECKSIZ;
1922 if(size >= 2 && (!memcmp(ptr, "\xfe\xff", 2) || !memcmp(ptr, "\xff\xfe", 2))) return "UTF-16";
1923 for(i = 0; i < size - 1; i += 2){
1924 if(ptr[i] == 0 && ptr[i+1] != 0) return "UTF-16BE";
1925 if(ptr[i+1] == 0 && ptr[i] != 0) return "UTF-16LE";
1926 }
1927 switch(plang){
1928 case ESTLANGEN:
1929 if(est_enc_miss(ptr, size, "US-ASCII", "UTF-16BE") < 1) return "US-ASCII";
1930 if(est_enc_miss(ptr, size, "UTF-8", "UTF-16BE") < 1) return "UTF-8";
1931 return "ISO-8859-1";
1932 case ESTLANGJA:
1933 for(i = 0; i < size - 3; i++){
1934 if(ptr[i] == 0x1b){
1935 i++;
1936 if(ptr[i] == '(' && strchr("BJHI", ptr[i+1])) return "ISO-2022-JP";
1937 if(ptr[i] == '$' && strchr("@B(", ptr[i+1])) return "ISO-2022-JP";
1938 }
1939 }
1940 if(est_enc_miss(ptr, size, "US-ASCII", "UTF-16BE") < 1) return "US-ASCII";
1941 if(est_enc_miss(ptr, size, "UTF-8", "UTF-16BE") < 1) return "UTF-8";
1942 hypo = NULL;
1943 cr = FALSE;
1944 for(i = 0; i < size; i++){
1945 if(ptr[i] == 0xd){
1946 cr = TRUE;
1947 break;
1948 }
1949 }
1950 if(cr){
1951 if((miss = est_enc_miss(ptr, size, "Shift_JIS", "EUC-JP")) < 1) return "Shift_JIS";
1952 if(!hypo && miss / (double)size <= ESTICALLWRAT) hypo = "Shift_JIS";
1953 if((miss = est_enc_miss(ptr, size, "EUC-JP", "UTF-16BE")) < 1) return "EUC-JP";
1954 if(!hypo && miss / (double)size <= ESTICALLWRAT) hypo = "EUC-JP";
1955 } else {
1956 if((miss = est_enc_miss(ptr, size, "EUC-JP", "UTF-16BE")) < 1) return "EUC-JP";
1957 if(!hypo && miss / (double)size <= ESTICALLWRAT) hypo = "EUC-JP";
1958 if((miss = est_enc_miss(ptr, size, "Shift_JIS", "EUC-JP")) < 1) return "Shift_JIS";
1959 if(!hypo && miss / (double)size <= ESTICALLWRAT) hypo = "Shift_JIS";
1960 }
1961 if((miss = est_enc_miss(ptr, size, "UTF-8", "UTF-16BE")) < 1) return "UTF-8";
1962 if(!hypo && miss / (double)size <= ESTICALLWRAT) hypo = "UTF-8";
1963 if((miss = est_enc_miss(ptr, size, "CP932", "UTF-16BE")) < 1) return "CP932";
1964 if(!hypo && miss / (double)size <= ESTICALLWRAT) hypo = "CP932";
1965 return hypo ? hypo : "ISO-8859-1";
1966 case ESTLANGZH:
1967 if(est_enc_miss(ptr, size, "US-ASCII", "UTF-16BE") < 1) return "US-ASCII";
1968 if(est_enc_miss(ptr, size, "UTF-8", "UTF-16BE") < 1) return "UTF-8";
1969 if(est_enc_miss(ptr, size, "EUC-CN", "UTF-16BE") < 1) return "EUC-CN";
1970 if(est_enc_miss(ptr, size, "BIG5", "UTF-16BE") < 1) return "BIG5";
1971 return "ISO-8859-1";
1972 case ESTLANGKO:
1973 if(est_enc_miss(ptr, size, "US-ASCII", "UTF-16BE") < 1) return "US-ASCII";
1974 if(est_enc_miss(ptr, size, "UTF-8", "UTF-16BE") < 1) return "UTF-8";
1975 if(est_enc_miss(ptr, size, "EUC-KR", "UTF-16BE") < 1) return "EUC-KR";
1976 return "ISO-8859-1";
1977 default:
1978 break;
1979 }
1980 return "ISO-8859-1";
1981 }
1982
1983
/* Convert a UTF-8 string into UTF-16BE.
   `ptr' specifies the pointer to a region of UTF-8 text.
   `size' specifies the size of the region.
   `sp' specifies the pointer to a variable to which the size of the result is assigned.
   The return value is the pointer to the region of the result, allocated with `CB_MALLOC'.
   A zero byte is appended after the last unit.  Sequences of four or more bytes are replaced
   by `?'.  Conversion stops at a truncated sequence or at a byte of 0xfe or 0xff. */
char *est_uconv_in(const char *ptr, int size, int *sp){
  const unsigned char *rp, *end;
  char *rbuf, *wp;
  assert(ptr && size >= 0 && sp);
  rp = (unsigned char *)ptr;
  end = (unsigned char *)ptr + size;
  /* every input byte yields at most two output bytes */
  CB_MALLOC(rbuf, size * 2 + 1);
  wp = rbuf;
  while(rp < end){
    if(*rp < 0x80){
      /* one byte: U+0000 to U+007F, including DEL */
      *(wp++) = 0x00;
      *(wp++) = *rp;
      rp += 1;
    } else if(*rp < 0xe0){
      /* two bytes: lead bytes up to 0xdf; a stray trailing byte is also decoded here */
      if(rp >= end - 1) break;
      *(wp++) = (rp[0] & 0x1f) >> 2;
      *(wp++) = (rp[0] << 6) | (rp[1] & 0x3f);
      rp += 2;
    } else if(*rp < 0xf0){
      /* three bytes */
      if(rp >= end - 2) break;
      *(wp++) = (rp[0] << 4) | ((rp[1] & 0x3f) >> 2);
      *(wp++) = (rp[1] << 6) | (rp[2] & 0x3f);
      rp += 3;
    } else if(*rp < 0xf8){
      /* four bytes: not expressible in one unit */
      if(rp >= end - 3) break;
      *(wp++) = 0x00;
      *(wp++) = '?';
      rp += 4;
    } else if(*rp < 0xfc){
      /* five bytes */
      if(rp >= end - 4) break;
      *(wp++) = 0x00;
      *(wp++) = '?';
      rp += 5;
    } else if(*rp < 0xfe){
      /* six bytes */
      if(rp >= end - 5) break;
      *(wp++) = 0x00;
      *(wp++) = '?';
      rp += 6;
    } else {
      break;
    }
  }
  *wp = '\0';
  *sp = wp - rbuf;
  return rbuf;
}
2030
2031
/* Convert a UTF-16BE string into UTF-8.
   `ptr' specifies the pointer to a region of UTF-16BE text.
   `size' specifies the size of the region.  If it is odd, the last byte is ignored.
   `sp' specifies the pointer to a variable to which the size of the result is assigned.  If it
   is `NULL', it is not used.
   The return value is the pointer to the region of the result, allocated with `CB_MALLOC'.
   A zero byte is appended at the end.  Each unit is encoded on its own; surrogate pairs are
   not combined. */
char *est_uconv_out(const char *ptr, int size, int *sp){
  const unsigned char *rp;
  char *rbuf, *wp;
  int c;
  assert(ptr && size >= 0);
  if(size % 2 != 0) size--;
  rp = (unsigned char *)ptr;
  /* two input bytes yield at most three output bytes */
  CB_MALLOC(rbuf, size * 2 + 1);
  wp = rbuf;
  while(rp < (unsigned char *)ptr + size){
    c = rp[0] * 0x100 + rp[1];
    if(c < 0x0080){
      *(wp++) = rp[1];
    } else if(c < 0x0800){
      /* eleven bits is the most that two bytes can carry */
      *(wp++) = 0xc0 | (rp[0] << 2) | ((rp[1] >> 6) & 0x03);
      *(wp++) = 0x80 | (rp[1] & 0x3f);
    } else {
      *(wp++) = 0xe0 | ((rp[0] >> 4) & 0x0f);
      *(wp++) = 0x80 | ((rp[0] & 0x0f) << 2) | ((rp[1] >> 6) & 0x03);
      *(wp++) = 0x80 | (rp[1] & 0x3f);
    }
    rp += 2;
  }
  *wp = '\0';
  if(sp) *sp = wp - rbuf;
  return rbuf;
}
2060
2061
2062 /* Compress a serial object with ZLIB. */
2063 char *est_deflate(const char *ptr, int size, int *sp){
2064 z_stream zs;
2065 char *buf;
2066 unsigned char obuf[ESTIOBUFSIZ];
2067 int rv, asiz, bsiz, osiz;
2068 assert(ptr && sp);
2069 if(size < 0) size = strlen(ptr);
2070 zs.zalloc = Z_NULL;
2071 zs.zfree = Z_NULL;
2072 zs.opaque = Z_NULL;
2073 if(deflateInit(&zs, ESTZCOMPLEVEL) != Z_OK) return NULL;
2074 asiz = ESTIOBUFSIZ;
2075 CB_MALLOC(buf, asiz);
2076 bsiz = 0;
2077 zs.next_in = (unsigned char *)ptr;
2078 zs.avail_in = size;
2079 zs.next_out = obuf;
2080 zs.avail_out = ESTIOBUFSIZ;
2081 while((rv = deflate(&zs, Z_FINISH)) == Z_OK){
2082 osiz = ESTIOBUFSIZ - zs.avail_out;
2083 if(bsiz + osiz > asiz){
2084 asiz = asiz * 2 + osiz;
2085 CB_REALLOC(buf, asiz);
2086 }
2087 memcpy(buf + bsiz, obuf, osiz);
2088 bsiz += osiz;
2089 zs.next_out = obuf;
2090 zs.avail_out = ESTIOBUFSIZ;
2091 }
2092 if(rv != Z_STREAM_END){
2093 free(buf);
2094 deflateEnd(&zs);
2095 return NULL;
2096 }
2097 osiz = ESTIOBUFSIZ - zs.avail_out;
2098 if(bsiz + osiz > asiz){
2099 asiz = asiz * 2 + osiz;
2100 CB_REALLOC(buf, asiz);
2101 }
2102 memcpy(buf + bsiz, obuf, osiz);
2103 bsiz += osiz;
2104 *sp = bsiz;
2105 deflateEnd(&zs);
2106 return buf;
2107 }
2108
2109
2110 /* Decompress a serial object compressed with ZLIB. */
2111 char *est_inflate(const char *ptr, int size, int *sp){
2112 z_stream zs;
2113 char *buf;
2114 unsigned char obuf[ESTIOBUFSIZ];
2115 int rv, asiz, bsiz, osiz;
2116 assert(ptr && size >= 0 && sp);
2117 zs.zalloc = Z_NULL;
2118 zs.zfree = Z_NULL;
2119 zs.opaque = Z_NULL;
2120 if(inflateInit(&zs) != Z_OK) return NULL;
2121 asiz = ESTIOBUFSIZ;
2122 CB_MALLOC(buf, asiz);
2123 bsiz = 0;
2124 zs.next_in = (unsigned char *)ptr;
2125 zs.avail_in = size;
2126 zs.next_out = obuf;
2127 zs.avail_out = ESTIOBUFSIZ;
2128 while((rv = inflate(&zs, Z_NO_FLUSH)) == Z_OK){
2129 osiz = ESTIOBUFSIZ - zs.avail_out;
2130 if(bsiz + osiz >= asiz){
2131 asiz = asiz * 2 + osiz;
2132 CB_REALLOC(buf, asiz);
2133 }
2134 memcpy(buf + bsiz, obuf, osiz);
2135 bsiz += osiz;
2136 zs.next_out = obuf;
2137 zs.avail_out = ESTIOBUFSIZ;
2138 }
2139 if(rv != Z_STREAM_END){
2140 free(buf);
2141 inflateEnd(&zs);
2142 return NULL;
2143 }
2144 osiz = ESTIOBUFSIZ - zs.avail_out;
2145 if(bsiz + osiz >= asiz){
2146 asiz = asiz * 2 + osiz;
2147 CB_REALLOC(buf, asiz);
2148 }
2149 memcpy(buf + bsiz, obuf, osiz);
2150 bsiz += osiz;
2151 buf[bsiz] = '\0';
2152 if(sp) *sp = bsiz;
2153 inflateEnd(&zs);
2154 return buf;
2155 }
2156
2157
2158 /* Get the border string for draft data of documents. */
2159 const char *est_border_str(void){
2160 static int first = TRUE;
2161 static char border[ESTPATHBUFSIZ];
2162 int t, p;
2163 if(first){
2164 t = (int)(time(NULL) + est_random() * INT_MAX);
2165 p = (int)(getpid() + est_random() * INT_MAX);
2166 sprintf(border, "--------[%08X%08X]--------",
2167 dpouterhash((char *)&t, sizeof(int)), dpouterhash((char *)&p, sizeof(int)));
2168 first = FALSE;
2169 }
2170 return border;
2171 }
2172
2173
2174 /* Get the real random number. */
2175 double est_random(void){
2176 static int first = TRUE;
2177 int num;
2178 if(first && !est_random_ifp){
2179 if((est_random_ifp = fopen("/dev/urandom", "rb")) != NULL){
2180 atexit(est_random_fclose);
2181 } else {
2182 srand(getpid());
2183 }
2184 first = FALSE;
2185 }
2186 if(est_random_ifp){
2187 fread(&num, sizeof(int), 1, est_random_ifp);
2188 return (num & 0x7fffffff) / (double)0x7fffffff;
2189 }
2190 return rand() / (double)RAND_MAX;
2191 }
2192
2193
/* Get the random number in normal distribution.
   The return value is a number derived from two calls of `est_random', shifted and scaled
   by (x + 6.0) / 12.0 and then limited to the range from 0.0 to 1.0. */
double est_random_nd(void){
  double radius, angle, val;
  radius = sqrt(-2 * log(1.0 - est_random()));
  angle = 3.1415926535 * 2 * est_random();
  val = (radius * cos(angle) + 6.0) / 12.0;
  if(val > 1.0) val = 1.0;
  if(val < 0.0) val = 0.0;
  return val;
}
2202
2203
2204 /* Get an MD5 hash string of a key string. */
2205 char *est_make_crypt(const char *key){
2206 md5_state_t ms;
2207 char digest[32], str[64], *wp;
2208 int i;
2209 assert(key);
2210 md5_init(&ms);
2211 md5_append(&ms, (md5_byte_t *)key, strlen(key));
2212 md5_finish(&ms, (md5_byte_t *)digest);
2213 wp = str;
2214 for(i = 0; i < 16; i++){
2215 wp += sprintf(wp, "%02x", ((unsigned char *)digest)[i]);
2216 }
2217 return cbmemdup(str, -1);
2218 }
2219
2220
/* Check whether a key matches an MD5 hash string.
   `key' specifies a string to be checked.
   `hash' specifies a hash string as `est_make_crypt' produces.
   The return value is true if the hash of the key equals `hash', else it is false. */
int est_match_crypt(const char *key, const char *hash){
  char *made;
  int same;
  assert(key && hash);
  made = est_make_crypt(key);
  same = strcmp(made, hash) == 0;
  free(made);
  return same;
}
2231
2232
2233 /* Get the hidden texts of a document object. */
2234 const char *est_doc_hidden_texts(ESTDOC *doc){
2235 const char *rv;
2236 assert(doc);
2237 rv = doc->attrs ? cbmapget(doc->attrs, "", 0, NULL) : NULL;
2238 return rv ? rv : "";
2239 }
2240
2241
2242 /* Get the phrase of a condition object. */
2243 const char *est_cond_phrase(ESTCOND *cond){
2244 assert(cond);
2245 return cond->phrase;
2246 }
2247
2248
2249 /* Get a list object of attribute expressions of a condition object. */
2250 const CBLIST *est_cond_attrs(ESTCOND *cond){
2251 assert(cond);
2252 return cond->attrs;
2253 }
2254
2255
2256 /* Get the order expression of a condition object. */
2257 const char *est_cond_order(ESTCOND *cond){
2258 assert(cond);
2259 return cond->order;
2260 }
2261
2262
2263 /* Get the maximum number of retrieval of a condition object. */
2264 int est_cond_max(ESTCOND *cond){
2265 assert(cond);
2266 return cond->max;
2267 }
2268
2269
2270 /* Get the options of a condition object. */
2271 int est_cond_options(ESTCOND *cond){
2272 assert(cond);
2273 return cond->opts;
2274 }
2275
2276
2277 /* Get the score of a document corresponding to a condition object. */
2278 int est_cond_score(ESTCOND *cond, int index){
2279 assert(cond);
2280 if(!cond->scores || index < 0 || index >= cond->snum) return -1;
2281 return cond->scores[index];
2282 }
2283
2284
2285 /* Set the error code of a database. */
2286 void est_db_set_ecode(ESTDB *db, int ecode){
2287 assert(db);
2288 db->ecode = ecode;
2289 }
2290
2291
2292 /* Edit attributes of a document object in a database. */
2293 int est_db_edit_doc(ESTDB *db, ESTDOC *doc){
2294 const char *uri;
2295 char *sbuf;
2296 int err, id, ssiz;
2297 assert(db && doc);
2298 if(!dpwritable(db->metadb)){
2299 db->ecode = ESTEACCES;
2300 return FALSE;
2301 }
2302 if(!doc->attrs || !(uri = cbmapget(doc->attrs, ESTDATTRURI, -1, NULL)) || doc->id < 1){
2303 db->ecode = ESTEINVAL;
2304 return FALSE;
2305 }
2306 if((id = est_db_uri_to_id(db, uri)) > 0 && id != doc->id){
2307 db->ecode = ESTEINVAL;
2308 return FALSE;
2309 }
2310 err = FALSE;
2311 sbuf = cbmapdump(doc->attrs, &ssiz);
2312 if(!crput(db->attrdb, (char *)&(doc->id), sizeof(int), sbuf, ssiz, CR_DOVER)){
2313 db->ecode = ESTEDB;
2314 db->fatal = TRUE;
2315 err = TRUE;
2316 }
2317 free(sbuf);
2318 if(db->spacc) cbmapout(db->spacc, (char *)&(doc->id), sizeof(int));
2319 return err ? FALSE : TRUE;
2320 }
2321
2322
2323 /* Add a piece of meta data to a database. */
2324 void est_db_add_meta(ESTDB *db, const char *name, const char *value){
2325 assert(db && name);
2326 if(!dpwritable(db->metadb)){
2327 db->ecode = ESTEACCES;
2328 return;
2329 }
2330 if(!db->metacc) est_db_prepare_meta(db);
2331 if(value){
2332 cbmapput(db->metacc, name, -1, value, -1, TRUE);
2333 } else {
2334 cbmapout(db->metacc, name, -1);
2335 }
2336 }
2337
2338
2339 /* Get a list of names of meta data of a database. */
2340 CBLIST *est_db_meta_names(ESTDB *db){
2341 assert(db);
2342 if(!db->metacc) est_db_prepare_meta(db);
2343 return cbmapkeys(db->metacc);
2344 }
2345
2346
2347 /* Get the value of a piece of meta data of a database. */
2348 char *est_db_meta(ESTDB *db, const char *name){
2349 const char *vbuf;
2350 int vsiz;
2351 assert(db && name);
2352 if(!db->metacc) est_db_prepare_meta(db);
2353 if(!(vbuf = cbmapget(db->metacc, name, -1, &vsiz))) return NULL;
2354 return cbmemdup(vbuf, vsiz);
2355 }
2356
2357
2358 /* Get the number of records in the cache memory of a database. */
2359 int est_db_cache_num(ESTDB *db){
2360 assert(db);
2361 return cbmaprnum(db->idxcc);
2362 }
2363
2364
2365 /* Set the callback function for database events. */
2366 void est_db_set_informer(ESTDB *db, void (*func)(const char *)){
2367 assert(db && func);
2368 db->cbinfo = func;
2369 est_db_inform(db, "status");
2370 }
2371
2372
2373 /* Set the callback function to create a vector of keywords of a document. */
2374 void est_db_set_vectorizer(ESTDB *db, CBMAP *(*func)(void *, int, void *), void *data){
2375 assert(db && func);
2376 db->cbvec = func;
2377 db->vecdata = data;
2378 }
2379
2380
2381 /* Fill the cache for keys for TF-IDF. */
2382 void est_db_fill_key_cache(ESTDB *db){
2383 char *kbuf, *msg;
2384 int i, ksiz, vsiz;
2385 assert(db);
2386 vlcurfirst(db->fwmdb);
2387 for(i = 0; (kbuf = vlcurkey(db->fwmdb, &ksiz)) != NULL; i++){
2388 vsiz = est_idx_vsiz(db->idxdb, kbuf, ksiz);
2389 cbmapput(db->keycc, kbuf, ksiz, (char *)&vsiz, sizeof(int), TRUE);
2390 free(kbuf);
2391 vlcurnext(db->fwmdb);
2392 if(i % ESTCCCBFREQ == 0){
2393 msg = cbsprintf("filling the key cache for TF-IDF (%d)", i + 1);
2394 est_db_inform(db, msg);
2395 free(msg);
2396 }
2397 }
2398 db->kcmnum = -1;
2399 }
2400
2401
2402 /* Make a directory. */
2403 int est_mkdir(const char *path){
2404 #if defined(_SYS_MSVC_) || defined(_SYS_MINGW_)
2405 return mkdir(path) == 0 ? TRUE : FALSE;
2406 #else
2407 assert(path);
2408 return mkdir(path, ESTDIRMODE) == 0 ? TRUE : FALSE;
2409 #endif
2410 }
2411
2412
2413 /* Remove a directory and its contents recursively. */
2414 int est_rmdir_rec(const char *path){
2415 CBLIST *files;
2416 const char *file;
2417 char pbuf[ESTPATHBUFSIZ];
2418 int i;
2419 assert(path);
2420 if((files = cbdirlist(path)) != NULL){
2421 for(i = 0; i < cblistnum(files); i++){
2422 file = cblistval(files, i, NULL);
2423 if(!strcmp(file, ESTCDIRSTR) || !strcmp(file, ESTPDIRSTR)) continue;
2424 sprintf(pbuf, "%s%c%s", path, ESTPATHCHR, file);
2425 if(unlink(pbuf) == -1) est_rmdir_rec(pbuf);
2426 }
2427 cblistclose(files);
2428 }
2429 return rmdir(path) == 0;
2430 }
2431
2432
2433 /* Get the canonicalized absolute pathname of a file. */
2434 char *est_realpath(const char *path){
2435 #if defined(_SYS_MSVC_) || defined(_SYS_MINGW_)
2436 char pbuf[ESTPATHBUFSIZ], *p;
2437 if(GetFullPathName(path, ESTPATHBUFSIZ, pbuf, &p) == 0) sprintf(pbuf, "%s", path);
2438 return cbmemdup(pbuf, -1);
2439 #else
2440 char pbuf[ESTPATHBUFSIZ*2];
2441 assert(path);
2442 if(!realpath(path, pbuf)) sprintf(pbuf, "%s", path);
2443 return cbmemdup(pbuf, -1);
2444 #endif
2445 }
2446
2447
2448 /* Get the time of day in milliseconds. */
2449 double est_gettimeofday(void){
2450 #if defined(_SYS_MSVC_) || defined(_SYS_MINGW_)
2451 SYSTEMTIME st;
2452 struct tm ts;
2453 GetLocalTime(&st);
2454 memset(&ts, 0, sizeof(struct tm));
2455 ts.tm_year = st.wYear - 1900;
2456 ts.tm_mon = st.wMonth - 1;
2457 ts.tm_mday = st.wDay;
2458 ts.tm_hour = st.wHour;
2459 ts.tm_min = st.wMinute;
2460 ts.tm_sec = st.wSecond;
2461 return (double)mktime(&ts) * 1000 + (double)st.wMilliseconds;
2462 #else
2463 struct timeval tv;
2464 struct timezone tz;
2465 if(gettimeofday(&tv, &tz) == -1) return 0.0;
2466 return (double)tv.tv_sec * 1000 + (double)tv.tv_usec / 1000;
2467 #endif
2468 }
2469
2470
/* Suspend execution for microsecond intervals.
   `usec' specifies the interval in microseconds.  On MSVC and MinGW it is divided by 1000
   and passed to `Sleep', so the fraction under a millisecond is dropped. */
void est_usleep(unsigned long usec){
#if defined(_SYS_MSVC_) || defined(_SYS_MINGW_)
  unsigned long msec;
  msec = usec / 1000;
  Sleep(msec);
#else
  usleep(usec);
#endif
}
2479
2480
/* Send a signal to a process.
   `pid' specifies the ID of a process.
   `sig' specifies a signal number.
   The return value is true if `kill' succeeds, else it is false.  On MSVC and MinGW nothing
   is sent and the return value is always false. */
int est_kill(int pid, int sig){
#if defined(_SYS_MSVC_) || defined(_SYS_MINGW_)
  return FALSE;
#else
  int rv;
  rv = kill(pid, sig);
  return rv == 0;
#endif
}
2489
2490
/* Get the media type of an extension.
   `ext' specifies the extension of a file path, with the leading period.  It is compared
   without regard to case.
   The return value is the media type of the extension, or "application/octet-stream" if the
   extension is not known. */
const char *est_ext_type(const char *ext){
  /* pairs of an extension and its media type; for a duplicated extension the first wins */
  static const char *list[] = {
    ".txt", "text/plain", ".txt.en", "text/plain",
    ".txt.ja", "text/plain", ".asc", "text/plain",
    ".in", "text/plain", ".c", "text/plain",
    ".h", "text/plain", ".cc", "text/plain",
    ".java", "text/plain", ".sh", "text/plain",
    ".pl", "text/plain", ".py", "text/plain",
    ".rb", "text/plain", ".idl", "text/plain",
    ".csv", "text/plain", ".log", "text/plain",
    ".conf", "text/plain", ".rc", "text/plain",
    ".ini", "text/plain", ".html", "text/html",
    ".htm", "text/html", ".xhtml", "text/html",
    ".xht", "text/html", ".css", "text/css",
    ".js", "text/javascript", ".tsv", "text/tab-separated-values",
    ".eml", "message/rfc822", ".mime", "message/rfc822",
    ".mht", "message/rfc822", ".mhtml", "message/rfc822",
    ".sgml", "application/sgml", ".sgm", "application/sgml",
    ".xml", "application/xml", ".xsl", "application/xml",
    ".xslt", "application/xslt+xml", ".xhtml", "application/xhtml+xml",
    ".xht", "application/xhtml+xml", ".rdf", "application/rdf+xml",
    ".rss", "application/rss+xml", ".dtd", "application/xml-dtd",
    ".rtf", "application/rtf", ".pdf", "application/pdf",
    ".ps", "application/postscript", ".eps", "application/postscript",
    ".doc", "application/msword", ".xls", "application/vnd.ms-excel",
    ".ppt", "application/vnd.ms-powerpoint", ".xdw", "application/vnd.fujixerox.docuworks",
    ".swf", "application/x-shockwave-flash", ".zip", "application/zip",
    ".tar", "application/x-tar", ".gz", "application/x-gzip",
    ".bz2", "application/octet-stream", ".z", "application/octet-stream",
    ".lha", "application/octet-stream", ".lzh", "application/octet-stream",
    ".cab", "application/octet-stream", ".rar", "application/octet-stream",
    ".sit", "application/octet-stream", ".bin", "application/octet-stream",
    ".o", "application/octet-stream", ".so", "application/octet-stream",
    ".exe", "application/octet-stream", ".dll", "application/octet-stream",
    ".class", "application/octet-stream", ".png", "image/png",
    ".gif", "image/gif", ".jpg", "image/jpeg",
    ".jpeg", "image/jpeg", ".tif", "image/tiff",
    ".tiff", "image/tiff", ".bmp", "image/bmp",
    ".au", "audio/basic", ".snd", "audio/basic",
    ".mid", "audio/midi", ".midi", "audio/midi",
    ".mp2", "audio/mpeg", ".mp3", "audio/mpeg",
    ".wav", "audio/x-wav", ".mpg", "video/mpeg",
    ".mpeg", "video/mpeg", ".qt", "video/quicktime",
    ".mov", "video/quicktime", ".avi", "video/x-msvideo",
    NULL
  };
  int i;
  assert(ext);
  /* step over whole pairs so that only the extensions are compared */
  for(i = 0; list[i]; i += 2){
    if(!cbstricmp(ext, list[i])) return list[i+1];
  }
  return "application/octet-stream";
}
2545
2546
2547
2548 /*************************************************************************************************
2549 * private objects
2550 *************************************************************************************************/
2551
2552
2553 /* Count the number of missing characters when converting.
2554 `ptr' specifies the pointer to a region.
2555 `size' specifies the size of the region.
2556 `icode' specifies the name of encoding of the input string.
2557 `ocode' specifies the name of encoding of the output string.
2558 The return value is the number of missing characters. */
2559 static int est_enc_miss(const char *ptr, int size, const char *icode, const char *ocode){
2560 iconv_t ic;
2561 char obuf[ESTICCHECKSIZ], *wp, *rp;
2562 size_t isiz, osiz;
2563 int miss;
2564 assert(ptr && size >= 0 && icode && ocode);
2565 isiz = size;
2566 if((ic = iconv_open(ocode, icode)) == (iconv_t)-1) return ESTICMISSMAX;
2567 miss = 0;
2568 rp = (char *)ptr;
2569 while(isiz > 0){
2570 osiz = ESTICCHECKSIZ;
2571 wp = obuf;
2572 if(iconv(ic, (void *)&rp, &isiz, &wp, &osiz) == -1){
2573 if(errno == EILSEQ || errno == EINVAL){
2574 rp++;
2575 isiz--;
2576 miss++;
2577 if(miss >= ESTICMISSMAX) break;
2578 } else {
2579 break;
2580 }
2581 }
2582 }
2583 if(iconv_close(ic) == -1) return ESTICMISSMAX;
2584 return miss;
2585 }
2586
2587
/* Normalize a text.
   `utext' specifies a text whose encoding is UTF-16BE.  It is modified in place.
   `size' specifies the size of the text.
   `sp' specifies the pointer to a variable to which the size of the result is assigned.
   Control characters and several kinds of spaces become U+0020, fullwidth alphabets and
   numbers become their ASCII forms, and halfwidth katakana becomes fullwidth katakana.  A
   halfwidth voiced or semi-voiced sound mark is merged into the preceding character where a
   combined form exists, so the result can be shorter than the input. */
static void est_normalize_text(unsigned char *utext, int size, int *sp){
  int i, wi, hi, lo, ohi, olo, mark;
  assert(utext && size >= 0 && sp);
  wi = 0;
  for(i = 0; i < size - 1; i += 2){
    /* read the unit before anything is written, because `wi' may be equal to `i' */
    hi = utext[i];
    lo = utext[i+1];
    ohi = hi;
    olo = lo;
    if(hi == 0x00){
      if(lo <= 0x08 || (lo >= 0x0e && lo <= 0x1f) || lo == 0xa0){
        /* control characters and no-break space */
        olo = 0x20;
      }
    } else if(hi == 0x20){
      if(lo == 0x02 || lo == 0x03 || lo == 0x09){
        /* en space, em space, and thin space */
        ohi = 0x00;
        olo = 0x20;
      }
    } else if(hi == 0x30){
      if(lo == 0x00){
        /* fullwidth space */
        ohi = 0x00;
        olo = 0x20;
      }
    } else if(hi == 0xff){
      /* low byte of the next unit if it is also in U+FFxx, to find a sound mark */
      mark = (i + 2 < size - 1 && utext[i+2] == 0xff) ? utext[i+3] : 0;
      if(lo >= 0x21 && lo <= 0x3a){
        /* fullwidth alphabets */
        ohi = 0x00;
        olo = lo - 0x21 + 0x41;
      } else if(lo >= 0x41 && lo <= 0x5a){
        /* fullwidth small alphabets */
        ohi = 0x00;
        olo = lo - 0x41 + 0x61;
      } else if(lo >= 0x10 && lo <= 0x19){
        /* fullwidth numbers */
        ohi = 0x00;
        olo = lo - 0x10 + 0x30;
      } else if(lo >= 0x61 && lo <= 0x9d){
        /* halfwidth punctuation and katakana: all of them move to U+30xx */
        ohi = 0x30;
        if(lo == 0x61){
          /* halfwidth full stop */
          olo = 0x02;
        } else if(lo == 0x62){
          /* halfwidth left corner */
          olo = 0x0c;
        } else if(lo == 0x63){
          /* halfwidth right corner */
          olo = 0x0d;
        } else if(lo == 0x64){
          /* halfwidth comma */
          olo = 0x01;
        } else if(lo == 0x65){
          /* halfwidth middle dot */
          olo = 0xfb;
        } else if(lo == 0x66){
          /* halfwidth wo */
          olo = 0xf2;
        } else if(lo <= 0x6b){
          /* halfwidth small a-o */
          olo = (lo - 0x67) * 2 + 0xa1;
        } else if(lo <= 0x6e){
          /* halfwidth small ya-yo */
          olo = (lo - 0x6c) * 2 + 0xe3;
        } else if(lo == 0x6f){
          /* halfwidth small tu */
          olo = 0xc3;
        } else if(lo == 0x70){
          /* halfwidth prolonged mark */
          olo = 0xfc;
        } else if(lo <= 0x75){
          /* halfwidth a-o; only u has a voiced form */
          olo = (lo - 0x71) * 2 + 0xa2;
          if(lo == 0x73 && mark == 0x9e){
            olo = 0xf4;
            i += 2;
          }
        } else if(lo <= 0x7a){
          /* halfwidth ka-ko */
          olo = (lo - 0x76) * 2 + 0xab;
          if(mark == 0x9e){
            olo += 1;
            i += 2;
          }
        } else if(lo <= 0x7f){
          /* halfwidth sa-so */
          olo = (lo - 0x7b) * 2 + 0xb5;
          if(mark == 0x9e){
            olo += 1;
            i += 2;
          }
        } else if(lo <= 0x84){
          /* halfwidth ta-to; small tu sits between ti and tu in the fullwidth block */
          olo = (lo - 0x80) * 2 + 0xbf + (lo >= 0x82 ? 1 : 0);
          if(mark == 0x9e){
            olo += 1;
            i += 2;
          }
        } else if(lo <= 0x89){
          /* halfwidth na-no */
          olo = lo - 0x85 + 0xca;
        } else if(lo <= 0x8e){
          /* halfwidth ha-ho; voiced and semi-voiced forms follow each base character */
          olo = (lo - 0x8a) * 3 + 0xcf;
          if(mark == 0x9e){
            olo += 1;
            i += 2;
          } else if(mark == 0x9f){
            olo += 2;
            i += 2;
          }
        } else if(lo <= 0x93){
          /* halfwidth ma-mo */
          olo = lo - 0x8f + 0xde;
        } else if(lo <= 0x96){
          /* halfwidth ya-yo */
          olo = (lo - 0x94) * 2 + 0xe4;
        } else if(lo <= 0x9b){
          /* halfwidth ra-ro */
          olo = lo - 0x97 + 0xe9;
        } else if(lo == 0x9c){
          /* halfwidth wa */
          olo = 0xef;
        } else {
          /* halfwidth n */
          olo = 0xf3;
        }
      }
    }
    utext[wi] = ohi;
    utext[wi+1] = olo;
    wi += 2;
  }
  *sp = wi;
}
2755
2756
/* Canonicalize a text for search keys.
   `utext' specifies a text whose encoding is UTF-16BE.  It is modified in place.
   `size' specifies the size of the text.  A trailing odd byte is left untouched.
   `funcspc' specifies whether to allow functional space characters.  If it is false,
   characters below U+0020 are replaced by U+0020.
   Capital letters of ASCII, Latin-1 Supplement, Latin Extended-A, Greek and Cyrillic are
   converted into small letters, and characters from U+FFF0 to U+FFFF become U+0020. */
static void est_canonicalize_text(unsigned char *utext, int size, int funcspc){
  int i;
  for(i = 0; i < size - 1; i += 2){
    if(utext[i] == 0x0){
      if(utext[i+1] >= 'A' && utext[i+1] <= 'Z'){
        /* ascii */
        utext[i+1] += 'a' - 'A';
      } else if((utext[i+1] >= 0xc0 && utext[i+1] <= 0xd6) ||
                (utext[i+1] >= 0xd8 && utext[i+1] <= 0xde)){
        /* latin-1 supplement */
        utext[i+1] += 0x20;
      } else if(!funcspc && utext[i+1] < ' '){
        /* functional spaces */
        utext[i+1] = ' ';
      }
    } else if(utext[i] == 0x1){
      if((utext[i+1] <= 0x36 && utext[i+1] % 2 == 0) ||
         (utext[i+1] >= 0x39 && utext[i+1] <= 0x47 && utext[i+1] % 2 == 1) ||
         (utext[i+1] >= 0x4a && utext[i+1] <= 0x76 && utext[i+1] % 2 == 0) ||
         (utext[i+1] >= 0x79 && utext[i+1] <= 0x7d && utext[i+1] % 2 == 1)){
        /* latin extended-a: the small letter follows its capital */
        utext[i+1] += 0x1;
      } else if(utext[i+1] == 0x78){
        /* y with umlaut: its small letter is in latin-1 supplement */
        utext[i] = 0x0;
        utext[i+1] = 0xff;
      }
    } else if(utext[i] == 0x3){
      if(utext[i+1] >= 0x91 && utext[i+1] <= 0xa9){
        /* greek */
        utext[i+1] += 0x20;
      }
    } else if(utext[i] == 0x4){
      if(utext[i+1] >= 0x10 && utext[i+1] <= 0x2f){
        /* cyrillic */
        utext[i+1] += 0x20;
      } else if(utext[i+1] <= 0x0f){
        /* cyrillic with mark */
        utext[i+1] += 0x50;
      }
    } else if(utext[i] == 0xff){
      /* the low byte decides; testing the high byte would blank the whole U+FFxx row */
      if(utext[i+1] >= 0xf0){
        /* special */
        utext[i] = 0x0;
        utext[i+1] = ' ';
      }
    }
  }
}
2810
2811
2812 /* Categorize a character.
2813 `c' specifies the UCS number of a character.
2814 The return value is the category of the character. */
2815 static int est_char_category(int c){
2816 /* ascii space */
2817 if(c <= 0x0020) return ESTSPACECHR;
2818 /* ascii alnum */
2819 if((c >= 0x0030 && c <= 0x0039) || (c >= 0x0041 && c <= 0x005a) ||
2820 (c >= 0x0061 && c <= 0x007a)) return ESTWESTALPH;
2821 /* latin */
2822 if((c >= 0x00c0 && c <= 0x00ff && c != 0x00d7 && c != 0x00f7) || (c >= 0x0100 && c <= 0x017f))
2823 return ESTWESTALPH;
2824 /* arabic and syrian */
2825 if(c >= 0x0600 && c <= 0x08ff) return ESTEASTALPH;
2826 /* south and south east asia */
2827 if((c >= 0x0900 && c <= 0x109f) || (c >= 0x1700 && c <= 0x1cff)) return ESTEASTALPH;
2828 /* cjk */
2829 if((c >= 0x1100 && c <= 0x11ff) || (c >= 0x2e80 && c <= 0xd7af) ||
2830 (c >= 0xf900 && c <= 0xfaff) || (c >= 0xff00 && c <= 0xffef)) return ESTEASTALPH;
2831 /* asian presentation forms */
2832 if((c >= 0xfb50 && c <= 0xfdff) || (c >= 0xfe30 && c <= 0xfe4f) ||
2833 (c >= 0xfe70 && c <= 0xfeff)) return ESTEASTALPH;
2834 /* others */
2835 return ESTDELIMCHR;
2836 }
2837
2838
2839 /* Categorize a character for perfect N-gram analyzer.
2840 `c' specifies the UCS number of a character.
2841 The return value is the category of the character. */
2842 static int est_char_category_perfng(int c){
2843 if(c <= 0x0020) return ESTSPACECHR;
2844 return ESTEASTALPH;
2845 }
2846
2847
2848 /* Convert a simplified phrase into complete form.
2849 `sphrase' specifies a simplified phrase.
2850 The return value is the complete form of the phrase. */
2851 static char *est_phrase_from_thumb(const char *sphrase){
2852 CBDATUM *datum;
2853 const char *oper, *rp;
2854 unsigned char *utext;
2855 char *rtext;
2856 int size, quote;
2857 assert(sphrase);
2858 datum = cbdatumopen("", 0);
2859 utext = (unsigned char *)est_uconv_in(sphrase, strlen(sphrase), &size);
2860 est_normalize_text(utext, size, &size);
2861 est_canonicalize_text(utext, size, FALSE);
2862 rtext = est_uconv_out((char *)utext, size, NULL);
2863 cbstrsqzspc(rtext);
2864 quote = FALSE;
2865 oper = NULL;
2866 for(rp = rtext; *rp != '\0'; rp++){
2867 if(*rp == '"'){
2868 if(oper){
2869 cbdatumcat(datum, oper, -1);
2870 oper = NULL;
2871 }
2872 quote = !quote;
2873 continue;
2874 }
2875 if(quote){
2876 cbdatumcat(datum, rp, 1);
2877 continue;
2878 }
2879 switch(*rp){
2880 case ' ':
2881 if(!oper) oper = " AND ";
2882 break;
2883 case '&':
2884 oper = " AND ";
2885 break;
2886 case '|':
2887 oper = " OR ";
2888 break;
2889 case '!':
2890 oper = " ANDNOT ";
2891 break;
2892 default:
2893 if(oper){
2894 cbdatumcat(datum, oper, -1);
2895 oper = NULL;
2896 }
2897 cbdatumcat(datum, rp, 1);
2898 }
2899 }
2900 free(rtext);
2901 free(utext);
2902 return cbdatumtomalloc(datum, NULL);
2903 }
2904
2905
2906 /* Add a string to a snippet.
2907 `rtext' specifies a raw text.
2908 `ctext' specifies a canonicalized text.
2909 `size' specifies the size of the raw text and the canonicalized text.
2910 `awsiz' specifies the size of allowance for matching words.
2911 `res' specifies a datum object for the result.
2912 `rwords' specifies a list object of raw words. */
2913 static void est_snippet_add_text(const unsigned char *rtext, const unsigned char *ctext,
2914 int size, int awsiz, CBDATUM *res, const CBLIST *rwords){
2915 const unsigned char *rword;
2916 char *orig;
2917 int i, j, bi, rwsiz, step, osiz;
2918 bi = 0;
2919 for(i = 0; i < size; i += 2){
2920 for(j = 0; j < CB_LISTNUM(rwords); j++){
2921 rword = (unsigned char *)CB_LISTVAL2(rwords, j, &rwsiz);
2922 if((step = est_str_fwmatch_wide(ctext + i, size + awsiz - i, rword, rwsiz)) > 0){
2923 if(i - bi > 0){
2924 orig = est_uconv_out((char *)rtext + bi, i - bi, &osiz);
2925 cbdatumcat(res, orig, osiz);
2926 cbdatumcat(res, "\n", 1);
2927 free(orig);
2928 }
2929 orig = est_uconv_out((char *)rtext + i, step, &osiz);
2930 cbdatumcat(res, orig, osiz);
2931 free(orig);
2932 cbdatumcat(res, "\t", 1);
2933 orig = est_uconv_out((char *)rword, rwsiz, &osiz);
2934 cbdatumcat(res, orig, osiz);
2935 free(orig);
2936 cbdatumcat(res, "\n", 1);
2937 bi = i + step;
2938 i = bi - 2;
2939 break;
2940 }
2941 }
2942 }
2943 if(i - bi > 0){
2944 orig = est_uconv_out((char *)rtext + bi, i - bi, &osiz);
2945 cbdatumcat(res, orig, osiz);
2946 cbdatumcat(res, "\n", 1);
2947 free(orig);
2948 }
2949 }
2950
2951
/* Check whether a string begins with a key.
   `str' specifies a target string whose encoding is UTF-16BE.
   `size' specifies the size of the target string.
   `key' specifies a key string whose encoding is UTF-16BE.
   `ksiz' specifies the size of the key string.
   The return value is the size in bytes of the corresponding part of the target string, or 0
   if the target string does not begin with the key.  Characters of U+0020 and below are
   skipped on both sides, except that a target string beginning with one never matches. */
static int est_str_fwmatch_wide(const unsigned char *str, int size,
                                const unsigned char *key, int ksiz){
  int spos, kpos;
  assert(str && size >= 0 && key && ksiz >= 0);
  if(size < 2 || ksiz < 2) return 0;
  if(str[0] == 0x0 && str[1] <= 0x20) return 0;
  spos = 0;
  kpos = 0;
  while(kpos < ksiz){
    /* the target ran out before the key did */
    if(spos >= size) return 0;
    if(str[spos] == 0x0 && str[spos+1] <= 0x20){
      spos += 2;
    } else if(key[kpos] == 0x0 && key[kpos+1] <= 0x20){
      kpos += 2;
    } else if(str[spos] == key[kpos] && str[spos+1] == key[kpos+1]){
      spos += 2;
      kpos += 2;
    } else {
      return 0;
    }
  }
  return spos;
}
2983
2984
2985 /* Open the inverted index.
2986 `name' specifies the name of a directory.
2987 `omode' specifies an open mode of Villa.
2988 `dnum' specifies the number of database files.
2989 The return value is a database object of the database. */
2990 static ESTIDX *est_idx_open(const char *name, int omode, int dnum){
2991 ESTIDX *idx;
2992 CBLIST *files;
2993 char path[ESTPATHBUFSIZ];
2994 int i;
2995 assert(name && dnum > 0);
2996 if(dnum > ESTIDXDMAX) dnum = ESTIDXDMAX;
2997 CB_MALLOC(idx, sizeof(ESTIDX));
2998 if((omode & VL_OCREAT) && !est_mkdir(name) && errno != EEXIST) return NULL;
2999 if((omode & VL_OTRUNC) && (files = cbdirlist(name)) != NULL){
3000 for(i = 0; i < CB_LISTNUM(files); i++){
3001 sprintf(path, "%s%c%s", name, ESTPATHCHR, CB_LISTVAL(files, i, NULL));
3002 unlink(path);
3003 }
3004 cblistclose(files);
3005 }
3006 for(i = 0; i < dnum; i++){
3007 sprintf(path, "%s%c%04d", name, ESTPATHCHR, i + 1);
3008 if(!(idx->dbs[i] = vlopen(path, omode, VL_CMPLEX))){
3009 while(--i >= 0){
3010 vlclose(idx->dbs[i]);
3011 }
3012 return NULL;
3013 }
3014 }
3015 idx->name = cbmemdup(name, -1);
3016 idx->omode = omode;
3017 idx->dnum = dnum;
3018 idx->cdb = idx->dbs[dnum-1];
3019 return idx;
3020 }
3021
3022
3023 /* Close the inverted index.
3024 `idx' specifies an object of the inverted index.
3025 The return value is true if success, else it is false. */
3026 static int est_idx_close(ESTIDX *idx){
3027 int i, err;
3028 assert(idx);
3029 err = FALSE;
3030 for(i = 0; i < idx->dnum; i++){
3031 if(!vlclose(idx->dbs[i])) err = TRUE;
3032 }
3033 free(idx->name);
3034 free(idx);
3035 return err ? FALSE : TRUE;
3036 }
3037
3038
3039 /* Set the tuning parameters of the inverted index.
3040 `idx' specifies an object of the inverted index.
3041 Other parameters are same with `vlsettuning' of Villa. */
3042 static void est_idx_set_tuning(ESTIDX *idx, int lrecmax, int nidxmax, int lcnum, int ncnum){
3043 int i;
3044 assert(idx);
3045 for(i = 0; i < idx->dnum; i++){
3046 vlsettuning(idx->dbs[i], lrecmax, nidxmax, lcnum, ncnum);
3047 }
3048 }
3049
3050
3051 /* Increment the inverted index.
3052 `idx' specifies an object of the inverted index. */
3053 static void est_idx_increment(ESTIDX *idx){
3054 char path[ESTPATHBUFSIZ];
3055 if(idx->dnum >= ESTIDXDMAX){
3056 est_idx_set_current(idx);
3057 return;
3058 }
3059 sprintf(path, "%s%c%04d", idx->name, ESTPATHCHR, idx->dnum + 1);
3060 if((idx->dbs[idx->dnum] = vlopen(path, idx->omode | VL_OCREAT | VL_OTRUNC, VL_CMPLEX)) != NULL){
3061 idx->cdb = idx->dbs[idx->dnum];
3062 idx->dnum++;
3063 }
3064 }
3065
3066
3067 /* Add a record to the inverted index.
3068 `idx' specifies an object of the inverted index.
3069 `word' specifies a word.
3070 `vbuf' specifies the pointer to the value of a record.
3071 `vsiz' specifies the size of the value.
3072 The return value is true if success, else it is false. */
3073 static int est_idx_add(ESTIDX *idx, const char *word, int wsiz, const char *vbuf, int vsiz){
3074 assert(idx && word && wsiz >= 0 && vbuf && vsiz >= 0);
3075 return vlput(idx->cdb, word, wsiz, vbuf, vsiz, VL_DDUP);
3076 }
3077
3078
3079 /* Remove a record from the inverted index.
3080 `idx' specifies an object of the inverted index.
3081 `word' specifies a word.
3082 `wsiz' specifies the size of the word.
3083 The return value is true if success, else it is false. Even if no item correspongs, it is
3084 success. */
3085 static int est_idx_out(ESTIDX *idx, const char *word, int wsiz){
3086 int i, err;
3087 assert(idx && word && wsiz >= 0);
3088 err = FALSE;
3089 for(i = 0; i < idx->dnum; i++){
3090 if(!vloutlist(idx->dbs[i], word, wsiz) && dpecode != DP_ENOITEM) err = TRUE;
3091 }
3092 return err ? FALSE : TRUE;
3093 }
3094
3095
3096 /* Get a record from the inverted index.
3097 `idx' specifies an object of the inverted index.
3098 `word' specifies a word.
3099 `wsiz' specifies the size of the word.
3100 `sp' specifies the pointer to a variable to which the size of the region of the return value
3101 is assigned.
3102 The return value is the pointer to the region of the value of the corresponding record.
3103 if no item correspongs, empty region is returned. */
3104 static char *est_idx_get(ESTIDX *idx, const char *word, int wsiz, int *sp){
3105 CBDATUM *datum;
3106 char *vbuf;
3107 int i, vsiz;
3108 assert(idx && word && wsiz >= 0 && sp);
3109 datum = cbdatumopen("", 0);
3110 for(i = 0; i < idx->dnum; i++){
3111 if(!(vbuf = vlgetcat(idx->dbs[i], word, wsiz, &vsiz))) continue;
3112 cbdatumcat(datum, vbuf, vsiz);
3113 free(vbuf);
3114 }
3115 return cbdatumtomalloc(datum, sp);
3116 }
3117
3118
3119 /* Get the size of the value of a record in the inverted index.
3120 `idx' specifies an object of the inverted index.
3121 `word' specifies a word.
3122 `wsiz' specifies the size of the word.
3123 The return value is the size of the value of the corresponding record.
3124 if no item correspongs, 0 is returned. */
3125 static int est_idx_vsiz(ESTIDX *idx, const char *word, int wsiz){
3126 char *vbuf;
3127 int i, sum, vsiz;
3128 assert(idx && word && wsiz >= 0);
3129 sum = 0;
3130 for(i = 0; i < idx->dnum; i++){
3131 if(!(vbuf = vlgetcat(idx->dbs[i], word, wsiz, &vsiz))) continue;
3132 sum += vsiz;
3133 free(vbuf);
3134 }
3135 return sum;
3136 }
3137
3138
3139 /* Get the number of division of the inverted index.
3140 `idx' specifies an object of the inverted index.
3141 The return value is the number of division of the inverted index. */
3142 static int est_idx_num(ESTIDX *idx){
3143 assert(idx);
3144 return idx->dnum;
3145 }
3146
3147
3148 /* Get the size of the inverted index.
3149 `idx' specifies an object of the inverted index.
3150 The return value is the size of the inverted index. */
3151 static int est_idx_size(ESTIDX *idx){
3152 int i, size;
3153 assert(idx);
3154 size = 0;
3155 for(i = 0; i < idx->dnum; i++){
3156 size += vlfsiz(idx->dbs[i]);
3157 }
3158 return size;
3159 }
3160
3161
3162 /* Syncronize the inverted index.
3163 `idx' specifies an object of the inverted index.
3164 The return value is the size of the inverted index. */
3165 static int est_idx_sync(ESTIDX *idx){
3166 int i;
3167 assert(idx);
3168 for(i = 0; i < idx->dnum; i++){
3169 if(!vlsync(idx->dbs[i])) return FALSE;
3170 }
3171 return TRUE;
3172 }
3173
3174
3175 /* Optimize the inverted index.
3176 `idx' specifies an object of the inverted index.
3177 The return value is the size of the inverted index. */
3178 static int est_idx_optimize(ESTIDX *idx){
3179 int i;
3180 assert(idx);
3181 for(i = 0; i < idx->dnum; i++){
3182 if(!vloptimize(idx->dbs[i])) return FALSE;
3183 }
3184 return TRUE;
3185 }
3186
3187
3188 /* Set the current database to the smallest one in the inverted index.
3189 `idx' specifies an object of the inverted index. */
3190 static void est_idx_set_current(ESTIDX *idx){
3191 int i, size, min;
3192 assert(idx);
3193 min = vlfsiz(idx->cdb);
3194 for(i = 0; i < idx->dnum; i++){
3195 if((size = vlfsiz(idx->dbs[i])) < min){
3196 idx->cdb = idx->dbs[i];
3197 min = size;
3198 }
3199 }
3200 }
3201
3202
3203 /* Write meta data to the database.
3204 `db' specifies a database object.
3205 The return value is true if success, else it is false. */
3206 static int est_db_write_meta(ESTDB *db){
3207 char vbuf[ESTNUMBUFSIZ], *sbuf;
3208 int err, ssiz;
3209 assert(db);
3210 err = FALSE;
3211 sprintf(vbuf, "%d", est_idx_num(db->idxdb));
3212 if(!dpput(db->metadb, ESTKEYIDXNUM, -1, vbuf, -1, DP_DOVER)) err = TRUE;
3213 sprintf(vbuf, "%d", db->dseq);
3214 if(!dpput(db->metadb, ESTKEYDSEQ, -1, vbuf, -1, DP_DOVER)) err = TRUE;
3215 sprintf(vbuf, "%d", db->dnum);
3216 if(!dpput(db->metadb, ESTKEYDNUM, -1, vbuf, -1, DP_DOVER)) err = TRUE;
3217 sprintf(vbuf, "%d", db->amode);
3218 if(!dpput(db->metadb, ESTKEYAMODE, -1, vbuf, -1, DP_DOVER)) err = TRUE;
3219 if(db->metacc){
3220 sbuf = cbmapdump(db->metacc, &ssiz);
3221 if(!dpput(db->metadb, ESTKEYMETA, -1, sbuf, ssiz, DP_DOVER)) err = TRUE;
3222 free(sbuf);
3223 }
3224 if(err){
3225 db->ecode = ESTEDB;
3226 db->fatal = TRUE;
3227 }
3228 return err ? FALSE : TRUE;
3229 }
3230
3231
3232 /* Call the callback function of a database.
3233 `db' specifies a database object.
3234 `info' specifies an extra message. */
3235 static void est_db_inform(ESTDB *db, const char *info){
3236 char *msg;
3237 assert(db);
3238 if(!db->cbinfo) return;
3239 msg = cbsprintf("%s: name=%s dnum=%d wnum=%d fsiz=%.0f crnum=%d csiz=%.0f",
3240 info, db->name, db->dnum, vlrnum(db->fwmdb), (double)est_db_size(db),
3241 cbmaprnum(db->idxcc), (double)est_db_used_cache_size(db));
3242 db->cbinfo(msg);
3243 free(msg);
3244 }
3245
3246
3247 /* Get the size of used cache region.
3248 `db' specifies a database object.
3249 The return value is the size of used cache region. */
3250 static int est_db_used_cache_size(ESTDB *db){
3251 assert(db);
3252 return (db->icsiz + cbmaprnum(db->idxcc) * (sizeof(CBMAPDATUM) + ESTWORDAVGLEN)) * ESTMEMIRATIO;
3253 }
3254
3255
3256 /* Prepare cache for meta data.
3257 `db' specifies a database object. */
3258 static void est_db_prepare_meta(ESTDB *db){
3259 char *sbuf;
3260 int ssiz;
3261 assert(db);
3262 if((sbuf = dpget(db->metadb, ESTKEYMETA, -1, 0, -1, &ssiz)) != NULL){
3263 db->metacc = cbmapload(sbuf, ssiz);
3264 free(sbuf);
3265 } else {
3266 db->metacc = cbmapopenex(ESTMINIBNUM);
3267 }
3268 }
3269
3270
3271 /* Create a list of terms for search.
3272 `phrase' specifies a search phrase.
3273 The return value is a list object of the terms of the phrase. */
3274 static CBLIST *est_phrase_terms(const char *phrase){
3275 CBLIST *terms, *elems;
3276 CBDATUM *datum;
3277 const char *elem;
3278 char *tbuf, *pbuf;
3279 int i, tsiz, psiz, lw;
3280 assert(phrase);
3281 terms = cblistopen();
3282 tbuf = est_uconv_in(phrase, strlen(phrase), &tsiz);
3283 est_normalize_text((unsigned char *)tbuf, tsiz, &tsiz);
3284 pbuf = est_uconv_out(tbuf, tsiz, &psiz);
3285 elems = cbsplit(pbuf, psiz, "\a\b\t\n\v\f\r ");
3286 datum = cbdatumopen("", 0);
3287 lw = FALSE;
3288 for(i = 0; i < CB_LISTNUM(elems); i++){
3289 elem = CB_LISTVAL(elems, i, NULL);
3290 if(elem[0] == '\0') continue;
3291 if(!strcmp(elem, ESTOPUNION)){
3292 if(CB_DATUMSIZE(datum) < 1) continue;
3293 if(lw) cbdatumcat(datum, "\t", -1);
3294 lw = FALSE;
3295 } else if(!strcmp(elem, ESTOPISECT) || !strcmp(elem, ESTOPDIFF)){
3296 if(CB_DATUMSIZE(datum) < 1) continue;
3297 cblistpush(terms, CB_DATUMPTR(datum), CB_DATUMSIZE(datum));
3298 cbdatumsetsize(datum, 0);
3299 cblistpush(terms, elem, -1);
3300 lw = FALSE;
3301 } else {
3302 if(CB_DATUMSIZE(datum) > 0 && lw) cbdatumcat(datum, " ", 1);
3303 cbdatumcat(datum, elem, -1);
3304 lw = TRUE;
3305 }
3306 }
3307 if(CB_DATUMSIZE(datum) > 0) cblistpush(terms, CB_DATUMPTR(datum), CB_DATUMSIZE(datum));
3308 cbdatumclose(datum);
3309 cblistclose(elems);
3310 free(pbuf);
3311 free(tbuf);
3312 for(i = 0; i < CB_LISTNUM(terms); i++){
3313 elem = CB_LISTVAL(terms, i, NULL);
3314 if(!strcmp(elem, ESTOPUVSET) || !strcmp(elem, ESTOPISECT) ||
3315 !strcmp(elem, ESTOPDIFF)) continue;
3316 tbuf = est_uconv_in(elem, strlen(elem), &tsiz);
3317 est_canonicalize_text((unsigned char *)tbuf, tsiz, TRUE);
3318 pbuf = est_uconv_out(tbuf, tsiz, &psiz);
3319 cbstrtrim(pbuf);
3320 cblistover(terms, i, pbuf, -1);
3321 free(pbuf);
3322 free(tbuf);
3323 }
3324 for(i = CB_LISTNUM(terms) - 1; i >= 0; i--){
3325 elem = CB_LISTVAL(terms, i, NULL);
3326 if(strcmp(elem, ESTOPISECT) && strcmp(elem, ESTOPDIFF)) break;
3327 free(cblistpop(terms, NULL));
3328 }
3329 return terms;
3330 }
3331
3332
3333 /* Compare two scores by each ID.
3334 `ap' specifies the pointer to one score.
3335 `bp' specifies the pointer to the other score.
3336 The return value is negative if one is small, positive if one is big, 0 if both are equal. */
3337 static int est_score_compare_by_id(const void *ap, const void *bp){
3338 assert(ap && bp);
3339 return ((ESTSCORE *)ap)->id - ((ESTSCORE *)bp)->id;
3340 }
3341
3342
3343 /* Compare two scores by each score point.
3344 `ap' specifies the pointer to one score.
3345 `bp' specifies the pointer to the other score.
3346 The return value is negative if one is small, positive if one is big, 0 if both are equal. */
3347 static int est_score_compare_by_score(const void *ap, const void *bp){
3348 assert(ap && bp);
3349 return ((ESTSCORE *)bp)->score - ((ESTSCORE *)ap)->score;
3350 }
3351
3352
3353 /* Compare two scores by attributes of strings for ascending order.
3354 `ap' specifies the pointer to one score.
3355 `bp' specifies the pointer to the other score.
3356 The return value is negative if one is small, positive if one is big, 0 if both are equal. */
3357 static int est_score_compare_by_str_asc(const void *ap, const void *bp){
3358 assert(ap && bp);
3359 return strcmp(((ESTSCORE *)ap)->value, ((ESTSCORE *)bp)->value);
3360 }
3361
3362
3363 /* Compare two scores by attributes of strings for descending order.
3364 `ap' specifies the pointer to one score.
3365 `bp' specifies the pointer to the other score.
3366 The return value is negative if one is small, positive if one is big, 0 if both are equal. */
3367 static int est_score_compare_by_str_desc(const void *ap, const void *bp){
3368 assert(ap && bp);
3369 return strcmp(((ESTSCORE *)bp)->value, ((ESTSCORE *)ap)->value);
3370 }
3371
3372
3373 /* Compare two scores by attributes of numbers for ascending order.
3374 `ap' specifies the pointer to one score.
3375 `bp' specifies the pointer to the other score.
3376 The return value is negative if one is small, positive if one is big, 0 if both are equal. */
3377 static int est_score_compare_by_num_asc(const void *ap, const void *bp){
3378 assert(ap && bp);
3379 return (time_t)((ESTSCORE *)ap)->value - (time_t)((ESTSCORE *)bp)->value;
3380 }
3381
3382
3383 /* Compare two scores by attributes of numbers for descending order.
3384 `ap' specifies the pointer to one score.
3385 `bp' specifies the pointer to the other score.
3386 The return value is negative if one is small, positive if one is big, 0 if both are equal. */
3387 static int est_score_compare_by_num_desc(const void *ap, const void *bp){
3388 assert(ap && bp);
3389 return (time_t)((ESTSCORE *)bp)->value - (time_t)((ESTSCORE *)ap)->value;
3390 }
3391
3392
3393 /* Get the universal set of documents in a database.
3394 `db' specifies a database object.
3395 `nump' specifies the pointer to which the number of elements in the result is assigned.
3396 `hints' specifies a list object. If it is `NULL', it is not used.
3397 `add' specifies whether the result to be treated in union or difference.
3398 The return value is an array whose elements are ID numbers of corresponding documents. */
3399 static ESTSCORE *est_search_uvset(ESTDB *db, int *nump, CBMAP *hints, int add){
3400 ESTSCORE *scores;
3401 char *vbuf, numbuf[ESTNUMBUFSIZ];
3402 int snum, smax;
3403 assert(db && nump);
3404 smax = ESTALLOCUNIT;
3405 CB_MALLOC(scores, smax * sizeof(ESTSCORE));
3406 snum = 0;
3407 vlcurfirst(db->listdb);
3408 while((vbuf = vlcurval(db->listdb, NULL)) != NULL){
3409 if(snum >= smax){
3410 smax *= 2;
3411 CB_REALLOC(scores, smax * sizeof(ESTSCORE));
3412 }
3413 scores[snum].id = atoi(vbuf);
3414 scores[snum].score = 0;
3415 snum++;
3416 free(vbuf);
3417 vlcurnext(db->listdb);
3418 }
3419 *nump = snum;
3420 if(hints){
3421 sprintf(numbuf, "%d", snum * (add ? 1 : -1));
3422 cbmapput(hints, ESTOPUVSET, -1, numbuf, -1, FALSE);
3423 }
3424 return scores;
3425 }
3426
3427
3428 /* Expand a word to words which begins with it.
3429 `db' specifies a database object.
3430 `word' specifies a word.
3431 `list' specifies a list object to contain the results. */
3432 static void est_expand_word(ESTDB *db, const char *word, CBLIST *list){
3433 char *kbuf;
3434 int ksiz;
3435 assert(db && word && list);
3436 vlcurjump(db->fwmdb, word, -1, VL_JFORWARD);
3437 while((kbuf = vlcurkey(db->fwmdb, &ksiz)) != NULL){
3438 if(!cbstrfwmatch(kbuf, word)){
3439 free(kbuf);
3440 break;
3441 }
3442 cblistpushbuf(list, kbuf, ksiz);
3443 vlcurnext(db->fwmdb);
3444 }
3445 }
3446
3447
3448 /* Get a corresponding set of documents in a database.
3449 `db' specifies a database object.
3450 `term' specifies a union term.
3451 `gstep' specifies number of steps of N-gram.
3452 `nump' specifies the pointer to which the number of elements in the result is assigned.
3453 `hints' specifies a map object. If it is `NULL', it is not used.
3454 `add' specifies whether the result to be treated in union or difference.
3455 The return value is an array whose elements are ID numbers of corresponding documents. */
/* Implementation notes.
   `term' is split by tab characters and each piece (word) is searched on its own.  The hits
   of every word are gathered into one array, which is finally sorted by ID and merged so that
   each ID appears once with the average of its scores.  The returned array is allocated here
   and is left to the caller. */
3456 static ESTSCORE *est_search_union(ESTDB *db, const char *term, int gstep,
3457 int *nump, CBMAP *hints, int add){
3458 ESTSCORE *scores, *tscores;
3459 CBLIST *words, *grams;
3460 const char *word, *gram, *rp, *fnext, *snext, *cbuf;
3461 char *vbuf, numbuf[ESTNUMBUFSIZ];
3462 int i, j, k, snum, smax, single, tsmax, tsnum, vsiz, gcnum, gsiz, csiz, wgstep, nnum;
3463 int mfsiz, mssiz, mfhash, mshash, tfhash, tshash, id, score, hit, hnum;
3464 assert(db && term && gstep > 0 && nump);
/* `scores' gathers the hits of all words; it grows by doubling */
3465 smax = ESTALLOCUNIT;
3466 CB_MALLOC(scores, smax * sizeof(ESTSCORE));
3467 snum = 0;
3468 words = cbsplit(term, -1, "\t");
3469 for(i = 0; i < CB_LISTNUM(words); i++){
3470 word = CB_LISTVAL(words, i, NULL);
/* break the word into grams according to the analysis mode of the database */
3471 grams = cblistopen();
3472 switch(db->amode){
3473 case ESTAMPERFNG:
3474 est_break_text_perfng(word, grams, TRUE, FALSE);
3475 break;
3476 default:
3477 est_break_text(word, grams, TRUE, FALSE);
3478 break;
3479 }
/* if the word yields no gram, the keys beginning with the word are used instead; in this
   `single' mode every record counts as a hit and no gram has to be shared */
3480 single = FALSE;
3481 if(CB_LISTNUM(grams) < 1){
3482 est_expand_word(db, word, grams);
3483 single = TRUE;
3484 }
/* `tscores' gathers the hits of this word only */
3485 tsmax = ESTALLOCUNIT;
3486 CB_MALLOC(tscores, tsmax * sizeof(ESTSCORE));
3487 tsnum = 0;
3488 gcnum = 0;
/* `wgstep' is the stride over the grams: `gstep' when there are more than two grams or
   `gstep' is more than 2, else 1 */
3489 wgstep = CB_LISTNUM(grams) > 2 || gstep > 2 ? gstep : 1;
/* NOTE(review): a first byte not more than 0xdf presumably means a character of one or two
   bytes in UTF-8 -- confirm; such words are scanned gram by gram unless `gstep' is over 2 */
3490 if(((unsigned char *)word)[0] <= 0xdf && gstep <= 2) wgstep = 1;
3491 for(j = 0; j < CB_LISTNUM(grams); j += wgstep){
/* `gcnum' counts the grams which are actually looked up */
3492 gcnum++;
3493 gram = CB_LISTVAL2(grams, j, &gsiz);
/* hashes of the next gram and the one after it; 0xff stands for "no such gram" */
3494 fnext = j < CB_LISTNUM(grams) - 1 ? CB_LISTVAL2(grams, j + 1, &mfsiz) : NULL;
3495 snext = j < CB_LISTNUM(grams) - 2 ? CB_LISTVAL2(grams, j + 2, &mssiz) : NULL;
3496 mfhash = fnext ? dpinnerhash(fnext, mfsiz) % ESTJHASHNUM + 1: 0xff;
3497 mshash = snext ? dpouterhash(snext, mssiz) % ESTJHASHNUM + 1: 0xff;
/* the records of the gram come from the index files and, appended after them, from the
   index cache */
3498 vbuf = est_idx_get(db->idxdb, gram, gsiz, &vsiz);
3499 if((cbuf = cbmapget(db->idxcc, gram, gsiz, &csiz)) != NULL){
3500 if(vbuf){
3501 CB_REALLOC(vbuf, vsiz + csiz + 100);
3502 memcpy(vbuf + vsiz, cbuf, csiz);
3503 vsiz += csiz;
3504 } else {
3505 vbuf = cbmemdup(cbuf, csiz);
3506 vsiz = csiz;
3507 }
3508 }
3509 if(!vbuf) continue;
/* each record is read as: an int ID (copied with memcpy), one byte of score, then pairs of
   hash bytes; the pairs end when the byte following a pair is 0x00 */
3510 rp = vbuf;
3511 while(rp < vbuf + vsiz){
3512 memcpy(&id, rp, sizeof(int));
3513 rp += sizeof(int);
3514 score = *(unsigned char *)rp;
3515 rp++;
/* with no following gram to check, the record is a hit by itself */
3516 hit = mfhash == 0xff && mshash == 0xff;
3517 while(rp < vbuf + vsiz){
3518 tfhash = *(unsigned char *)rp;
3519 rp++;
3520 tshash = *(unsigned char *)rp;
3521 rp++;
/* a pair hits when it agrees with the hashes of the following grams (0xff matches any) */
3522 if((mfhash == 0xff || mfhash == tfhash) && (mshash == 0xff || mshash == tshash))
3523 hit = TRUE;
/* NOTE(review): `rp' is read here without a bound check, so a region whose last record lacks
   the 0x00 terminator would be read past `vbuf + vsiz' -- confirm the writer guarantees it */
3524 if(*(unsigned char *)rp == 0x00){
3525 rp++;
3526 break;
3527 }
3528 }
3529 if(hit || single){
3530 if(tsnum >= tsmax){
3531 tsmax *= 2;
3532 CB_REALLOC(tscores, tsmax * sizeof(ESTSCORE));
3533 }
3534 tscores[tsnum].id = id;
3535 tscores[tsnum].score = score * 100;
3536 tsnum++;
3537 }
3538 }
3539 free(vbuf);
3540 }
/* when several grams were looked up, keep only the IDs which hit for every one of them
   (or all IDs in `single' mode) and average their scores; the result is packed in place */
3541 if(gcnum > 1){
3542 qsort(tscores, tsnum, sizeof(ESTSCORE), est_score_compare_by_id);
3543 nnum = 0;
3544 for(j = 0; j < tsnum; j++){
3545 id = tscores[j].id;
3546 score = tscores[j].score;
3547 hnum = 1;
3548 for(k = j + 1; k < tsnum && tscores[k].id == id; k++){
3549 score += tscores[k].score;
3550 hnum++;
3551 }
3552 if(hnum >= gcnum || single){
3553 tscores[nnum].id = id;
3554 tscores[nnum].score = score / hnum;
3555 nnum++;
3556 }
/* skip the run of the same ID */
3557 j = k - 1;
3558 }
3559 tsnum = nnum;
3560 }
/* record the number of hits of the word, negated when the result is for difference */
3561 if(hints){
3562 sprintf(numbuf, "%d", tsnum * (add ? 1 : -1));
3563 cbmapput(hints, word, -1, numbuf, -1, FALSE);
3564 }
/* append the hits of the word to the overall array */
3565 for(j = 0; j < tsnum; j++){
3566 if(snum >= smax){
3567 smax *= 2;
3568 CB_REALLOC(scores, smax * sizeof(ESTSCORE));
3569 }
3570 scores[snum].id = tscores[j].id;
3571 scores[snum].score = tscores[j].score;
3572 snum++;
3573 }
3574 free(tscores);
3575 cblistclose(grams);
3576 }
3577 cblistclose(words);
/* merge the hits of all words: one element per ID with the average of its scores */
3578 qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_id);
3579 nnum = 0;
3580 for(i = 0; i < snum; i++){
3581 id = scores[i].id;
3582 score = scores[i].score;
3583 hnum = 1;
3584 for(j = i + 1; j < snum && scores[j].id == id; j++){
3585 score += scores[j].score;
3586 hnum++;
3587 }
3588 scores[nnum].id = id;
3589 scores[nnum].score = score / hnum;
3590 nnum++;
3591 i = j - 1;
3592 }
3593 *nump = nnum;
3594 return scores;
3595 }
3596
3597
3598 /* Narrow and sort scores of search candidates.
3599 `db' specifies a database object.
3600 `attrs' specifies a list object of narrowing attributes.
3601 `order' specifies an expression for sorting.
3602 `scores' specifies an array of scores of search candidates.
3603 `snum' specifies the number of the array.
3604 The return value is the new number of the array. */
/* Implementation notes.
   The function works in three phases: (1) parse `order' and the attribute expressions,
   (2) drop the candidates which miss any condition, packing `scores' in place, and (3) load
   the ordering attribute of the survivors and sort them.  The `value' member of every element
   is used as scratch space and is released or cleared before returning. */
3605 static int est_narrow_scores(ESTDB *db, const CBLIST *attrs, const char *order,
3606 ESTSCORE *scores, int snum){
3607 ESTCATTR *list;
3608 const char *otype, *cbuf, *rp, *pv, *ibuf;
3609 unsigned char *utmp;
3610 char *oname, *wp, *mbuf, *vbuf;
3611 int i, j, k, ci, oi, anum, tsiz, nnum, csiz, msiz, miss, vsiz, num, isiz, onlen;
3612 time_t tval;
3613 assert(db && scores && snum >= 0);
/* `ci' is the index of the condition on the cached attribute, `oi' is the index of the
   condition on the ordering attribute; -1 means none */
3614 ci = -1;
3615 oi = -1;
3616 oname = NULL;
3617 otype = NULL;
/* split the order expression at the first space into the attribute name and the order type;
   the type defaults to ESTORDSTRA and points into `oname' when given */
3618 if(order){
3619 oname = cbmemdup(order, -1);
3620 cbstrtrim(oname);
3621 otype = ESTORDSTRA;
3622 if((wp = strchr(oname, ' ')) != NULL){
3623 *wp = '\0';
3624 rp = wp + 1;
3625 while(*rp == ' '){
3626 rp++;
3627 }
3628 otype = rp;
3629 }
3630 }
3631 if(attrs){
/* parse each expression as "name operator value", separated by spaces; the operator and
   the value may be absent and are replaced by empty strings then */
3632 anum = CB_LISTNUM(attrs);
3633 CB_MALLOC(list, sizeof(ESTCATTR) * anum + 1);
3634 for(i = 0; i < anum; i++){
3635 list[i].name = NULL;
3636 list[i].oper = NULL;
3637 list[i].val = NULL;
3638 rp = CB_LISTVAL(attrs, i, NULL);
3639 while(*rp > 0 && *rp <= ' '){
3640 rp++;
3641 }
3642 if((pv = strchr(rp, ' ')) != NULL){
3643 list[i].nsiz = pv - rp;
3644 list[i].name = cbmemdup(rp, list[i].nsiz);
3645 rp = pv;
3646 while(*rp > 0 && *rp <= ' '){
3647 rp++;
3648 }
3649 if((pv = strchr(rp, ' ')) != NULL){
3650 list[i].oper = cbmemdup(rp, pv - rp);
3651 rp = pv;
3652 while(*rp > 0 && *rp <= ' '){
3653 rp++;
3654 }
3655 list[i].vsiz = strlen(rp);
3656 list[i].val = cbmemdup(rp, list[i].vsiz);
3657 } else {
3658 list[i].oper = cbmemdup(rp, -1);
3659 }
3660 } else {
3661 list[i].nsiz = strlen(rp);
3662 list[i].name = cbmemdup(rp, list[i].nsiz);
3663 }
3664 if(!list[i].oper){
3665 list[i].oper = cbmemdup("", 0);
3666 }
3667 if(!list[i].val){
3668 list[i].vsiz = 0;
3669 list[i].val = cbmemdup("", 0);
3670 }
3671 }
/* decode each operator: a leading '!' negates the condition, a following 'I' or 'i' asks for
   a normalized and canonicalized copy of the value in `sval', and the rest selects `cop' */
3672 for(i = 0; i < anum; i++){
3673 rp = list[i].oper;
3674 if(*rp == '!'){
3675 list[i].sign = FALSE;
3676 rp++;
3677 } else {
3678 list[i].sign = TRUE;
3679 }
3680 if(*rp == 'I' || *rp == 'i'){
3681 utmp = (unsigned char *)est_uconv_in(list[i].val, list[i].vsiz, &tsiz);
3682 est_normalize_text(utmp, tsiz, &tsiz);
3683 est_canonicalize_text(utmp, tsiz, FALSE);
3684 list[i].sval = (char *)est_uconv_out((char *)utmp, tsiz, &(list[i].ssiz));
3685 free(utmp);
3686 rp++;
3687 } else {
3688 list[i].sval = NULL;
3689 list[i].ssiz = 0;
3690 }
/* the numeric form of the value is prepared for every condition, whatever the operator */
3691 list[i].num = cbstrmktime(list[i].val);
/* `cop' is set to one of the operator constants, or NULL for an unknown operator */
3692 if(!cbstricmp(rp, ESTOPSTREQ)){
3693 list[i].cop = ESTOPSTREQ;
3694 } else if(!cbstricmp(rp, ESTOPSTRNE)){
3695 list[i].cop = ESTOPSTRNE;
3696 } else if(!cbstricmp(rp, ESTOPSTRINC)){
3697 list[i].cop = ESTOPSTRINC;
3698 } else if(!cbstricmp(rp, ESTOPSTRBW)){
3699 list[i].cop = ESTOPSTRBW;
3700 } else if(!cbstricmp(rp, ESTOPSTREW)){
3701 list[i].cop = ESTOPSTREW;
3702 } else if(!cbstricmp(rp, ESTOPNUMEQ)){
3703 list[i].cop = ESTOPNUMEQ;
3704 } else if(!cbstricmp(rp, ESTOPNUMNE)){
3705 list[i].cop = ESTOPNUMNE;
3706 } else if(!cbstricmp(rp, ESTOPNUMGT)){
3707 list[i].cop = ESTOPNUMGT;
3708 } else if(!cbstricmp(rp, ESTOPNUMGE)){
3709 list[i].cop = ESTOPNUMGE;
3710 } else if(!cbstricmp(rp, ESTOPNUMLT)){
3711 list[i].cop = ESTOPNUMLT;
3712 } else if(!cbstricmp(rp, ESTOPNUMLE)){
3713 list[i].cop = ESTOPNUMLE;
3714 } else {
3715 list[i].cop = NULL;
3716 }
3717 }
/* find the first condition on the attribute kept in the special cache, if the cache exists */
3718 if(db->spacc){
3719 for(i = 0; i < anum; i++){
3720 if(!strcmp(list[i].name, db->scname)){
3721 ci = i;
3722 break;
3723 }
3724 }
3725 }
/* find the first condition on the ordering attribute so that its value can be reused */
3726 if(oname){
3727 for(i = 0; i < anum; i++){
3728 if(!strcmp(list[i].name, oname)){
3729 oi = i;
3730 break;
3731 }
3732 }
3733 }
/* check every candidate; survivors are packed to the head of `scores' */
3734 nnum = 0;
3735 for(i = 0; i < snum; i++){
3736 scores[i].value = NULL;
/* look the cached value up and, on a hit, call cbmapmove on the record */
3737 if(ci >= 0){
3738 if((cbuf = cbmapget(db->spacc, (char *)&(scores[i].id), sizeof(int), &csiz)) != NULL)
3739 cbmapmove(db->spacc, (char *)&(scores[i].id), sizeof(int), FALSE);
3740 } else {
3741 cbuf = NULL;
3742 csiz = 0;
3743 }
/* the attribute record is read from the attribute database unless the cached value alone
   suffices (a single condition whose value is cached); a candidate without a record is
   dropped */
3744 mbuf = NULL;
3745 if((cbuf && anum == 1) ||
3746 (mbuf = crget(db->attrdb, (char *)&(scores[i].id), sizeof(int), 0, -1, &msiz)) != NULL){
3747 miss = FALSE;
3748 for(j = 0; !miss && j < anum; j++){
3749 if(list[j].nsiz < 1) continue;
/* take the value from the record if it was read, else from the cache; a cached region of a
   single '\0' byte stands for "no such attribute" */
3750 if(mbuf){
3751 vbuf = cbmaploadone(mbuf, msiz, list[j].name, list[j].nsiz, &vsiz);
3752 } else if(csiz != 1 || cbuf[0] != '\0'){
3753 vbuf = cbmemdup(cbuf, csiz);
3754 vsiz = csiz;
3755 } else {
3756 vbuf = NULL;
3757 }
/* an empty operator only requires the attribute to exist; otherwise a missing attribute is
   compared as an empty string */
3758 if(list[j].oper[0] == '\0'){
3759 if(!vbuf) miss = TRUE;
3760 } else {
3761 if(!vbuf){
3762 vbuf = cbmemdup("", 0);
3763 vsiz = 0;
3764 }
3765 if(!est_match_attr(vbuf, vsiz, list[j].cop, list[j].sign, list[j].val, list[j].vsiz,
3766 list[j].sval, list[j].ssiz, list[j].num))
3767 miss = TRUE;
3768 }
/* store the value of the cached attribute if it was not cached yet; a missing attribute is
   stored as a single '\0' byte; when the cache holds more records than `scmnum', about one
   tenth of that number is removed, taken in iteration order */
3769 if(j == ci && !cbuf){
3770 if(vbuf){
3771 cbmapput(db->spacc, (char *)&(scores[i].id), sizeof(int), vbuf, vsiz, FALSE);
3772 } else {
3773 cbmapput(db->spacc, (char *)&(scores[i].id), sizeof(int), "", 1, FALSE);
3774 }
3775 if(cbmaprnum(db->spacc) > db->scmnum){
3776 num = db->scmnum * 0.1 + 1;
3777 cbmapiterinit(db->spacc);
3778 for(k = 0; k < num && (ibuf = cbmapiternext(db->spacc, &isiz)) != NULL; k++){
3779 cbmapout(db->spacc, ibuf, isiz);
3780 }
3781 }
3782 }
/* keep the value of the ordering attribute for the sort below, release the others */
3783 if(j == oi){
3784 scores[i].value = vbuf;
3785 } else {
3786 free(vbuf);
3787 }
3788 }
3789 if(miss){
3790 free(scores[i].value);
3791 } else {
3792 scores[nnum++] = scores[i];
3793 }
3794 }
3795 free(mbuf);
3796 }
3797 snum = nnum;
3798 for(i = 0; i < anum; i++){
3799 free(list[i].sval);
3800 free(list[i].val);
3801 free(list[i].oper);
3802 free(list[i].name);
3803 }
3804 free(list);
3805 } else {
/* without conditions, only the scratch member is cleared */
3806 for(i = 0; i < snum; i++){
3807 scores[i].value = NULL;
3808 }
3809 }
3810 if(oname){
/* from here `ci' is a flag: whether the ordering attribute is the cached one */
3811 ci = db->spacc && !strcmp(oname, db->scname);
3812 onlen = strlen(oname);
/* load the ordering attribute of each survivor which does not have it yet: from the cache if
   possible, else from the attribute database; a missing value becomes an empty string */
3813 for(i = 0; i < snum; i++){
3814 if(scores[i].value) continue;
3815 if(ci && (cbuf = cbmapget(db->spacc, (char *)&(scores[i].id), sizeof(int), &csiz)) != NULL){
3816 cbmapmove(db->spacc, (char *)&(scores[i].id), sizeof(int), FALSE);
3817 if(csiz == 1 && cbuf[0] == '\0'){
3818 scores[i].value = cbmemdup("", 0);
3819 } else {
3820 scores[i].value = cbmemdup(cbuf, csiz);
3821 }
3822 continue;
3823 }
3824 if((mbuf = crget(db->attrdb, (char *)&(scores[i].id), sizeof(int), 0, -1, &msiz)) != NULL){
3825 if((vbuf = cbmaploadone(mbuf, msiz, oname, onlen, &vsiz)) != NULL){
3826 if(ci) cbmapput(db->spacc, (char *)&(scores[i].id), sizeof(int), vbuf, vsiz, FALSE);
3827 scores[i].value = vbuf;
3828 } else {
3829 if(ci) cbmapput(db->spacc, (char *)&(scores[i].id), sizeof(int), "", 1, FALSE);
3830 scores[i].value = cbmemdup("", 0);
3831 }
/* same trimming of the cache as in the narrowing phase */
3832 if(ci && cbmaprnum(db->spacc) > db->scmnum){
3833 num = db->scmnum * 0.1 + 1;
3834 cbmapiterinit(db->spacc);
3835 for(j = 0; j < num && (ibuf = cbmapiternext(db->spacc, &isiz)) != NULL; j++){
3836 cbmapout(db->spacc, ibuf, isiz);
3837 }
3838 }
3839 free(mbuf);
3840 } else {
3841 scores[i].value = cbmemdup("", 0);
3842 }
3843 }
/* sort by the order type; an unknown type leaves the order as it is */
3844 if(!cbstricmp(otype, ESTORDSTRA)){
3845 qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_str_asc);
3846 } else if(!cbstricmp(otype, ESTORDSTRD)){
3847 qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_str_desc);
3848 } else if(!cbstricmp(otype, ESTORDNUMA)){
/* for numeric order the string is released and the number itself is stored in the pointer;
   it is reset to NULL after the sort so that the final free loop below is harmless */
3849 for(i = 0; i < snum; i++){
3850 tval = cbstrmktime(scores[i].value);
3851 free(scores[i].value);
3852 scores[i].value = (void *)tval;
3853 }
3854 qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_num_asc);
3855 for(i = 0; i < snum; i++){
3856 scores[i].value = NULL;
3857 }
3858 } else if(!cbstricmp(otype, ESTORDNUMD)){
3859 for(i = 0; i < snum; i++){
3860 tval = cbstrmktime(scores[i].value);
3861 free(scores[i].value);
3862 scores[i].value = (void *)tval;
3863 }
3864 qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_num_desc);
3865 for(i = 0; i < snum; i++){
3866 scores[i].value = NULL;
3867 }
3868 }
/* release the values; `value' is not cleared here and dangles after this loop */
3869 for(i = 0; i < snum; i++){
3870 free(scores[i].value);
3871 }
3872 free(oname);
3873 }
3874 return snum;
3875 }
3876
3877
3878 /* Check whether a score matches an attribute condition.
3879 `tval' specifies the target value;
3880 `tsiz' specifies the size of the target value
3881 `oval' specifies the operation value;
3882 `osiz' specifies the size of the operation value
3883 `sval' specifies the operation value of small cases;
3884 `ssiz' specifies the size of the operation value of small cases.
3885 `onum' specifies the numeric value.
3886 The return value is true if it does match, else it is false. */
3887 static int est_match_attr(const char *tval, int tsiz, const char *cop, int sign,
3888 const char *oval, int osiz, const char *sval, int ssiz, int onum){
3889 unsigned char *eval;
3890 char *cval;
3891 int csiz, esiz, hit;
3892 assert(tval && tsiz >= 0 && oval && osiz >= 0);
3893 cval = NULL;
3894 if(sval){
3895 eval = (unsigned char *)est_uconv_in(tval, tsiz, &esiz);
3896 est_normalize_text(eval, esiz, &esiz);
3897 est_canonicalize_text(eval, esiz, FALSE);
3898 cval = (char *)est_uconv_out((char *)eval, esiz, &csiz);
3899 free(eval);
3900 tval = cval;
3901 tsiz = csiz;
3902 oval = sval;
3903 osiz = ssiz;
3904 }
3905 if(cop == ESTOPSTREQ){
3906 hit = !strcmp(tval, oval);
3907 } else if(cop == ESTOPSTRNE){
3908 hit = strcmp(tval, oval) != 0;
3909 } else if(cop == ESTOPSTRINC){
3910 hit = strstr(tval, oval) != NULL;
3911 } else if(cop == ESTOPSTRBW){
3912 hit = cbstrfwmatch(tval, oval);
3913 } else if(cop == ESTOPSTREW){
3914 hit = cbstrbwmatch(tval, oval);
3915 } else if(cop == ESTOPNUMEQ){
3916 hit = cbstrmktime(tval) == onum;
3917 } else if(cop == ESTOPNUMNE){
3918 hit = cbstrmktime(tval) != onum;
3919 } else if(cop == ESTOPNUMGT){
3920 hit = cbstrmktime(tval) > onum;
3921 } else if(cop == ESTOPNUMGE){
3922 hit = cbstrmktime(tval) >= onum;
3923 } else if(cop == ESTOPNUMLT){
3924 hit = cbstrmktime(tval) < onum;
3925 } else if(cop == ESTOPNUMLE){
3926 hit = cbstrmktime(tval) <= onum;
3927 } else {
3928 hit = FALSE;
3929 }
3930 free(cval);
3931 return sign ? hit : !hit;
3932 }
3933
3934
3935 /* Compare two keywords by scores in descending order.
3936 `ap' specifies the pointer to one keyword.
3937 `bp' specifies the pointer to the other keyword.
3938 The return value is negative if one is small, positive if one is big, 0 if both are equal. */
3939 static int est_keysc_compare(const void *ap, const void *bp){
3940 assert(ap && bp);
3941 return ((ESTKEYSC *)bp)->pt - ((ESTKEYSC *)ap)->pt;
3942 }
3943
3944
3945 /* Get a similar set of documents in a database.
3946 `db' specifies a database object.
3947 `svmap' specifies a map object of a seed vector.
3948 `nump' specifies the pointer to which the number of elements in the result is assigned.
3949 `knum' specifies the number of keywords to get candidates.
3950 `unum' specifies the number of adopted documents for a keyword.
3951 `tfidf' specifies whether to perform TF-IDF tuning.
3952 `nmin' specifies the minimum value for narrowing.
3953 The return value is an array whose elements are ID numbers of similar documents. */
3954 static ESTSCORE *est_search_similar(ESTDB *db, CBMAP *svmap, int *nump,
3955 int knum, int unum, int tfidf, double nmin){
3956 ESTSCORE *scores, *tscores;
3957 CBMAP *tvmap;
3958 const char *word;
3959 int i, j, vnum, snum, tmax, tsnum, nnum, lid, *svec, *tvec;
3960 double dval;
3961 assert(db && svmap && nump && knum >= 0 && unum >= 0 && nmin >= 0.0);
3962 CB_MALLOC(scores, sizeof(ESTSCORE) * unum * knum);
3963 snum = 0;
3964 if((vnum = cbmaprnum(svmap)) < 1) vnum = 1;
3965 cbmapiterinit(svmap);
3966 tmax = unum;
3967 for(i = 0; i < knum && (word = cbmapiternext(svmap, NULL)) != NULL; i++){
3968 tscores = est_search_union(db, word, 1, &tsnum, NULL, TRUE);
3969 qsort(tscores, tsnum, sizeof(ESTSCORE), est_score_compare_by_score);
3970 for(j = 0; j < tmax && j < tsnum; j++){
3971 scores[snum].id = tscores[j].id;
3972 scores[snum].score = tscores[j].score;
3973 snum++;
3974 }
3975 free(tscores);
3976 tmax -= unum / knum / 1.25;
3977 }
3978 qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_id);
3979 nnum = 0;
3980 lid = -1;
3981 CB_MALLOC(svec, vnum * sizeof(int));
3982 CB_MALLOC(tvec, vnum * sizeof(int));
3983 est_set_svec(svmap, svec, vnum);
3984 for(i = 0; i < snum; i++){
3985 if(scores[i].id != lid){
3986 tvmap = NULL;
3987 if(db->cbvec) tvmap = db->cbvec(db, scores[i].id, db->vecdata);
3988 if(!tvmap) tvmap = est_get_tvmap(db, scores[i].id, vnum, tfidf);
3989 if(tvmap){
3990 est_set_tvec(svmap, tvmap, tvec, vnum);
3991 if((dval = est_vec_cos(svec, tvec, vnum)) >= nmin){
3992 scores[nnum].id = scores[i].id;
3993 scores[nnum].score = (int)(dval * 10000);
3994 if(scores[nnum].score == 9999) scores[nnum].score = 10000;
3995 nnum++;
3996 }
3997 cbmapclose(tvmap);
3998 }
3999 }
4000 lid = scores[i].id;
4001 }
4002 free(tvec);
4003 free(svec);
4004 snum = nnum;
4005 *nump = snum;
4006 return scores;
4007 }
4008
4009
4010 /* Create a map object of a vector for similar search from a phrase.
4011 `phrase' specifies a search phrase for similar search.
4012 The return value is a map object of the seed vector. */
4013 static CBMAP *est_phrase_vector(const char *phrase){
4014 CBMAP *svmap;
4015 CBLIST *list;
4016 const char *pv, *rp;
4017 char *utext, *rtext;
4018 int i, num, len, size;
4019 svmap = cbmapopenex(ESTMINIBNUM);
4020 list = cblistopen();
4021 while(*phrase != '\0'){
4022 if(*phrase == ESTOPWITH[0] && cbstrfwmatch(phrase, ESTOPWITH)){
4023 phrase += strlen(ESTOPWITH);
4024 pv = phrase;
4025 while(*phrase != '\0'){
4026 if(*phrase <= ' ' && cbstrfwmatch(phrase + 1, ESTOPWITH)){
4027 phrase++;
4028 break;
4029 }
4030 phrase++;
4031 }
4032 cblistpush(list, pv, phrase - pv);
4033 } else {
4034 phrase++;
4035 }
4036 }
4037 for(i = 0; i < CB_LISTNUM(list); i++){
4038 pv = CB_LISTVAL(list, i, NULL);
4039 while(*pv > '\0' && *pv <= ' '){
4040 pv++;
4041 }
4042 num = strtol(pv, (char **)&rp, 10);
4043 if(rp && (len = rp - pv) > 0 && num >= 0){
4044 utext = est_uconv_in(rp, strlen(rp), &size);
4045 est_normalize_text((unsigned char *)utext, size, &size);
4046 est_canonicalize_text((unsigned char *)utext, size, FALSE);
4047 rtext = est_uconv_out(utext, size, NULL);
4048 cbstrsqzspc(rtext);
4049 if(rtext[0] != '\0') cbmapput(svmap, rtext, -1, pv, len, FALSE);
4050 free(rtext);
4051 free(utext);
4052 }
4053 }
4054 cblistclose(list);
4055 return svmap;
4056 }
4057
4058
4059 /* Get the target vector of a document dynamically.
4060 `db' specifies a database object.
4061 `id' specifies the ID of a document.
4062 `vnum' specifies the number of dimensions of the vector.
4063 `tfidf' specifies whether to perform TF-IDF tuning.
4064 The return value is a map object of the target vector. */
4065 static CBMAP *est_get_tvmap(ESTDB *db, int id, int vnum, int tfidf){
4066 ESTDOC *doc;
4067 CBMAP *tvmap;
4068 assert(db && id > 0);
4069 if(!(doc = est_db_get_doc(db, id, 0))) return NULL;
4070 tvmap = est_db_etch_doc(tfidf ? db : NULL, doc, vnum);
4071 est_doc_delete(doc);
4072 return tvmap;
4073 }
4074
4075
4076 /* Set a seed vector from a map object.
4077 `svmap' specifies a map object of a seed vector.
4078 `svec' specifies a vector object.
4079 `vnum' specifies the number of dimensions of the vector. */
4080 static void est_set_svec(CBMAP *svmap, int *svec, int vnum){
4081 const char *kbuf;
4082 int i, ksiz;
4083 assert(svmap && svec && vnum > 0);
4084 cbmapiterinit(svmap);
4085 for(i = 0; i < vnum; i++){
4086 if((kbuf = cbmapiternext(svmap, &ksiz)) != NULL){
4087 svec[i] = atoi(cbmapget(svmap, kbuf, ksiz, NULL));
4088 } else {
4089 svec[i] = 0;
4090 }
4091 }
4092 }
4093
4094
4095 /* Set a target vector from a map object.
4096 `svmap' specifies a map object of a seed vector.
4097 `tvmap' specifies a map object of a target vector.
4098 `tvec' specifies a vector object.
4099 `vnum' specifies the number of dimensions of the vector. */
4100 static void est_set_tvec(CBMAP *svmap, CBMAP *tvmap, int *tvec, int vnum){
4101 const char *kbuf, *vbuf;
4102 int i, ksiz;
4103 assert(svmap && tvmap && tvec && vnum > 0);
4104 cbmapiterinit(svmap);
4105 for(i = 0; i < vnum; i++){
4106 if((kbuf = cbmapiternext(svmap, &ksiz)) != NULL){
4107 vbuf = cbmapget(tvmap, kbuf, ksiz, NULL);
4108 tvec[i] = vbuf ? atoi(vbuf) : 0;
4109 } else {
4110 tvec[i] = 0;
4111 }
4112 }
4113 }
4114
4115
4116 /* Get the absolute of a vector.
4117 `vec' specifies a vector object.
4118 `vnum' specifies the number of dimensions of the vector.
4119 The return value is the absolute of the vector. */
4120 static double est_vec_abs(const int *vec, int vnum){
4121 double rv;
4122 int i;
4123 assert(vec && vnum >= 0);
4124 rv = 0;
4125 for(i = 0; i < vnum; i++){
4126 rv += (double)vec[i] * (double)vec[i];
4127 }
4128 return sqrt(rv);
4129 }
4130
4131
/* Get the inner (dot) product of two vectors.
   `avec' specifies a vector object.
   `bvec' specifies the other vector object.
   `vnum' specifies the number of dimensions of the vector.
   The return value is the sum of the element-wise products of the two vectors. */
static double est_vec_iprod(const int *avec, const int *bvec, int vnum){
  double acc;
  int idx;
  assert(avec && bvec && vnum >= 0);
  acc = 0;
  idx = 0;
  while(idx < vnum){
    /* multiply in double so that large elements do not overflow int */
    acc += (double)avec[idx] * (double)bvec[idx];
    idx++;
  }
  return acc;
}
4147
4148
/* Get the cosine of the angle of two vectors.
   `avec' specifies a vector object.
   `bvec' specifies the other vector object.
   `vnum' specifies the number of dimensions of the vector.
   The return value is the cosine of the angle of two vectors, or 0.0 when the cosine is
   not positive. */
static double est_vec_cos(const int *avec, const int *bvec, int vnum){
  double cosine;
  assert(avec && bvec && vnum >= 0);
  cosine = est_vec_iprod(avec, bvec, vnum) /
    (est_vec_abs(avec, vnum) * est_vec_abs(bvec, vnum));
  /* a vector of zero magnitude makes the quotient 0/0; with IEEE 754 arithmetic that is
     NaN, which fails this comparison and therefore also yields 0.0 */
  if(cosine > 0.0) return cosine;
  return 0.0;
}
4161
4162
4163 /* Close the handle to the file of random number generator. */
4164 static void est_random_fclose(void){
4165 if(est_random_ifp) fclose(est_random_ifp);
4166 }
4167
4168
4169
4170 /* END OF FILE */

  ViewVC Help
Powered by ViewVC 1.1.26