/[hyperestraier]/trunk/estraier.c
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/estraier.c

Parent Directory | Revision Log


Revision 3 - (hide annotations)
Fri Jul 29 21:57:20 2005 UTC (18 years, 10 months ago) by dpavlin
File MIME type: text/plain
File size: 132031 byte(s)
make working copy from version 0.5.1

1 dpavlin 2 /*************************************************************************************************
2     * Implementation of the core API
3     * Copyright (C) 2004-2005 Mikio Hirabayashi
4     * This file is part of Hyper Estraier.
5     * Hyper Estraier is free software; you can redistribute it and/or modify it under the terms of
6     * the GNU Lesser General Public License as published by the Free Software Foundation; either
7     * version 2.1 of the License or any later version. Hyper Estraier is distributed in the hope
8     * that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
9     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
10     * License for more details.
11     * You should have received a copy of the GNU Lesser General Public License along with Hyper
12     * Estraier; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
13     * Boston, MA 02111-1307 USA.
14     *************************************************************************************************/
15    
16    
17     #include "estraier.h"
18     #include "myconf.h"
19    
20     #define ESTNUMBUFSIZ 32 /* size of a buffer holding a number as text */
21     #define ESTPATHBUFSIZ 4096 /* size of a buffer holding a file path */
22     #define ESTIOBUFSIZ 8192 /* size of a buffer for I/O */
23     #define ESTALLOCUNIT 1024 /* unit number of memory allocation */
24     #define ESTMINIBNUM 31 /* bucket number of small maps (e.g. attribute maps) */
25    
26     #define ESTMETADBNAME "_meta" /* name of the meta database */
27     #define ESTKEYIDXNUM "_idxnum" /* key for the number of inverted indexes */
28     #define ESTKEYDSEQ "_dseq" /* key for the sequence for document IDs */
29     #define ESTKEYDNUM "_dnum" /* key for the number of documents */
30     #define ESTKEYAMODE "_amode" /* key for the mode of the text analyzer */
31     #define ESTKEYMETA "_meta" /* key for meta data */
32    
33     #define ESTIDXDBNAME "_idx" /* name of the inverted index */
34     #define ESTIDXDBLRM 77 /* records in a leaf node of the inverted index */
35     #define ESTIDXDBNIM 160 /* records in a non-leaf node of the inverted index */
36     #define ESTIDXDBLCN 16 /* number of leaf cache of the inverted index */
37     #define ESTIDXDBNCN 16 /* number of non-leaf cache of the inverted index */
38     #define ESTIDXDBRLCN 128 /* number of leaf cache of the index reader */
39     #define ESTIDXDBRNCN 64 /* number of non-leaf cache of the index reader */
40    
41     #define ESTFWMDBNAME "_fwm" /* name of the database for forward matching */
42     #define ESTFWMDBLRM 111 /* records in a leaf node of forward matching DB */
43     #define ESTFWMDBNIM 110 /* records in a non-leaf node of forward matching DB */
44     #define ESTFWMDBLCN 32 /* number of leaf cache of forward matching DB */
45     #define ESTFWMDBNCN 16 /* number of non-leaf cache of forward matching DB */
46    
47     #define ESTATTRDBNAME "_attr" /* name of the database for attributes */
48     #define ESTATTRDBBNUM 122869 /* bucket number of the database for attributes */
49     #define ESTATTRDBDNUM 3 /* division number of the database for attributes */
50     #define ESTATTRDBALN -5 /* alignment of the database for attributes */
51    
52     #define ESTTEXTDBNAME "_text" /* name of the database for texts */
53     #define ESTTEXTDBBNUM 30713 /* bucket number of the database for texts */
54     #define ESTTEXTDBDNUM 7 /* division number of the database for texts */
55     #define ESTTEXTDBALN -5 /* alignment of the database for texts */
56    
57     #define ESTLISTDBNAME "_list" /* name of the database for the document list */
58     #define ESTLISTDBLRM 99 /* records in a leaf node of document list DB */
59     #define ESTLISTDBNIM 200 /* records in a non-leaf node of document list DB */
60     #define ESTLISTDBLCN 32 /* number of leaf cache of document list DB */
61     #define ESTLISTDBNCN 16 /* number of non-leaf cache of document list DB */
62    
63     #define ESTIDXCCBNUM 524288 /* bucket number of cache for the inverted index */
64     #define ESTIDXCCMAX (1048576*64) /* max size of the cache for the inverted index (64 * 2^20) */
65     #define ESTOUTCCBNUM 131072 /* bucket number of cache for deleted documents */
66     #define ESTKEYCCMNUM 65536 /* max number of cached records of keys for TF-IDF */
67     #define ESTATTRCCMNUM 8192 /* max number of cached records of attributes */
68     #define ESTTEXTCCMNUM 1024 /* max number of cached records of texts */
69     #define ESTCCCBFREQ 10000 /* frequency of callback for flushing words */
70    
71     #define ESTDIRMODE 00755 /* permission of a directory to be created */
72     #define ESTICCHECKSIZ 32768 /* size of the region checked for character code */
73     #define ESTICMISSMAX 256 /* allowance number of missing characters */
74     #define ESTICALLWRAT 0.001 /* allowance ratio of missing characters */
75     #define ESTZCOMPLEVEL 5 /* level of compression of zlib */
76     #define ESTOCPOINT 10 /* point per occurrence */
77     #define ESTJHASHNUM 251 /* hash number for a junction */
78     #define ESTWORDMAXLEN 48 /* maximum length of a word */
79     #define ESTWORDAVGLEN 8 /* average length of a word */
80     #define ESTKEYSCALW 4 /* allowance ratio of TF-IDF for keywords */
81     #define ESTMEMIRATIO 1.1 /* incremental ratio of memory allocation */
82    
83     #define ESTSMLRKNUM 16 /* number of keywords to get candidates */
84     #define ESTSMLRUNUM 1024 /* number of adopted documents for a keyword */
85     #define ESTSMLRNMIN 0.5 /* the minimum value for narrowing */
86    
87     enum { /* enumeration of character categories (see the est_char_category prototypes) */
88     ESTSPACECHR, /* space characters */
89     ESTDELIMCHR, /* delimiter characters */
90     ESTWESTALPH, /* western alphabets */
91     ESTEASTALPH /* eastern alphabets */
92     };
93    
94     enum { /* enumeration of text analyzer modes */
95     ESTAMNORMAL, /* normal */
96     ESTAMPERFNG /* perfect N-gram */
97     };
98    
99     typedef struct { /* type of structure for a hit (one matching document) */
100     int id; /* ID of a document */
101     int score; /* score tuned by TF-IDF */
102     char *value; /* value of an attribute for sorting */
103     } ESTSCORE;
104    
105     typedef struct { /* type of structure for a conditional attribute */
106     char *name; /* name of the attribute */
107     int nsiz; /* size of the name */
108     char *oper; /* operator */
109     char *val; /* value to compare with */
110     int vsiz; /* size of the value */
111     const char *cop; /* canonical operator */
112     int sign; /* positive or negative */
113     char *sval; /* the value in small (lower) case */
114     int ssiz; /* size of the small-case value */
115     time_t num; /* numeric value */
116     } ESTCATTR;
117    
118     typedef struct { /* type of structure for a scored keyword */
119     const char *word; /* face of keyword */
120     int wsiz; /* size of the keyword */
121     int pt; /* score tuned by TF-IDF */
122     } ESTKEYSC;
123    
124    
125     /* prototypes of the private (static, file-local) helper functions */
126     static int est_enc_miss(const char *ptr, int size, const char *icode, const char *ocode);
127     static void est_normalize_text(unsigned char *utext, int size, int *sp);
128     static void est_canonicalize_text(unsigned char *utext, int size, int funcspc);
129     static int est_char_category(int c);
130     static int est_char_category_perfng(int c);
131     static char *est_phrase_from_thumb(const char *sphrase);
132     static void est_snippet_add_text(const unsigned char *rtext, const unsigned char *ctext,
133     int size, int awsiz, CBDATUM *res, const CBLIST *rwords);
134     static int est_str_fwmatch_wide(const unsigned char *haystack, int hsiz,
135     const unsigned char *needle, int nsiz);
136     static ESTIDX *est_idx_open(const char *name, int omode, int dnum);
137     static int est_idx_close(ESTIDX *idx);
138     static void est_idx_set_tuning(ESTIDX *idx, int lrecmax, int nidxmax, int lcnum, int ncnum);
139     static void est_idx_increment(ESTIDX *idx);
140     static int est_idx_add(ESTIDX *idx, const char *word, int wsiz, const char *vbuf, int vsiz);
141     static int est_idx_out(ESTIDX *idx, const char *word, int wsiz);
142     static char *est_idx_get(ESTIDX *idx, const char *word, int wsiz, int *sp);
143     static int est_idx_vsiz(ESTIDX *idx, const char *word, int wsiz);
144     static int est_idx_num(ESTIDX *idx);
145     static int est_idx_size(ESTIDX *idx);
146     static int est_idx_sync(ESTIDX *idx);
147     static int est_idx_optimize(ESTIDX *idx);
148     static void est_idx_set_current(ESTIDX *idx);
149     static int est_db_write_meta(ESTDB *db);
150     static void est_db_inform(ESTDB *db, const char *info);
151     static int est_db_used_cache_size(ESTDB *db);
152     static void est_db_prepare_meta(ESTDB *db);
153     static CBLIST *est_phrase_terms(const char *phrase);
154     static int est_score_compare_by_id(const void *ap, const void *bp);
155     static int est_score_compare_by_score(const void *ap, const void *bp);
156     static int est_score_compare_by_str_asc(const void *ap, const void *bp);
157     static int est_score_compare_by_str_desc(const void *ap, const void *bp);
158     static int est_score_compare_by_num_asc(const void *ap, const void *bp);
159     static int est_score_compare_by_num_desc(const void *ap, const void *bp);
160     static ESTSCORE *est_search_uvset(ESTDB *db, int *nump, CBMAP *hints, int add);
161     static void est_expand_word(ESTDB *db, const char *word, CBLIST *list);
162     static ESTSCORE *est_search_union(ESTDB *db, const char *term, int gstep,
163     int *nump, CBMAP *hints, int add);
164     static int est_narrow_scores(ESTDB *db, const CBLIST *attrs, const char *order,
165     ESTSCORE *scores, int snum);
166     static int est_match_attr(const char *tval, int tsiz, const char *cop, int sign,
167     const char *oval, int osiz, const char *sval, int ssiz, int onum);
168     static int est_keysc_compare(const void *ap, const void *bp);
169     static ESTSCORE *est_search_similar(ESTDB *db, CBMAP *svmap, int *nump,
170     int knum, int unum, int tfidf, double nmin);
171     static CBMAP *est_phrase_vector(const char *phrase);
172     static CBMAP *est_get_tvmap(ESTDB *db, int id, int vnum, int tfidf);
173     static void est_set_svec(CBMAP *svmap, int *svec, int vnum);
174     static void est_set_tvec(CBMAP *svmap, CBMAP *tvmap, int *tvec, int vnum);
175     static double est_vec_abs(const int *vec, int vnum);
176     static double est_vec_iprod(const int *avec, const int *bvec, int vnum);
177     static double est_vec_cos(const int *avec, const int *bvec, int vnum);
178     static void est_random_fclose(void);
179    
180    
181    
182     /*************************************************************************************************
183     * common settings
184     *************************************************************************************************/
185    
186    
187     /* version of Hyper Estraier (the string comes from the _EST_VERSION macro) */
188     const char *est_version = _EST_VERSION;
189    
190    
191    
192     /*************************************************************************************************
193     * API for document
194     *************************************************************************************************/
195    
196    
197     /* Create a document object. */
198     ESTDOC *est_doc_new(void){
199     ESTDOC *doc;
200     CB_MALLOC(doc, sizeof(ESTDOC));
201     doc->id = -1;
202     doc->attrs = NULL;
203     doc->dtexts = NULL;
204     return doc;
205     }
206    
207    
208     /* Create a document object made from draft data. */
209     ESTDOC *est_doc_new_from_draft(const char *draft){
210     ESTDOC *doc;
211     CBLIST *lines;
212     const char *line;
213     char *pv;
214     int i;
215     assert(draft);
216     doc = est_doc_new();
217     lines = cbsplit(draft, -1, "\n");
218     for(i = 0; i < CB_LISTNUM(lines); i++){
219     line = CB_LISTVAL(lines, i, NULL);
220     while(*line > '\0' && *line <= ' '){
221     line++;
222     }
223     if(line[0] == '\0'){
224     i++;
225     break;
226     }
227     if((pv = strchr(line, '=')) != NULL){
228     *(pv++) = '\0';
229     est_doc_add_attr(doc, line, pv);
230     }
231     }
232     for(; i < CB_LISTNUM(lines); i++){
233     line = CB_LISTVAL(lines, i, NULL);
234     if(line[0] == '\t'){
235     est_doc_add_hidden_text(doc, line + 1);
236     } else {
237     est_doc_add_text(doc, line);
238     }
239     }
240     cblistclose(lines);
241     return doc;
242     }
243    
244    
245     /* Destroy a document object. */
246     void est_doc_delete(ESTDOC *doc){
247     assert(doc);
248     if(doc->dtexts) cblistclose(doc->dtexts);
249     if(doc->attrs) cbmapclose(doc->attrs);
250     free(doc);
251     }
252    
253    
254     /* Add an attribute to a document object. */
255     void est_doc_add_attr(ESTDOC *doc, const char *name, const char *value){
256     char *rbuf, *wp;
257     assert(doc && name);
258     if(name[0] == '\0') return;
259     if(!doc->attrs) doc->attrs = cbmapopenex(ESTMINIBNUM);
260     if(value){
261     rbuf = cbmemdup(value, -1);
262     for(wp = rbuf; *wp != '\0'; wp++){
263     if(*wp > 0 && *wp < ' ') *wp = ' ';
264     }
265     cbstrsqzspc(rbuf);
266     cbmapputvbuf(doc->attrs, name, strlen(name), rbuf, strlen(rbuf));
267     } else {
268     cbmapout(doc->attrs, name, -1);
269     }
270     }
271    
272    
273     /* Add a sentence of text to a document object. */
274     void est_doc_add_text(ESTDOC *doc, const char *text){
275     unsigned char *utext;
276     char *rtext, *wp;
277     int size;
278     assert(doc && text);
279     while(*text > '\0' && *text <= ' '){
280     text++;
281     }
282     if(text[0] == '\0') return;
283     if(!doc->dtexts) doc->dtexts = cblistopen();
284     utext = (unsigned char *)est_uconv_in(text, strlen(text), &size);
285     est_normalize_text(utext, size, &size);
286     rtext = est_uconv_out((char *)utext, size, NULL);
287     for(wp = rtext; *wp != '\0'; wp++){
288     if(*wp > 0 && *wp < ' ') *wp = ' ';
289     }
290     cbstrsqzspc(rtext);
291     if(rtext[0] != '\0'){
292     cblistpushbuf(doc->dtexts, rtext, strlen(rtext));
293     } else {
294     free(rtext);
295     }
296     free(utext);
297     }
298    
299    
300     /* Add a hidden sentence to a document object. */
301     void est_doc_add_hidden_text(ESTDOC *doc, const char *text){
302     unsigned char *utext;
303     char *rtext, *wp;
304     int size;
305     assert(doc && text);
306     while(*text > '\0' && *text <= ' '){
307     text++;
308     }
309     if(text[0] == '\0') return;
310     utext = (unsigned char *)est_uconv_in(text, strlen(text), &size);
311     est_normalize_text(utext, size, &size);
312     rtext = est_uconv_out((char *)utext, size, NULL);
313     for(wp = rtext; *wp != '\0'; wp++){
314     if(*wp > 0 && *wp < ' ') *wp = ' ';
315     }
316     cbstrsqzspc(rtext);
317     if(rtext[0] != '\0'){
318     if(!doc->attrs) doc->attrs = cbmapopenex(ESTMINIBNUM);
319     if(cbmapget(doc->attrs, "", 0, NULL)) cbmapputcat(doc->attrs, "", 0, " ", 1);
320     cbmapputcat(doc->attrs, "", 0, rtext, -1);
321     }
322     free(rtext);
323     free(utext);
324     }
325    
326    
327     /* Get the ID number of a document object. */
328     int est_doc_id(ESTDOC *doc){
329     assert(doc);
330     return doc->id;
331     }
332    
333    
334     /* Get a list of attribute names of a document object. */
335     CBLIST *est_doc_attr_names(ESTDOC *doc){
336     CBLIST *names;
337     const char *kbuf;
338     int ksiz;
339     assert(doc);
340     if(!doc->attrs) return cblistopen();
341     names = cblistopen();
342     cbmapiterinit(doc->attrs);
343     while((kbuf = cbmapiternext(doc->attrs, &ksiz)) != NULL){
344     if(ksiz > 0) cblistpush(names, kbuf, ksiz);
345     }
346     cblistsort(names);
347     return names;
348     }
349    
350    
351     /* Get the value of an attribute of a document object. */
352     const char *est_doc_attr(ESTDOC *doc, const char *name){
353     assert(doc && name);
354     if(!doc->attrs || name[0] == '\0') return NULL;
355     return cbmapget(doc->attrs, name, -1, NULL);
356     }
357    
358    
359     /* Get a list of sentences of the text of a document object. */
360     const CBLIST *est_doc_texts(ESTDOC *doc){
361     assert(doc);
362     if(!doc->dtexts) doc->dtexts = cblistopen();
363     return doc->dtexts;
364     }
365    
366    
367     /* Concatenate sentences of the text of a document object. */
368     char *est_doc_cat_texts(ESTDOC *doc){
369     CBDATUM *datum;
370     const char *elem;
371     int i, size;
372     if(!doc->dtexts) return cbmemdup("", 0);
373     datum = cbdatumopen("", 0);
374     for(i = 0; i < CB_LISTNUM(doc->dtexts); i++){
375     elem = CB_LISTVAL2(doc->dtexts, i, &size);
376     if(i > 0) cbdatumcat(datum, " ", 1);
377     cbdatumcat(datum, elem, size);
378     }
379     return cbdatumtomalloc(datum, NULL);
380     }
381    
382    
383     /* Dump draft data of a document object. */
384     char *est_doc_dump_draft(ESTDOC *doc){
385     CBLIST *list;
386     CBDATUM *datum;
387     const char *kbuf, *vbuf;
388     int i, ksiz, vsiz;
389     assert(doc);
390     datum = cbdatumopen("", 0);
391     if(doc->attrs){
392     list = est_doc_attr_names(doc);
393     for(i = 0; i < CB_LISTNUM(list); i++){
394     kbuf = CB_LISTVAL2(list, i, &ksiz);
395     vbuf = cbmapget(doc->attrs, kbuf, ksiz, &vsiz);
396     cbdatumcat(datum, kbuf, ksiz);
397     cbdatumcat(datum, "=", 1);
398     cbdatumcat(datum, vbuf, vsiz);
399     cbdatumcat(datum, "\n", 1);
400     }
401     cblistclose(list);
402     }
403     cbdatumcat(datum, "\n", 1);
404     if(doc->dtexts){
405     for(i = 0; i < CB_LISTNUM(doc->dtexts); i++){
406     kbuf = CB_LISTVAL2(doc->dtexts, i, &ksiz);
407     cbdatumcat(datum, kbuf, ksiz);
408     cbdatumcat(datum, "\n", 1);
409     }
410     }
411     if(doc->attrs && (vbuf = cbmapget(doc->attrs, "", 0, &vsiz)) != NULL){
412     cbdatumcat(datum, "\t", 1);
413     cbdatumcat(datum, vbuf, vsiz);
414     cbdatumcat(datum, "\n", 1);
415     }
416     return cbdatumtomalloc(datum, NULL);
417     }
418    
419    
420     /* Make a snippet of the body text of a document object.
        `doc' is the document object; its sentence list is created empty when absent.
        `words' is the list of words to look for; empty entries and ESTOPUVSET are ignored.
        `wwidth' is the width budget of the whole result, `hwidth' the budget of the leading
        chunk and `awidth' the budget of the context around each matched word.  Widths are
        counted per two-byte character: 2 for the ESTEASTALPH category, 1 otherwise.
        The return value is the buffer obtained from `cbdatumtomalloc' on the accumulated
        result; each chunk is followed by "\n". */
421     char *est_doc_make_snippet(ESTDOC *doc, const CBLIST *words, int wwidth, int hwidth, int awidth){
422     CBDATUM *res, *sbuf;
423     CBMAP *counts;
424     CBLIST *rwords;
425     const char *text, *word, *cval;
426     const unsigned char *rword;
427     unsigned char *rtext, *ctext;
428     int i, j, k, bi, size, wsiz, rwsiz, mywidth, awsiz, csiz;
429     assert(doc && words && wwidth >= 0 && hwidth >= 0 && awidth >= 0);
430     if(!doc->dtexts) doc->dtexts = cblistopen();
431     res = cbdatumopen("", 0);
        /* rwords: converted and canonicalized copies of the usable search words */
432     rwords = cblistopen();
433     for(i = 0; i < CB_LISTNUM(words); i++){
434     word = CB_LISTVAL2(words, i, &wsiz);
435     if(wsiz < 1 || !strcmp(word, ESTOPUVSET)) continue;
436     rtext = (unsigned char *)est_uconv_in(word, wsiz, &size);
437     est_canonicalize_text(rtext, size, TRUE);
        /* the buffer itself goes into the list, so it is not freed here */
438     cblistpushbuf(rwords, (char *)rtext, size);
439     }
        /* sbuf: all sentences joined with single spaces */
440     sbuf = cbdatumopen("", 0);
441     for(i = 0; i < CB_LISTNUM(doc->dtexts); i++){
442     text = CB_LISTVAL2(doc->dtexts, i, &size);
443     if(i > 0) cbdatumcat(sbuf, " ", 1);
444     cbdatumcat(sbuf, text, size);
445     }
        /* rtext is the converted text; ctext is a canonicalized copy of the same size,
           so one byte offset addresses the same character in both */
446     rtext = (unsigned char *)est_uconv_in(CB_DATUMPTR(sbuf), CB_DATUMSIZE(sbuf), &size);
447     ctext = (unsigned char *)cbmemdup((char *)rtext, size);
448     est_canonicalize_text(ctext, size, FALSE);
        /* leading chunk: hwidth wide (three times that when there is no usable word),
           but never wider than wwidth */
449     mywidth = hwidth;
450     if(CB_LISTNUM(rwords) < 1) mywidth *= 3;
451     if(mywidth > wwidth) mywidth = wwidth;
452     for(i = 0; i < size && mywidth > 0; i += 2){
453     mywidth -= est_char_category(rtext[i] * 0x100 + rtext[i+1]) == ESTEASTALPH ? 2 : 1;
454     }
        /* awsiz: number of bytes following the chunk, capped at ESTWORDMAXLEN; it is handed
           to est_snippet_add_text (its use there is not visible in this part of the file) */
455     awsiz = size - i;
456     if(awsiz > ESTWORDMAXLEN) awsiz = ESTWORDMAXLEN;
457     est_snippet_add_text(rtext, ctext, i, awsiz, res, rwords);
458     wwidth -= hwidth;
        /* bi: byte offset before which no later chunk may start */
459     bi = i + 2;
460     cbdatumcat(res, "\n", 1);
        /* NOTE(review): hwidth is not read again in this function after this assignment */
461     hwidth = 1000;
        /* counts: per word, one '*' for each chunk already emitted for it */
462     counts = cbmapopenex(ESTMINIBNUM);
463     for(i = bi; i < size && wwidth >= 0; i += 2){
464     for(j = 0; j < CB_LISTNUM(rwords); j++){
465     rword = (unsigned char *)CB_LISTVAL2(rwords, j, &rwsiz);
        /* take the match only if the word has no chunk yet, or has fewer than the allowed
           number: 2 while wwidth exceeds awidth * 1.2, otherwise 1 */
466     if(est_str_fwmatch_wide(ctext + i, size - i, rword, rwsiz) > 0 &&
467     (!(cval = cbmapget(counts, (char *)rword, rwsiz, &csiz)) ||
468     csiz < (wwidth > awidth * 1.2 ? 2 : 1))){
469     cbmapputcat(counts, (char *)rword, rwsiz, "*", 1);
        /* once every word has been counted, start counting afresh */
470     if(cbmaprnum(counts) >= CB_LISTNUM(rwords)){
471     cbmapclose(counts);
472     counts = cbmapopenex(ESTMINIBNUM);
473     }
        /* walk backward from the match, not past bi, to find the start of the chunk */
474     mywidth = awidth / 2 + 1;
475     for(k = i - 2; k >= bi && mywidth >= 0; k -= 2){
476     mywidth -= est_char_category(rtext[k] * 0x100 + rtext[k+1]) == ESTEASTALPH ? 2 : 1;
477     }
478     bi = k;
        /* walk forward from behind the matched word to find the end of the chunk */
479     mywidth = awidth / 2 + 1;
480     for(k = i + rwsiz + 2; k < size && mywidth >= 0; k += 2){
481     mywidth -= est_char_category(rtext[k] * 0x100 + rtext[k+1]) == ESTEASTALPH ? 2 : 1;
482     }
483     if(k > size) k = size;
484     est_snippet_add_text(rtext + bi, ctext + bi, k - bi, 0, res, rwords);
485     wwidth -= awidth + rwsiz / 2;
        /* resume scanning behind the emitted chunk; the outer loop adds 2 to i */
486     bi = k + 2;
487     i = bi - 2;
488     cbdatumcat(res, "\n", 1);
489     break;
490     }
491     }
492     }
493     cbmapclose(counts);
494     free(ctext);
495     free(rtext);
496     cbdatumclose(sbuf);
497     cblistclose(rwords);
498     return cbdatumtomalloc(res, NULL);
499     }
500    
501    
502     /* Check whether the text of a document object includes every specified words. */
503     int est_doc_scan_words(ESTDOC *doc, const CBLIST *words){
504     CBLIST *rwords;
505     const unsigned char *rp, *rword;
506     const char *vbuf;
507     unsigned char *rbuf;
508     int i, j, vsiz, rsiz, rwsiz, hit;
509     assert(doc && words);
510     rwords = cblistopen();
511     for(i = 0; i < CB_LISTNUM(words); i++){
512     vbuf = CB_LISTVAL2(words, i, &vsiz);
513     if(vsiz < 1 || !strcmp(vbuf, ESTOPUVSET)) continue;
514     rbuf = (unsigned char *)est_uconv_in(vbuf, vsiz, &rsiz);
515     est_canonicalize_text(rbuf, rsiz, TRUE);
516     cblistpushbuf(rwords, (char *)rbuf, rsiz);
517     }
518     if(doc->dtexts){
519     for(i = 0; i < CB_LISTNUM(doc->dtexts) && CB_LISTNUM(rwords) > 0; i++){
520     vbuf = CB_LISTVAL2(doc->dtexts, i, &vsiz);
521     rbuf = (unsigned char *)est_uconv_in(vbuf, vsiz, &rsiz);
522     est_canonicalize_text(rbuf, rsiz, TRUE);
523     for(rp = rbuf; rsiz >= 0; rp += 2, rsiz -= 2){
524     for(j = 0; j < CB_LISTNUM(rwords); j++){
525     rword = (unsigned char *)CB_LISTVAL2(rwords, j, &rwsiz);
526     if(est_str_fwmatch_wide(rp, rsiz, rword, rwsiz)){
527     free(cblistremove(rwords, j, NULL));
528     j--;
529     }
530     }
531     }
532     free(rbuf);
533     }
534     }
535     if(doc->attrs && (vbuf = cbmapget(doc->attrs, "", 0, &vsiz)) != NULL){
536     rbuf = (unsigned char *)est_uconv_in(vbuf, vsiz, &rsiz);
537     est_canonicalize_text(rbuf, rsiz, TRUE);
538     for(rp = rbuf; rsiz >= 0; rp += 2, rsiz -= 2){
539     for(i = 0; i < CB_LISTNUM(rwords); i++){
540     rword = (unsigned char *)CB_LISTVAL2(rwords, i, &rwsiz);
541     if(est_str_fwmatch_wide(rp, rsiz, rword, rwsiz)){
542     free(cblistremove(rwords, i, NULL));
543     i--;
544     }
545     }
546     }
547     free(rbuf);
548     }
549     hit = CB_LISTNUM(rwords) < 1;
550     cblistclose(rwords);
551     return hit;
552     }
553    
554    
555    
556     /*************************************************************************************************
557     * API for search conditions
558     *************************************************************************************************/
559    
560    
561     /* Create a condition object. */
562     ESTCOND *est_cond_new(void){
563     ESTCOND *cond;
564     CB_MALLOC(cond, sizeof(ESTCOND));
565     cond->phrase = NULL;
566     cond->gstep = 2;
567     cond->tfidf = TRUE;
568     cond->simple = FALSE;
569     cond->attrs = NULL;
570     cond->order = NULL;
571     cond->max = -1;
572     cond->scfb = FALSE;
573     cond->scores = NULL;
574     cond->snum = 0;
575     cond->opts = 0;
576     return cond;
577     }
578    
579    
580     /* Destroy a condition object. */
581     void est_cond_delete(ESTCOND *cond){
582     assert(cond);
583     if(cond->scores) free(cond->scores);
584     if(cond->order) free(cond->order);
585     if(cond->attrs) cblistclose(cond->attrs);
586     if(cond->phrase) free(cond->phrase);
587     free(cond);
588     }
589    
590    
591     /* Set a search phrase to a condition object. */
592     void est_cond_set_phrase(ESTCOND *cond, const char *phrase){
593     assert(cond && phrase);
594     if(cond->phrase) free(cond->phrase);
595     while(*phrase > '\0' && *phrase <= ' '){
596     phrase++;
597     }
598     cond->phrase = cbmemdup(phrase, -1);
599     }
600    
601    
602     /* Add a condition of an attribute fo a condition object. */
603     void est_cond_add_attr(ESTCOND *cond, const char *expr){
604     assert(cond && expr);
605     if(!cond->attrs) cond->attrs = cblistopen();
606     while(*expr > '\0' && *expr <= ' '){
607     expr++;
608     }
609     cblistpush(cond->attrs, expr, -1);
610     }
611    
612    
613     /* Set the order of a condition object. */
614     void est_cond_set_order(ESTCOND *cond, const char *expr){
615     assert(cond && expr);
616     if(!cond->order) free(cond->order);
617     while(*expr > '\0' && *expr <= ' '){
618     expr++;
619     }
620     cond->order = cbmemdup(expr, -1);
621     }
622    
623    
624     /* Set the maximum number of retrieval of a condition object. */
625     void est_cond_set_max(ESTCOND *cond, int max){
626     assert(cond && max >= 0);
627     cond->max = max;
628     }
629    
630    
631     /* Set options of retrieval of a condition object. */
632     void est_cond_set_options(ESTCOND *cond, int options){
633     assert(cond);
634     if(options & ESTCONDSURE) cond->gstep = 1;
635     if(options & ESTCONDUSU) cond->gstep = 2;
636     if(options & ESTCONDFAST) cond->gstep = 3;
637     if(options & ESTCONDAGIT) cond->gstep = 4;
638     if(options & ESTCONDNOIDF) cond->tfidf = FALSE;
639     if(options & ESTCONDSIMPLE) cond->simple = TRUE;
640     if(options & ESTCONDSCFB) cond->scfb = TRUE;
641     cond->opts |= options;
642     }
643    
644    
645    
646     /*************************************************************************************************
647     * API for database
648     *************************************************************************************************/
649    
650    
651     /* Get the string of an error code. */
652     const char *est_err_msg(int ecode){
653     switch(ecode){
654     case ESTENOERR: return "no error";
655     case ESTEINVAL: return "invalid argument";
656     case ESTEACCES: return "access forbidden";
657     case ESTELOCK: return "lock failure";
658     case ESTEDB: return "database problem";
659     case ESTEIO: return "I/O problem";
660     case ESTENOITEM: return "no such item";
661     default: break;
662     }
663     return "miscellaneous";
664     }
665    
666    
667     /* Open a database. */
668     ESTDB *est_db_open(const char *name, int omode, int *ecp){
669     ESTDB *db;
670     DEPOT *metadb;
671     ESTIDX *idxdb;
672     CURIA *attrdb, *textdb;
673     VILLA *fwmdb, *listdb;
674     char path[ESTPATHBUFSIZ], vbuf[ESTNUMBUFSIZ];
675     int domode, comode, vomode, idxnum, dseq, dnum, amode, vsiz;
676     assert(name && ecp);
677     *ecp = ESTENOERR;
678     if((omode & ESTDBWRITER) && (omode & ESTDBCREAT) && !est_mkdir(name)){
679     switch(errno){
680     case EACCES:
681     *ecp = ESTEACCES;
682     return NULL;
683     case EEXIST:
684     break;
685     default:
686     *ecp = ESTEIO;
687     return NULL;
688     }
689     }
690     domode = DP_OREADER;
691     comode = CR_OREADER;
692     vomode = VL_OREADER;
693     if(omode & ESTDBWRITER){
694     domode = DP_OWRITER;
695     comode = CR_OWRITER;
696     vomode = VL_OWRITER | VL_OZCOMP;
697     if(omode & ESTDBCREAT){
698     domode |= DP_OCREAT;
699     comode |= CR_OCREAT;
700     vomode |= VL_OCREAT;
701     }
702     if(omode & ESTDBTRUNC){
703     domode |= DP_OTRUNC;
704     comode |= CR_OTRUNC;
705     vomode |= VL_OTRUNC;
706     }
707     }
708     if(omode & ESTDBNOLCK){
709     domode |= DP_ONOLCK;
710     comode |= CR_ONOLCK;
711     vomode |= VL_ONOLCK;
712     }
713     if(omode & ESTDBLCKNB){
714     domode |= DP_OLCKNB;
715     comode |= CR_OLCKNB;
716     vomode |= VL_OLCKNB;
717     }
718     idxnum = 0;
719     dseq = 0;
720     dnum = 0;
721     amode = ESTAMNORMAL;
722     sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTMETADBNAME);
723     if((metadb = dpopen(path, domode, ESTMINIBNUM)) != NULL){
724     if((vsiz = dpgetwb(metadb, ESTKEYIDXNUM, -1, 0, ESTNUMBUFSIZ - 1, vbuf)) > 0){
725     vbuf[vsiz] = '\0';
726     idxnum = atoi(vbuf);
727     }
728     if((vsiz = dpgetwb(metadb, ESTKEYDSEQ, -1, 0, ESTNUMBUFSIZ - 1, vbuf)) > 0){
729     vbuf[vsiz] = '\0';
730     dseq = atoi(vbuf);
731     }
732     if((vsiz = dpgetwb(metadb, ESTKEYDNUM, -1, 0, ESTNUMBUFSIZ - 1, vbuf)) > 0){
733     vbuf[vsiz] = '\0';
734     dnum = atoi(vbuf);
735     }
736     if((vsiz = dpgetwb(metadb, ESTKEYAMODE, -1, 0, ESTNUMBUFSIZ - 1, vbuf)) > 0){
737     vbuf[vsiz] = '\0';
738     amode = atoi(vbuf);
739     } else if(omode & ESTDBPERFNG){
740     amode = ESTAMPERFNG;
741     }
742     }
743     if(!metadb){
744     *ecp = (dpecode == DP_ELOCK) ? ESTELOCK : ESTEDB;
745     return NULL;
746     }
747     if(idxnum < 1) idxnum = 1;
748     if(dseq < 0) dseq = 0;
749     if(dnum < 0) dnum = 0;
750     sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTIDXDBNAME);
751     idxdb = est_idx_open(path, vomode, idxnum);
752     sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTFWMDBNAME);
753     fwmdb = vlopen(path, vomode, VL_CMPLEX);
754     sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTATTRDBNAME);
755     attrdb = cropen(path, comode, ESTATTRDBBNUM, ESTATTRDBDNUM);
756     sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTTEXTDBNAME);
757     textdb = cropen(path, comode, ESTTEXTDBBNUM, ESTTEXTDBDNUM);
758     sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTLISTDBNAME);
759     listdb = vlopen(path, vomode, VL_CMPLEX);
760     if(!metadb || !idxdb || !fwmdb || !attrdb ||!textdb || !listdb){
761     if(listdb) vlclose(listdb);
762     if(textdb) crclose(textdb);
763     if(attrdb) crclose(attrdb);
764     if(fwmdb) vlclose(fwmdb);
765     if(idxdb) est_idx_close(idxdb);
766     dpclose(metadb);
767     *ecp = ESTEDB;
768     return NULL;
769     }
770     if(omode & ESTDBWRITER){
771     crsetalign(attrdb, ESTATTRDBALN);
772     crsetalign(textdb, ESTTEXTDBALN);
773     est_idx_set_tuning(idxdb, ESTIDXDBLRM, ESTIDXDBNIM, ESTIDXDBLCN, ESTIDXDBNCN);
774     est_idx_set_current(idxdb);
775     vlsettuning(fwmdb, ESTFWMDBLRM, ESTFWMDBNIM, ESTFWMDBLCN, ESTFWMDBNCN);
776     vlsettuning(listdb, ESTLISTDBLRM, ESTLISTDBNIM, ESTLISTDBLCN, ESTLISTDBNCN);
777     } else {
778     est_idx_set_tuning(idxdb, -1, -1, ESTIDXDBRLCN, ESTIDXDBRNCN);
779     vlsettuning(fwmdb, -1, -1, ESTFWMDBLCN, ESTFWMDBNCN);
780     vlsettuning(listdb, -1, -1, ESTLISTDBLCN, ESTLISTDBNCN);
781     }
782     CB_MALLOC(db, sizeof(ESTDB));
783     db->name = cbmemdup(name, -1);
784     db->metadb = metadb;
785     db->idxdb = idxdb;
786     db->fwmdb = fwmdb;
787     db->attrdb = attrdb;
788     db->textdb = textdb;
789     db->listdb = listdb;
790     db->ecode = ESTENOERR;
791     db->fatal = FALSE;
792     db->dseq = dseq;
793     db->dnum = dnum;
794     db->amode = amode;
795     if(omode & ESTDBWRITER){
796     db->idxcc = cbmapopenex(ESTIDXCCBNUM);
797     db->icsiz = 0;
798     db->icmax = ESTIDXCCMAX;
799     db->outcc = cbmapopenex(ESTOUTCCBNUM);
800     } else {
801     db->idxcc = cbmapopenex(1);
802     db->icsiz = 0;
803     db->icmax = 0;
804     db->outcc = cbmapopenex(1);
805     }
806     db->keycc = cbmapopenex(ESTKEYCCMNUM + 1);
807     db->kcmnum = ESTKEYCCMNUM;
808     db->attrcc = cbmapopenex(ESTATTRCCMNUM + 1);
809     db->acmnum = ESTATTRCCMNUM;
810     db->textcc = cbmapopenex(ESTTEXTCCMNUM + 1);
811     db->tcmnum = ESTTEXTCCMNUM;
812     db->spacc = NULL;
813     db->scmnum = 0;
814     db->scname = NULL;
815     db->cbinfo = NULL;
816     db->cbvec = NULL;
817     db->vecdata = NULL;
818     db->metacc = NULL;
819     return db;
820     }
821    
822    
823     /* Close a database. */
824     int est_db_close(ESTDB *db, int *ecp){
825     int err;
826     assert(db && ecp);
827     *ecp = ESTENOERR;
828     err = FALSE;
829     if(dpwritable(db->metadb)){
830     if(!est_db_flush(db, -1) || !est_db_write_meta(db)) err = TRUE;
831     }
832     est_db_inform(db, "closing");
833     if(db->metacc) cbmapclose(db->metacc);
834     if(db->spacc){
835     free(db->scname);
836     cbmapclose(db->spacc);
837     }
838     cbmapclose(db->textcc);
839     cbmapclose(db->attrcc);
840     cbmapclose(db->keycc);
841     cbmapclose(db->outcc);
842     cbmapclose(db->idxcc);
843     if(!vlclose(db->listdb)) err = TRUE;
844     if(!crclose(db->textdb)) err = TRUE;
845     if(!crclose(db->attrdb)) err = TRUE;
846     if(!vlclose(db->fwmdb)) err = TRUE;
847     if(!est_idx_close(db->idxdb)) err = TRUE;
848     if(!dpclose(db->metadb)) err = TRUE;
849     free(db->name);
850     if(db->fatal){
851     *ecp = db->ecode;
852     err = TRUE;
853     } else if(err){
854     *ecp = ESTEDB;
855     }
856     free(db);
857     return err ? FALSE : TRUE;
858     }
859    
860    
861     /* Get the last happended error code of a database. */
862     int est_db_error(ESTDB *db){
863     assert(db);
864     return db->ecode;
865     }
866    
867    
868     /* Check whether a database has a fatal error. */
869     int est_db_fatal(ESTDB *db){
870     assert(db);
871     return db->fatal;
872     }
873    
874    
875     /* Flush index words in the cache of a database.
        `db' specifies a database object.  Its meta database must be writable, otherwise the error
        code is set to `ESTEACCES' and false is returned.
        `max' specifies the maximum number of cached words to be stored; if it is positive, the
        sorted key list is cut down to that many keys.  Only if it is negative, the words and
        document IDs recorded in the out-cache (`db->outcc') are cleaned out of the index as well.
        The return value is true if success, else it is false.  If an inner database operation
        fails, the error code is set to `ESTEDB' and the fatal flag is raised. */
876     int est_db_flush(ESTDB *db, int max){
877     CBMAP *ids;
878     CBLIST *keys;
879     CBDATUM *nval;
880     const char *kbuf, *vbuf, *rp, *pv;
881     char *tbuf;
882     int i, err, ksiz, vsiz, rnum, id, tsiz;
883     assert(db);
884     if(!dpwritable(db->metadb)){
885     db->ecode = ESTEACCES;
886     return FALSE;
887     }
        /* nothing to do if both the index cache and the out-cache are empty */
888     if(cbmaprnum(db->idxcc) < 1 && cbmaprnum(db->outcc) < 1) return TRUE;
889     err = FALSE;
        /* copy the keys of the index cache into a list so that the cache can be modified below */
890     keys = cblistopen();
891     cbmapiterinit(db->idxcc);
892     while((kbuf = cbmapiternext(db->idxcc, &ksiz)) != NULL){
893     cblistpush(keys, kbuf, ksiz);
894     }
        /* `rnum' keeps the number of cached words before the list is possibly cut down */
895     rnum = CB_LISTNUM(keys);
896     cblistsort(keys);
        /* with a positive `max', drop keys from the end of the sorted list until `max' remain */
897     if(max > 0){
898     while(CB_LISTNUM(keys) > max){
899     free(cblistpop(keys, NULL));
900     }
901     }
        /* store the cached value of each remaining key into the inverted index, then remove the
           key from the cache and reduce the recorded cache size; stop at the first failure */
902     for(i = 0; i < CB_LISTNUM(keys); i++){
903     kbuf = CB_LISTVAL2(keys, i, &ksiz);
904     vbuf = cbmapget(db->idxcc, kbuf, ksiz, &vsiz);
905     if(!est_idx_add(db->idxdb, kbuf, ksiz, vbuf, vsiz)){
906     err = TRUE;
907     break;
908     }
909     cbmapout(db->idxcc, kbuf, ksiz);
910     db->icsiz -= vsiz;
911     if(i % ESTCCCBFREQ == 0) est_db_inform(db, "flushing index words");
912     }
        /* register the same keys with empty values in the database for forward matching;
           a failure reported as DP_EKEEP is not counted as an error */
913     for(i = 0; i < CB_LISTNUM(keys); i++){
914     kbuf = CB_LISTVAL2(keys, i, &ksiz);
915     if(!vlput(db->fwmdb, kbuf, ksiz, "", 0, VL_DKEEP) && dpecode != DP_EKEEP){
916     err = TRUE;
917     break;
918     }
919     if(i % ESTCCCBFREQ == 0) est_db_inform(db, "flushing fwm keys");
920     }
921     cblistclose(keys);
        /* if the index cache became empty, reopen it; the bucket number is 1.5 times the former
           number of words when that exceeded ESTIDXCCBNUM */
922     if(cbmaprnum(db->idxcc) < 1){
923     cbmapclose(db->idxcc);
924     db->idxcc = cbmapopenex(rnum > ESTIDXCCBNUM ? rnum * 1.5 : ESTIDXCCBNUM);
925     }
        /* cleaning of removed documents, performed only when `max' is negative */
926     if(max < 0 && cbmaprnum(db->outcc) > 0){
927     ids = cbmapopen();
928     keys = cblistopen();
        /* out-cache keys beginning with a tab hold a decimal document ID and go into `ids';
           all other keys are treated as words and go into `keys' */
929     cbmapiterinit(db->outcc);
930     while((kbuf = cbmapiternext(db->outcc, &ksiz)) != NULL){
931     if(*kbuf == '\t'){
932     id = atoi(kbuf + 1);
933     cbmapput(ids, (char *)&id, sizeof(int), "", 0, FALSE);
934     } else {
935     cblistpush(keys, kbuf, ksiz);
936     }
937     }
938     cblistsort(keys);
939     for(i = 0; i < CB_LISTNUM(keys); i++){
940     if(i % (ESTIDXDBLRM * 4) == 0) est_idx_set_current(db->idxdb);
941     kbuf = CB_LISTVAL2(keys, i, &ksiz);
942     if((tbuf = est_idx_get(db->idxdb, kbuf, ksiz, &tsiz)) != NULL){
        /* rebuild the value of the word in `nval', keeping only records whose leading
           sizeof(int) bytes are not registered in `ids' */
943     nval = cbdatumopen("", 0);
944     rp = tbuf;
945     while(rp < tbuf + tsiz){
        /* one record is walked as 5 leading bytes, then 2-byte units up to a zero byte, then the
           zero byte itself; this matches the records built in est_db_put_doc when sizeof(int)
           is 4 -- NOTE(review): confirm that assumption */
946     pv = rp;
947     rp += 5;
948     while(*rp != 0x0){
949     rp += 2;
950     }
951     rp++;
952     if(!cbmapget(ids, pv, sizeof(int), NULL)) cbdatumcat(nval, pv, rp - pv);
953     }
        /* replace the stored value by the rebuilt one; if nothing is left, the word is also
           removed from the database for forward matching */
954     if(!est_idx_out(db->idxdb, kbuf, ksiz) && dpecode != DP_ENOITEM) err = TRUE;
955     if(CB_DATUMSIZE(nval) > 0){
956     if(!est_idx_add(db->idxdb, kbuf, ksiz, CB_DATUMPTR(nval), CB_DATUMSIZE(nval)))
957     err = TRUE;
958     } else {
959     if(!vlout(db->fwmdb, kbuf, ksiz) && dpecode != DP_ENOITEM) err = TRUE;
960     }
961     cbdatumclose(nval);
962     free(tbuf);
963     }
964     cbmapout(db->outcc, kbuf, ksiz);
965     if(i % ESTCCCBFREQ == 0) est_db_inform(db, "cleaning dispensable keys");
966     }
        /* NOTE(review): the value assigned to `rnum' here is not read again in this function */
967     rnum = cbmaprnum(ids);
968     cblistclose(keys);
969     cbmapclose(ids);
        /* start over with an empty out-cache */
970     cbmapclose(db->outcc);
971     db->outcc = cbmapopenex(ESTOUTCCBNUM);
972     }
        /* the cache of keys is reset on every flush that reaches this point */
973     cbmapclose(db->keycc);
974     db->keycc = cbmapopenex(ESTKEYCCMNUM + 1);
975     db->kcmnum = ESTKEYCCMNUM;
976     if(err){
977     db->ecode = ESTEDB;
978     db->fatal = TRUE;
979     return FALSE;
980     }
981     return TRUE;
982     }
983    
984    
985     /* Synchronize updating contents of a database. */
986     int est_db_sync(ESTDB *db){
987     int err;
988     assert(db);
989     if(!dpwritable(db->metadb)){
990     db->ecode = ESTEACCES;
991     return FALSE;
992     }
993     err = FALSE;
994     if(!est_db_flush(db, -1) || !est_db_write_meta(db)) err = TRUE;
995     est_db_inform(db, "synchronizing the database for meta information");
996     if(!dpsync(db->metadb)) err = TRUE;
997     est_db_inform(db, "synchronizing the inverted index");
998     if(!est_idx_sync(db->idxdb)) err = TRUE;
999     est_db_inform(db, "synchronizing the database for forward matching");
1000     if(!vlsync(db->fwmdb)) err = TRUE;
1001     est_db_inform(db, "synchronizing the database for attrutes");
1002     if(!crsync(db->attrdb)) err = TRUE;
1003     est_db_inform(db, "synchronizing the database for texts");
1004     if(!crsync(db->textdb)) err = TRUE;
1005     est_db_inform(db, "synchronizing the database for document list");
1006     if(!vlsync(db->listdb)) err = TRUE;
1007     if(err){
1008     db->ecode = ESTEDB;
1009     db->fatal = TRUE;
1010     }
1011     return err ? FALSE : TRUE;
1012     }
1013    
1014    
1015     /* Optimize a database. */
1016     int est_db_optimize(ESTDB *db, int options){
1017     CBMAP *dmap;
1018     CBLIST *words;
1019     CBDATUM *nval;
1020     const char *word, *rp, *pv;
1021     char *kbuf, *vbuf;
1022     int i, err, id, ksiz, vsiz, wsiz;
1023     assert(db);
1024     if(!dpwritable(db->metadb)){
1025     db->ecode = ESTEACCES;
1026     return FALSE;
1027     }
1028     if(!est_db_flush(db, -1)) return FALSE;
1029     err = FALSE;
1030     if(!(options & ESTOPTNOPURGE)){
1031     dmap = cbmapopenex(vlrnum(db->listdb) + 1);
1032     vlcurfirst(db->listdb);
1033     while((vbuf = vlcurval(db->listdb, NULL)) != NULL){
1034     id = atoi(vbuf);
1035     cbmapput(dmap, (char *)&id, sizeof(int), "", 0, FALSE);
1036     free(vbuf);
1037     vlcurnext(db->listdb);
1038     }
1039     words = cblistopen();
1040     vlcurfirst(db->fwmdb);
1041     while((kbuf = vlcurkey(db->fwmdb, &ksiz)) != NULL){
1042     cblistpushbuf(words, kbuf, ksiz);
1043     vlcurnext(db->fwmdb);
1044     }
1045     for(i = 0; i < CB_LISTNUM(words); i++){
1046     if(i % (ESTIDXDBLRM * 4) == 0) est_idx_set_current(db->idxdb);
1047     word = CB_LISTVAL2(words, i, &wsiz);
1048     if((vbuf = est_idx_get(db->idxdb, word, wsiz, &vsiz)) != NULL){
1049     nval = cbdatumopen("", 0);
1050     rp = vbuf;
1051     while(rp < vbuf + vsiz){
1052     pv = rp;
1053     rp += 5;
1054     while(*rp != 0x0){
1055     rp += 2;
1056     }
1057     rp++;
1058     if(cbmapget(dmap, pv, sizeof(int), NULL)) cbdatumcat(nval, pv, rp - pv);
1059     }
1060     if(!est_idx_out(db->idxdb, word, wsiz)) err = TRUE;
1061     if(CB_DATUMSIZE(nval) > 0){
1062     if(!est_idx_add(db->idxdb, word, wsiz, CB_DATUMPTR(nval), CB_DATUMSIZE(nval)))
1063     err = TRUE;
1064     } else {
1065     if(!vlout(db->fwmdb, word, wsiz)) err = TRUE;
1066     }
1067     cbdatumclose(nval);
1068     free(vbuf);
1069     } else {
1070     err = TRUE;
1071     }
1072     free(kbuf);
1073     if(i % ESTCCCBFREQ == 0) est_db_inform(db, "cleaning dispensable keys");
1074     }
1075     cblistclose(words);
1076     cbmapclose(dmap);
1077     }
1078     if(!(options & ESTOPTNODBOPT)){
1079     est_db_inform(db, "optimizing the inverted index");
1080     if(!est_idx_optimize(db->idxdb)) err = TRUE;
1081     est_db_inform(db, "optimizing the database for forward matching");
1082     if(!vloptimize(db->fwmdb)) err = TRUE;
1083     est_db_inform(db, "optimizing the database for attrutes");
1084     if(!croptimize(db->attrdb, -1)) err = TRUE;
1085     est_db_inform(db, "optimizing the database for texts");
1086     if(!croptimize(db->textdb, -1)) err = TRUE;
1087     est_db_inform(db, "optimizing the database for document list");
1088     if(!vloptimize(db->listdb)) err = TRUE;
1089     }
1090     if(err){
1091     db->ecode = ESTEDB;
1092     db->fatal = TRUE;
1093     }
1094     return err ? FALSE : TRUE;
1095     }
1096    
1097    
1098     /* Add a document to a database.
         `db' specifies a database object.  Its meta database must be writable, otherwise the
         error code is set to `ESTEACCES' and false is returned.
         `doc' specifies a document object.  It must have the attribute named by `ESTDATTRURI',
         otherwise the error code is set to `ESTEINVAL' and false is returned.  A new ID is
         assigned to the object and stored as its attribute named by `ESTDATTRID'.
         `options' specifies options: with `ESTPDCLEAN', an existing document of the same URI is
         removed with `ESTODCLEAN'.
         The return value is true if success, else it is false. */
1099     int est_db_put_doc(ESTDB *db, ESTDOC *doc, int options){
1100     CBMAP *ocmap, *fmap, *qmap;
1101     CBLIST *words;
1102     CBDATUM *ocbuf;
1103     const char *uri, *text, *word, *fnext, *snext, *kbuf, *vbuf;
1104     unsigned char junc[2], c;
1105     char wbuf[ESTWORDMAXLEN+3], *sbuf, *zbuf, nbuf[ESTNUMBUFSIZ];
1106     int i, j, id, err, wnum, wsiz, fnsiz, snsiz, *np, num, ksiz, vsiz, ssiz, zsiz;
1107     double tune;
1108     assert(db && doc);
1109     if(!dpwritable(db->metadb)){
1110     db->ecode = ESTEACCES;
1111     return FALSE;
1112     }
1113     if(!doc->attrs || !(uri = cbmapget(doc->attrs, ESTDATTRURI, -1, NULL))){
1114     db->ecode = ESTEINVAL;
1115     return FALSE;
1116     }
         /* if a document of the same URI is registered already, remove it first */
1117     if((id = est_db_uri_to_id(db, uri)) > 0 &&
1118     !est_db_out_doc(db, id, (options & ESTPDCLEAN) ? ESTODCLEAN : 0)) return FALSE;
1119     if(!doc->dtexts) doc->dtexts = cblistopen();
         /* assign the next sequence number as the ID and record it as an attribute */
1120     doc->id = ++(db->dseq);
1121     sprintf(nbuf, "%d", doc->id);
1122     cbmapput(doc->attrs, ESTDATTRID, -1, nbuf, -1, TRUE);
         /* `ocmap' gathers the 2-byte juncture data per word, `fmap' the occurrence points per
            word, and `qmap' the combinations of word and juncture data seen so far */
1123     ocmap = cbmapopen();
1124     fmap = cbmapopen();
1125     qmap = cbmapopen();
1126     wnum = 0;
         /* the index -1 stands for the attribute whose name is the empty string, if there is
            one; the other indexes walk the text list */
1127     for(i = -1; i < CB_LISTNUM(doc->dtexts); i++){
1128     if(i < 0){
1129     if(!(text = cbmapget(doc->attrs, "", 0, NULL))) continue;
1130     } else {
1131     text = CB_LISTVAL(doc->dtexts, i, NULL);
1132     }
1133     words = cblistopen();
1134     switch(db->amode){
1135     case ESTAMPERFNG:
1136     est_break_text_perfng(text, words, FALSE, TRUE);
1137     break;
1138     default:
1139     est_break_text(text, words, FALSE, TRUE);
1140     break;
1141     }
1142     wnum += CB_LISTNUM(words);
1143     for(j = 0; j < CB_LISTNUM(words); j++){
1144     word = CB_LISTVAL2(words, j, &wsiz);
         /* longer words are skipped, so `wbuf' can always hold the word plus three bytes */
1145     if(wsiz > ESTWORDMAXLEN) continue;
         /* the juncture data are hashes of the next word and of the word after it, each in the
            range from 1 to ESTJHASHNUM, or 0xff if there is no such word */
1146     fnext = cblistval(words, j + 1, &fnsiz);
1147     snext = cblistval(words, j + 2, &snsiz);
1148     junc[0] = fnext ? dpinnerhash(fnext, fnsiz) % ESTJHASHNUM + 1: 0xff;
1149     junc[1] = snext ? dpouterhash(snext, snsiz) % ESTJHASHNUM + 1: 0xff;
1150     memcpy(wbuf, word, wsiz);
1151     memcpy(wbuf + wsiz, "\t", 1);
1152     memcpy(wbuf + wsiz + 1, junc, 2);
         /* add ESTOCPOINT to the points of the word for every occurrence */
1153     np = (int *)cbmapget(fmap, word, wsiz, NULL);
1154     num = np ? *(int *)np : 0;
1155     num += ESTOCPOINT;
1156     cbmapput(fmap, word, wsiz, (char *)&num, sizeof(int), TRUE);
         /* the juncture data are appended only if the put into `qmap' reports success --
            presumably that is when the combination was not stored before; verify */
1157     if(cbmapput(qmap, wbuf, wsiz + 3, "", 0, FALSE))
1158     cbmapputcat(ocmap, word, wsiz, (char *)junc, 2);
1159     }
1160     cblistclose(words);
1161     }
         /* build one index record per word: the document ID, one score byte, the gathered
            juncture data and a terminating zero byte; it is appended to the index cache */
1162     cbmapiterinit(ocmap);
1163     while((kbuf = cbmapiternext(ocmap, &ksiz)) != NULL){
1164     vbuf = cbmapget(ocmap, kbuf, ksiz, &vsiz);
1165     ocbuf = cbdatumopen("", 0);
1166     cbdatumcat(ocbuf, (char *)&(doc->id), sizeof(int));
1167     num = *(int *)cbmapget(fmap, kbuf, ksiz, NULL);
         /* the points are divided by a factor growing with the total number of words, but not
            less than 4.0, then compressed above 0x80 and 0xc0 and capped at 0xff */
1168     tune = log(wnum + 3);
1169     tune = (tune * tune) / 10.0;
1170     num /= tune > 4.0 ? tune : 4.0;
1171     if(num >= 0x80) num += (0x80 - num) * 0.75;
1172     if(num >= 0xc0) num += (0xc0 - num) * 0.75;
1173     c = num < 0xff ? num : 0xff;
1174     cbdatumcat(ocbuf, (char *)&c, 1);
1175     cbdatumcat(ocbuf, vbuf, vsiz);
1176     c = 0x00;
1177     cbdatumcat(ocbuf, (char *)&c, 1);
1178     cbmapputcat(db->idxcc, kbuf, ksiz, CB_DATUMPTR(ocbuf), CB_DATUMSIZE(ocbuf));
1179     db->icsiz += CB_DATUMSIZE(ocbuf);
1180     cbdatumclose(ocbuf);
1181     }
1182     cbmapclose(qmap);
1183     cbmapclose(fmap);
1184     cbmapclose(ocmap);
1185     err = FALSE;
         /* store the serialized attributes with the ID as the key */
1186     sbuf = cbmapdump(doc->attrs, &ssiz);
1187     if(!crput(db->attrdb, (char *)&(doc->id), sizeof(int), sbuf, ssiz, CR_DKEEP)){
1188     db->ecode = ESTEDB;
1189     db->fatal = TRUE;
1190     err = TRUE;
1191     }
1192     free(sbuf);
         /* store the serialized texts in deflated form; if deflation fails, an empty value is
            stored instead and the failure is recorded as fatal */
1193     sbuf = cblistdump(doc->dtexts, &ssiz);
1194     if(!(zbuf = est_deflate(sbuf, ssiz, &zsiz))){
1195     CB_MALLOC(zbuf, 1);
1196     zsiz = 0;
1197     db->ecode = ESTEMISC;
1198     db->fatal = TRUE;
1199     err = TRUE;
1200     }
1201     if(!crput(db->textdb, (char *)&(doc->id), sizeof(int), zbuf, zsiz, CR_DKEEP)){
1202     db->ecode = ESTEDB;
1203     db->fatal = TRUE;
1204     err = TRUE;
1205     }
1206     free(sbuf);
1207     free(zbuf);
         /* register the URI with the decimal ID as the value in the document list */
1208     sprintf(nbuf, "%d", doc->id);
1209     if(!vlput(db->listdb, uri, -1, nbuf, -1, VL_DKEEP)){
1210     db->ecode = ESTEDB;
1211     db->fatal = TRUE;
1212     err = TRUE;
1213     }
         /* NOTE(review): the number of documents is incremented even if an error was recorded */
1214     db->dnum++;
         /* flush everything when the used cache size exceeds the limit */
1215     if(est_db_used_cache_size(db) > db->icmax){
1216     if(!est_db_flush(db, -1)) err = TRUE;
1217     est_idx_increment(db->idxdb);
1218     }
1219     return err ? FALSE : TRUE;
1220     }
1221    
1222    
1223     /* Remove a document from a database. */
1224     int est_db_out_doc(ESTDB *db, int id, int options){
1225     ESTDOC *doc;
1226     CBLIST *words;
1227     const char *uri, *text, *word;
1228     char numbuf[ESTNUMBUFSIZ];
1229     int i, j, len, wsiz;
1230     assert(db && id > 0);
1231     if(!dpwritable(db->metadb)){
1232     db->ecode = ESTEACCES;
1233     return FALSE;
1234     }
1235     if(!(doc = est_db_get_doc(db, id, 0))) return FALSE;
1236     if(!doc->attrs || !(uri = cbmapget(doc->attrs, ESTDATTRURI, -1, NULL))){
1237     est_doc_delete(doc);
1238     db->ecode = ESTEDB;
1239     db->fatal = TRUE;
1240     return FALSE;
1241     }
1242     if(!crout(db->attrdb, (char *)&id, sizeof(int)) ||
1243     !crout(db->textdb, (char *)&id, sizeof(int)) || !vlout(db->listdb, uri, -1)){
1244     est_doc_delete(doc);
1245     db->ecode = ESTEDB;
1246     db->fatal = TRUE;
1247     return FALSE;
1248     }
1249     cbmapout(db->attrcc, (char *)&id, sizeof(int));
1250     cbmapout(db->textcc, (char *)&id, sizeof(int));
1251     if(db->spacc) cbmapout(db->spacc, (char *)&id, sizeof(int));
1252     if((options & ESTODCLEAN) && doc->dtexts){
1253     len = sprintf(numbuf, "\t%d", doc->id);
1254     cbmapput(db->outcc, numbuf, len, "", 0, FALSE);
1255     for(i = -1; i < CB_LISTNUM(doc->dtexts); i++){
1256     if(i < 0){
1257     if(!(text = cbmapget(doc->attrs, "", 0, NULL))) continue;
1258     } else {
1259     text = CB_LISTVAL(doc->dtexts, i, NULL);
1260     }
1261     words = cblistopen();
1262     switch(db->amode){
1263     case ESTAMPERFNG:
1264     est_break_text_perfng(text, words, FALSE, TRUE);
1265     break;
1266     default:
1267     est_break_text(text, words, FALSE, TRUE);
1268     break;
1269     }
1270     for(j = 0; j < CB_LISTNUM(words); j++){
1271     word = CB_LISTVAL2(words, j, &wsiz);
1272     cbmapput(db->outcc, word, wsiz, "", 0, FALSE);
1273     }
1274     cblistclose(words);
1275     }
1276     }
1277     est_doc_delete(doc);
1278     db->dnum--;
1279     return TRUE;
1280     }
1281    
1282    
1283     /* Retrieve a document in a database. */
1284     ESTDOC *est_db_get_doc(ESTDB *db, int id, int options){
1285     ESTDOC *doc;
1286     const char *cbuf;
1287     char *vbuf, *zbuf;
1288     int i, csiz, vsiz, zsiz, num;
1289     assert(db && id > 0);
1290     cbuf = NULL;
1291     if(options & ESTGDNOATTR){
1292     if(!crvsiz(db->attrdb, (char *)&id, sizeof(int))){
1293     if(dpecode == DP_ENOITEM){
1294     db->ecode = ESTENOITEM;
1295     return NULL;
1296     } else {
1297     db->ecode = ESTEDB;
1298     db->fatal = TRUE;
1299     return NULL;
1300     }
1301     }
1302     vbuf = NULL;
1303     } else if((cbuf = cbmapget(db->attrcc, (char *)&id, sizeof(int), &csiz)) != NULL){
1304     cbmapmove(db->attrcc, (char *)&id, sizeof(int), FALSE);
1305     vbuf = NULL;
1306     } else if(!(vbuf = crget(db->attrdb, (char *)&id, sizeof(int), 0, -1, &vsiz))){
1307     if(dpecode == DP_ENOITEM){
1308     db->ecode = ESTENOITEM;
1309     return NULL;
1310     } else {
1311     db->ecode = ESTEDB;
1312     db->fatal = TRUE;
1313     return NULL;
1314     }
1315     }
1316     doc = est_doc_new();
1317     doc->id = id;
1318     if(cbuf){
1319     doc->attrs = cbmapload(cbuf, csiz);
1320     } else if(vbuf){
1321     doc->attrs = cbmapload(vbuf, vsiz);
1322     cbmapputvbuf(db->attrcc, (char *)&id, sizeof(int), vbuf, vsiz);
1323     if(cbmaprnum(db->attrcc) > db->acmnum){
1324     num = cbmaprnum(db->attrcc) * 0.1 + 1;
1325     cbmapiterinit(db->attrcc);
1326     for(i = 0; i < num && (cbuf = cbmapiternext(db->attrcc, NULL)) != NULL; i++){
1327     cbmapout(db->attrcc, cbuf, sizeof(int));
1328     }
1329     }
1330     } else {
1331     doc->attrs = NULL;
1332     }
1333     if(!(options & ESTGDNOTEXT)){
1334     if((cbuf = cbmapget(db->textcc, (char *)&id, sizeof(int), &csiz)) != NULL){
1335     cbmapmove(db->textcc, (char *)&id, sizeof(int), FALSE);
1336     doc->dtexts = cblistload(cbuf, csiz);
1337     } else {
1338     if(!(zbuf = crget(db->textdb, (char *)&id, sizeof(int), 0, -1, &zsiz))){
1339     db->ecode = ESTEDB;
1340     db->fatal = TRUE;
1341     est_doc_delete(doc);
1342     return NULL;
1343     }
1344     if(!(vbuf = est_inflate(zbuf, zsiz, &vsiz))){
1345     db->ecode = ESTEDB;
1346     db->fatal = TRUE;
1347     free(zbuf);
1348     est_doc_delete(doc);
1349     return NULL;
1350     }
1351     doc->dtexts = cblistload(vbuf, vsiz);
1352     cbmapputvbuf(db->textcc, (char *)&id, sizeof(int), vbuf, vsiz);
1353     if(cbmaprnum(db->textcc) > db->tcmnum){
1354     num = cbmaprnum(db->textcc) * 0.1 + 1;
1355     cbmapiterinit(db->textcc);
1356     for(i = 0; i < num &&(cbuf = cbmapiternext(db->textcc, NULL)) != NULL; i++){
1357     cbmapout(db->textcc, cbuf, sizeof(int));
1358     }
1359     }
1360     free(zbuf);
1361     }
1362     }
1363     return doc;
1364     }
1365    
1366    
1367     /* Retrieve the value of an attribute of a document in a database. */
1368     char *est_db_get_doc_attr(ESTDB *db, int id, const char *name){
1369     const char *cbuf;
1370     char *mbuf, *vbuf;
1371     int cb, csiz, msiz, vsiz;
1372     assert(db && id > 0 && name);
1373     cb = db->spacc && !strcmp(name, db->scname);
1374     if(cb && (cbuf = cbmapget(db->spacc, (char *)&id, sizeof(int), &csiz)) != NULL){
1375     cbmapmove(db->spacc, (char *)&id, sizeof(int), FALSE);
1376     return cbmemdup(cbuf, csiz);
1377     }
1378     if(!(mbuf = crget(db->attrdb, (char *)&id, sizeof(int), 0, -1, &msiz))){
1379     db->ecode = dpecode == DP_ENOITEM ? ESTENOITEM : ESTEDB;
1380     return NULL;
1381     }
1382     if(!(vbuf = cbmaploadone(mbuf, msiz, name, -1, &vsiz))){
1383     db->ecode = ESTENOITEM;
1384     free(mbuf);
1385     return NULL;
1386     }
1387     if(cb) cbmapput(db->spacc, (char *)&id, sizeof(int), vbuf, vsiz, FALSE);
1388     free(mbuf);
1389     return vbuf;
1390     }
1391    
1392    
1393     /* Get the ID of a document spacified by URI. */
1394     int est_db_uri_to_id(ESTDB *db, const char *uri){
1395     char *vbuf;
1396     int id;
1397     assert(db && uri);
1398     if(!(vbuf = vlget(db->listdb, uri, -1, NULL))){
1399     db->ecode = ESTENOITEM;
1400     return -1;
1401     }
1402     id = atoi(vbuf);
1403     free(vbuf);
1404     return id;
1405     }
1406    
1407    
1408     /* Extract keywords of a document object. */
1409     CBMAP *est_db_etch_doc(ESTDB *db, ESTDOC *doc, int max){
1410     ESTKEYSC *scores;
1411     CBMAP *keys, *umap;
1412     CBLIST *words;
1413     const char *text, *word, *vbuf;
1414     char numbuf[ESTNUMBUFSIZ];
1415     int i, wsiz, num, smax, snum, vsiz;
1416     assert(doc && max >= 0);
1417     if(!doc->dtexts) return cbmapopenex(1);
1418     keys = cbmapopenex(max * 1.5);
1419     words = cblistopen();
1420     for(i = -1; i < CB_LISTNUM(doc->dtexts); i++){
1421     if(i < 0){
1422     if(!doc->attrs || !(text = cbmapget(doc->attrs, "", 0, NULL))) continue;
1423     } else {
1424     text = CB_LISTVAL(doc->dtexts, i, NULL);
1425     }
1426     if(db){
1427     switch(db->amode){
1428     case ESTAMPERFNG:
1429     est_break_text_perfng(text, words, FALSE, TRUE);
1430     break;
1431     default:
1432     est_break_text(text, words, FALSE, TRUE);
1433     break;
1434     }
1435     } else {
1436     est_break_text(text, words, FALSE, TRUE);
1437     }
1438     }
1439     umap = cbmapopenex(CB_LISTNUM(words) + 1);
1440     for(i = 0; i < CB_LISTNUM(words); i++){
1441     word = CB_LISTVAL2(words, i, &wsiz);
1442     if(wsiz > ESTWORDMAXLEN) continue;
1443     num = (vbuf = cbmapget(umap, word, wsiz, NULL)) ? *(int *)vbuf + 1 : 1;
1444     cbmapput(umap, word, wsiz, (char *)&num, sizeof(int), TRUE);
1445     }
1446     smax = max * (db ? ESTKEYSCALW : 1) + 1;
1447     CB_MALLOC(scores, cbmaprnum(umap) * sizeof(ESTKEYSC) + 1);
1448     snum = 0;
1449     cbmapiterinit(umap);
1450     for(i = 0; i < smax && (word = cbmapiternext(umap, &wsiz)) != NULL; i++){
1451     scores[snum].word = word;
1452     scores[snum].wsiz = wsiz;
1453     scores[snum].pt = (vbuf = cbmapget(umap, word, wsiz, NULL)) ? *(int *)vbuf : 0;
1454     snum++;
1455     }
1456     qsort(scores, snum, sizeof(ESTKEYSC), est_keysc_compare);
1457     if(db){
1458     for(i = 0; i < snum; i++){
1459     if((vbuf = cbmapget(db->keycc, scores[i].word, scores[i].wsiz, NULL)) != NULL){
1460     cbmapmove(db->keycc, scores[i].word, scores[i].wsiz, FALSE);
1461     vsiz = *(int*)vbuf;
1462     } else {
1463     vsiz = est_idx_vsiz(db->idxdb, scores[i].word, scores[i].wsiz);
1464     cbmapput(db->keycc, scores[i].word, scores[i].wsiz, (char *)&vsiz, sizeof(int), FALSE);
1465     }
1466     scores[i].pt *= 400000.0 / (vsiz + 64);
1467     }
1468     if(db->kcmnum >= 0 && cbmaprnum(db->keycc) > db->kcmnum){
1469     num = db->kcmnum * 0.1 + 1;
1470     cbmapiterinit(db->keycc);
1471     for(i = 0; i < num && (word = cbmapiternext(db->keycc, &wsiz)) != NULL; i++){
1472     cbmapout(db->keycc, word, wsiz);
1473     }
1474     }
1475     qsort(scores, snum, sizeof(ESTKEYSC), est_keysc_compare);
1476     }
1477     for(i = 0; i < snum && i < max; i++){
1478     vsiz = sprintf(numbuf, "%d", scores[i].pt);
1479     cbmapput(keys, scores[i].word, scores[i].wsiz, numbuf, vsiz, FALSE);
1480     }
1481     free(scores);
1482     cbmapclose(umap);
1483     cblistclose(words);
1484     return keys;
1485     }
1486    
1487    
1488     /* Initialize the iterator of a database. */
1489     int est_db_iter_init(ESTDB *db){
1490     assert(db);
1491     return vlcurfirst(db->listdb);
1492     }
1493    
1494    
1495     /* Get the next ID of the iterator of a database. */
1496     int est_db_iter_next(ESTDB *db){
1497     char *vbuf;
1498     int id;
1499     assert(db);
1500     if(!(vbuf = vlcurval(db->listdb, NULL))){
1501     if(dpecode == DP_ENOITEM){
1502     db->ecode = ESTENOITEM;
1503     return 0;
1504     } else {
1505     db->ecode = ESTEDB;
1506     db->fatal = TRUE;
1507     return -1;
1508     }
1509     }
1510     id = atoi(vbuf);
1511     free(vbuf);
1512     vlcurnext(db->listdb);
1513     return id;
1514     }
1515    
1516    
1517     /* Get the name of a database. */
1518     const char *est_db_name(ESTDB *db){
1519     assert(db);
1520     return db->name;
1521     }
1522    
1523    
1524     /* Get the number of documents in a database. */
1525     int est_db_doc_num(ESTDB *db){
1526     assert(db);
1527     return db->dnum;
1528     }
1529    
1530    
1531     /* Get the number of words in a database. */
1532     int est_db_word_num(ESTDB *db){
1533     assert(db);
1534     return vlrnum(db->fwmdb);
1535     }
1536    
1537    
1538     /* Get the size of a database. */
1539     double est_db_size(ESTDB *db){
1540     assert(db);
1541     return dpfsiz(db->metadb) + est_idx_size(db->idxdb) + vlfsiz(db->fwmdb) +
1542     crfsizd(db->attrdb) + crfsizd(db->textdb) + vlfsiz(db->listdb);
1543     }
1544    
1545    
1546     /* Search documents corresponding a condition for a database.
         `db' specifies a database object.
         `cond' specifies a condition object.  Its members `phrase', `simple', `gstep', `tfidf',
         `attrs', `order', `max' and `scfb' are read here.  If `scfb' is true, the scores of the
         result are stored into `cond->scores' and their number into `cond->snum'.
         `nump' specifies the pointer to a variable to which the number of elements of the
         return value is assigned.
         `hints' specifies a map object into which hints are stored.  If it is `NULL', it is not
         used.  The number of hits before applying `max' is stored under the empty key.
         The return value is an allocated array of the IDs of the corresponding documents.  If
         the result is empty, the error code is set to `ESTENOITEM'. */
1547     int *est_db_search(ESTDB *db, ESTCOND *cond, int *nump, CBMAP *hints){
1548     ESTSCORE *scores, *tscores;
1549     CBMAP *svmap;
1550     CBLIST *terms;
1551     const char *term, *rp;
1552     char *tmp, numbuf[ESTNUMBUFSIZ];
1553     int i, j, snum, pcnum, ncnum, tsnum, add, nnum, id, score, hnum, len, *rval;
1554     double tune;
1555     assert(db && cond && nump);
1556     scores = NULL;
1557     snum = 0;
         /* a phrase beginning with ESTOPSIMILAR requests similarity search: the rest of the
            phrase, after skipping leading control characters and spaces, is parsed into a vector */
1558     if(cond->phrase && cond->phrase[0] == ESTOPSIMILAR[0] &&
1559     cbstrfwmatch(cond->phrase, ESTOPSIMILAR)){
1560     rp = cond->phrase;
1561     rp += strlen(ESTOPSIMILAR);
1562     while(*rp > '\0' && *rp <= ' '){
1563     rp++;
1564     }
1565     svmap = est_phrase_vector(rp);
1566     scores = est_search_similar(db, svmap, &snum, ESTSMLRKNUM, ESTSMLRUNUM, cond->tfidf,
1567     cond->order ? ESTSMLRNMIN : 0.0);
1568     cbmapclose(svmap);
1569     } else if(cond->phrase){
         /* an ordinary phrase is split into terms; a phrase of the simple form is converted by
            est_phrase_from_thumb first */
1570     if(cond->simple){
1571     tmp = est_phrase_from_thumb(cond->phrase);
1572     terms = est_phrase_terms(tmp);
1573     free(tmp);
1574     } else {
1575     terms = est_phrase_terms(cond->phrase);
1576     }
         /* `pcnum' counts the terms searched while `add' is true, `ncnum' those searched while
            it is false */
1577     pcnum = 0;
1578     ncnum = 0;
1579     add = TRUE;
1580     for(i = 0; i < CB_LISTNUM(terms); i++){
1581     term = CB_LISTVAL(terms, i, NULL);
         /* ESTOPISECT and ESTOPDIFF only switch the mode for the terms that follow */
1582     if(!strcmp(term, ESTOPISECT)){
1583     add = TRUE;
1584     } else if(!strcmp(term, ESTOPDIFF)){
1585     add = FALSE;
1586     } else {
1587     if(!strcmp(term, ESTOPUVSET)){
1588     tscores = est_search_uvset(db, &tsnum, hints, add);
1589     } else {
1590     tscores = est_search_union(db, term, cond->gstep, &tsnum, hints, add);
1591     }
1592     if(add){
         /* with `tfidf', the scores of the term are scaled by 10000 over the cube of the
            logarithm of its number of hits plus 3, the divisor being at least 8.0 */
1593     if(cond->tfidf){
1594     tune = log(tsnum + 3);
1595     tune = tune * tune * tune;
1596     if(tune < 8.0) tune = 8.0;
1597     for(j = 0; j < tsnum; j++){
1598     tscores[j].score *= 10000 / tune;
1599     }
1600     }
1601     pcnum++;
1602     } else {
1603     ncnum++;
1604     }
         /* append the result of the term to the whole result; hits of a term searched while
            `add' is false are marked by the score -1.
            NOTE(review): if the very first searched term has `add' false, its scores are taken
            over without that mark -- confirm whether est_phrase_terms can produce this */
1605     if(scores){
1606     CB_REALLOC(scores, (snum + tsnum) * sizeof(ESTSCORE) + 1);
1607     for(j = 0; j < tsnum; j++){
1608     scores[snum+j].id = tscores[j].id;
1609     scores[snum+j].score = add ? tscores[j].score : -1;
1610     }
1611     snum += tsnum;
1612     free(tscores);
1613     } else {
1614     scores = tscores;
1615     snum = tsnum;
1616     }
1617     }
1618     }
1619     if(scores){
         /* with more than one term, sort by ID and merge each run of the same ID: the scores are
            summed, and the document is kept only if no entry of the run is negative and the
            number of entries is at least `pcnum' */
1620     if(pcnum > 1 || ncnum > 0){
1621     qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_id);
1622     nnum = 0;
1623     for(i = 0; i < snum; i++){
1624     id = scores[i].id;
1625     score = scores[i].score;
1626     hnum = score >= 0 ? 1 : 0;
1627     for(j = i + 1; j < snum && scores[j].id == id; j++){
1628     if(score >= 0 && scores[j].score >= 0){
1629     score += scores[j].score;
1630     hnum++;
1631     } else {
1632     score = -1;
1633     }
1634     }
1635     if(score >= 0 && hnum >= pcnum){
1636     scores[nnum].id = id;
1637     scores[nnum].score = score;
1638     nnum++;
1639     }
         /* continue after the run; the increment of the loop moves `i' to `j' */
1640     i = j - 1;
1641     }
1642     snum = nnum;
1643     }
1644     } else {
         /* no term was searched; a dummy region is allocated so that it can be released below */
1645     CB_MALLOC(scores, 1);
1646     snum = 0;
1647     }
1648     cblistclose(terms);
1649     } else if(cond->attrs){
         /* without a phrase but with attribute conditions, start from the result of
            est_search_uvset */
1650     scores = est_search_uvset(db, &snum, hints, TRUE);
1651     } else {
1652     CB_MALLOC(scores, 1);
1653     snum = 0;
1654     }
         /* drop documents whose ID is recorded in the out-cache, under a key made of a tab and
            the decimal ID */
1655     if(cbmaprnum(db->outcc) > 0){
1656     tsnum = 0;
1657     for(i = 0; i < snum; i++){
1658     len = sprintf(numbuf, "\t%d", scores[i].id);
1659     if(cbmapget(db->outcc, numbuf, len, NULL)) continue;
1660     scores[tsnum++] = scores[i];
1661     }
1662     snum = tsnum;
1663     }
         /* narrowing by attributes and ordering are both done by est_narrow_scores; without an
            order expression the result is sorted by est_score_compare_by_score */
1664     if(cond->attrs || cond->order)
1665     snum = est_narrow_scores(db, cond->attrs, cond->order, scores, snum);
1666     if(!cond->order) qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_score);
         /* the number of hits before applying `max' is stored under the empty key of the hints */
1667     if(hints){
1668     sprintf(numbuf, "%d", snum);
1669     cbmapput(hints, "", 0, numbuf, -1, FALSE);
1670     }
1671     if(cond->max >= 0 && cond->max < snum) snum = cond->max;
1672     CB_MALLOC(rval, snum * sizeof(int) + 1);
1673     for(i = 0; i < snum; i++){
1674     rval[i] = scores[i].id;
1675     }
         /* feed the scores back into the condition object if requested */
1676     if(cond->scfb){
1677     CB_REALLOC(cond->scores, snum * sizeof(int) + 1);
1678     for(i = 0; i < snum; i++){
1679     cond->scores[i] = scores[i].score;
1680     }
1681     cond->snum = snum;
1682     }
1683     *nump = snum;
1684     if(*nump < 1) db->ecode = ESTENOITEM;
1685     free(scores);
1686     return rval;
1687     }
1688    
1689    
1690     /* Set the maximum size of the cache memory of a database. */
1691     void est_db_set_cache_size(ESTDB *db, size_t size, int anum, int tnum){
1692     assert(db);
1693     if(dpwritable(db->metadb) && size > 0) db->icmax = size;
1694     if(anum > 0) db->acmnum = anum;
1695     if(tnum > 0) db->tcmnum = tnum;
1696     }
1697    
1698    
1699     /* Set the special cache for narrowing and sorting with document attributes. */
1700     void est_db_set_special_cache(ESTDB *db, const char *name, int num){
1701     assert(db && name && num >= 0);
1702     if(db->spacc){
1703     free(db->scname);
1704     cbmapclose(db->spacc);
1705     }
1706     db->spacc = cbmapopenex(num + 1);
1707     db->scmnum = num;
1708     db->scname = cbmemdup(name, -1);
1709     }
1710    
1711    
1712    
1713     /*************************************************************************************************
1714     * features for experts
1715     *************************************************************************************************/
1716    
1717    
1718     /* Handle to the file of random number generator.
         It is defined with external linkage and is `NULL' initially.
         NOTE(review): the code that opens and closes this stream is outside this section --
         confirm the ownership there before changing it. */
1719     FILE *est_random_ifp = NULL;
1720    
1721    
1722     /* Break a sentence of text and extract words. */
1723     void est_break_text(const char *text, CBLIST *list, int norm, int tail){
1724     CBLIST *words;
1725     const unsigned char *word, *next;
1726     unsigned char *utext;
1727     char *tmp;
1728     int i, j, k, size, cc, wsiz, nsiz, tsiz;
1729     assert(text);
1730     utext = (unsigned char *)est_uconv_in(text, strlen(text), &size);
1731     if(norm) est_normalize_text(utext, size, &size);
1732     est_canonicalize_text(utext, size, FALSE);
1733     words = cblistopen();
1734     for(i = 0; i < size; i += 2){
1735     cc = est_char_category(utext[i] * 0x100 + utext[i+1]);
1736     for(j = i + 2; j < size; j += 2){
1737     if(est_char_category(utext[j] * 0x100 + utext[j+1]) != cc) break;
1738     }
1739     switch(cc){
1740     case ESTDELIMCHR:
1741     case ESTWESTALPH:
1742     cblistpush(words, (char *)(utext + i), j - i);
1743     break;
1744     case ESTEASTALPH:
1745     for(k = i; k < j; k += 2){
1746     if(j - k >= 4){
1747     cblistpush(words, (char *)(utext + k), 4);
1748     } else {
1749     cblistpush(words, (char *)(utext + k), 2);
1750     }
1751     }
1752     break;
1753     default:
1754     break;
1755     }
1756     i = j - 2;
1757     }
1758     for(i = 0; i < CB_LISTNUM(words); i++){
1759     word = (unsigned char *)CB_LISTVAL2(words, i, &wsiz);
1760     if(est_char_category(word[0] * 0x100 + word[1]) == ESTEASTALPH && wsiz == 2 &&
1761     i < CB_LISTNUM(words) - 1){
1762     next = (unsigned char *)cblistval(words, i + 1, &nsiz);
1763     if(nsiz > 4) nsiz = 4;
1764     if(est_char_category(next[0] * 0x100 + next[1]) == ESTEASTALPH && nsiz > 2) nsiz = 2;
1765     CB_MALLOC(tmp, wsiz + nsiz + 1);
1766     memcpy(tmp, word, wsiz);
1767     memcpy(tmp + wsiz, next, nsiz);
1768     cblistover(words, i, tmp, wsiz + nsiz);
1769     free(tmp);
1770     }
1771     }
1772     for(i = 0; i < CB_LISTNUM(words); i++){
1773     word = (unsigned char *)CB_LISTVAL2(words, i, &wsiz);
1774     if(!tail && wsiz == 2 && i == CB_LISTNUM(words) - 1){
1775     if(est_char_category(word[0] * 0x100 + word[1]) == ESTEASTALPH) continue;
1776     }
1777     tmp = est_uconv_out((char *)word, wsiz, &tsiz);
1778     cblistpushbuf(list, tmp, tsiz);
1779     }
1780     cblistclose(words);
1781     free(utext);
1782     }
1783    
1784    
1785     /* Break a sentence of text and extract words using perfect N-gram analyzer. */
1786     void est_break_text_perfng(const char *text, CBLIST *list, int norm, int tail){
1787     CBLIST *words;
1788     const unsigned char *word, *next;
1789     unsigned char *utext;
1790     char *tmp;
1791     int i, j, k, size, cc, wsiz, nsiz, tsiz;
1792     assert(text);
1793     utext = (unsigned char *)est_uconv_in(text, strlen(text), &size);
1794     if(norm) est_normalize_text(utext, size, &size);
1795     est_canonicalize_text(utext, size, FALSE);
1796     words = cblistopen();
1797     for(i = 0; i < size; i += 2){
1798     cc = est_char_category_perfng(utext[i] * 0x100 + utext[i+1]);
1799     for(j = i + 2; j < size; j += 2){
1800     if(est_char_category_perfng(utext[j] * 0x100 + utext[j+1]) != cc) break;
1801     }
1802     switch(cc){
1803     case ESTEASTALPH:
1804     for(k = i; k < j; k += 2){
1805     if(j - k >= 4){
1806     cblistpush(words, (char *)(utext + k), 4);
1807     } else {
1808     cblistpush(words, (char *)(utext + k), 2);
1809     }
1810     }
1811     break;
1812     default:
1813     break;
1814     }
1815     i = j - 2;
1816     }
1817     for(i = 0; i < CB_LISTNUM(words); i++){
1818     word = (unsigned char *)CB_LISTVAL2(words, i, &wsiz);
1819     if(est_char_category_perfng(word[0] * 0x100 + word[1]) == ESTEASTALPH && wsiz == 2 &&
1820     i < CB_LISTNUM(words) - 1){
1821     next = (unsigned char *)cblistval(words, i + 1, &nsiz);
1822     if(nsiz > 4) nsiz = 4;
1823     if(est_char_category_perfng(next[0] * 0x100 + next[1]) == ESTEASTALPH && nsiz > 2) nsiz = 2;
1824     CB_MALLOC(tmp, wsiz + nsiz + 1);
1825     memcpy(tmp, word, wsiz);
1826     memcpy(tmp + wsiz, next, nsiz);
1827     cblistover(words, i, tmp, wsiz + nsiz);
1828     free(tmp);
1829     }
1830     }
1831     for(i = 0; i < CB_LISTNUM(words); i++){
1832     word = (unsigned char *)CB_LISTVAL2(words, i, &wsiz);
1833     if(!tail && wsiz == 2 && i == CB_LISTNUM(words) - 1){
1834     if(est_char_category_perfng(word[0] * 0x100 + word[1]) == ESTEASTALPH) continue;
1835     }
1836     tmp = est_uconv_out((char *)word, wsiz, &tsiz);
1837     cblistpushbuf(list, tmp, tsiz);
1838     }
1839     cblistclose(words);
1840     free(utext);
1841     }
1842    
1843    
/* Translate the encoding aliases known to this module into names accepted by iconv.
   `code' specifies the name of an encoding.
   The return value is the replacement name for "x-sjis", "x-ujis", "x-euc-jp" and
   "windows-31j" (compared without regard to case), or `code' itself for any other name. */
static const char *est_iconv_canon_name(const char *code){
  if(!cbstricmp(code, "x-sjis")) return "Shift_JIS";
  if(!cbstricmp(code, "x-ujis") || !cbstricmp(code, "x-euc-jp")) return "EUC-JP";
  if(!cbstricmp(code, "windows-31j")) return "CP932";
  return code;
}


/* Convert the character encoding of a string.
   `ptr' specifies the pointer to a region.
   `size' specifies the size of the region.  If it is negative, the size is assigned with
   `strlen(ptr)'.
   `icode' specifies the name of encoding of the input string.
   `ocode' specifies the name of encoding of the output string.
   `sp' specifies the pointer to a variable to which the size of the result is assigned.  If it
   is `NULL', it is not used.
   `mp' specifies the pointer to a variable to which the number of skipped input bytes is
   assigned.  If it is `NULL', it is not used.
   The return value is the pointer to the allocated result, which is terminated with a zero
   byte and should be released with `free', or `NULL' if the conversion descriptor cannot be
   opened or closed. */
char *est_iconv(const char *ptr, int size,
                const char *icode, const char *ocode, int *sp, int *mp){
  iconv_t ic;
  char *obuf, *wp, *rp;
  size_t isiz, osiz;
  int miss;
  assert(ptr && icode && ocode);
  if(size < 0) size = strlen(ptr);
  icode = est_iconv_canon_name(icode);
  ocode = est_iconv_canon_name(ocode);
  if((ic = iconv_open(ocode, icode)) == (iconv_t)-1) return NULL;
  isiz = size;
  osiz = isiz * 5;
  CB_MALLOC(obuf, osiz + 1);
  wp = obuf;
  rp = (char *)ptr;
  miss = 0;
  while(isiz > 0){
    if(iconv(ic, (void *)&rp, &isiz, &wp, &osiz) == (size_t)-1){
      if(errno == EILSEQ && (*rp == 0x5c || *rp == 0x7e)){
        /* pass a backslash or a tilde through verbatim when the converter rejects it;
           the byte is charged to the output budget so that iconv cannot overrun `obuf' */
        if(osiz < 1) break;
        *wp = *rp;
        wp++;
        osiz--;
        rp++;
        isiz--;
      } else if(errno == EILSEQ || errno == EINVAL){
        /* skip one byte of an invalid or incomplete sequence and count it */
        rp++;
        isiz--;
        miss++;
      } else {
        break;
      }
    }
  }
  /* bring a stateful output encoding back to its initial shift state; a failure here only
     means that no room is left, which is handled like any other truncation */
  iconv(ic, NULL, NULL, &wp, &osiz);
  *wp = '\0';
  if(sp) *sp = wp - obuf;
  if(mp) *mp = miss;
  if(iconv_close(ic) == -1){
    free(obuf);
    return NULL;
  }
  return obuf;
}
1911    
1912    
1913     /* Detect the encoding of a string automatically. */
1914     const char *est_enc_name(const char *ptr, int size, int plang){
1915     const char *hypo;
1916     int i, miss, cr;
1917     assert(ptr);
1918     if(size < 0) size = strlen(ptr);
1919     if(size > ESTICCHECKSIZ) size = ESTICCHECKSIZ;
1920     if(size >= 2 && (!memcmp(ptr, "\xfe\xff", 2) || !memcmp(ptr, "\xff\xfe", 2))) return "UTF-16";
1921     for(i = 0; i < size - 1; i += 2){
1922     if(ptr[i] == 0 && ptr[i+1] != 0) return "UTF-16BE";
1923     if(ptr[i+1] == 0 && ptr[i] != 0) return "UTF-16LE";
1924     }
1925     switch(plang){
1926     case ESTLANGEN:
1927     if(est_enc_miss(ptr, size, "US-ASCII", "UTF-16BE") < 1) return "US-ASCII";
1928     if(est_enc_miss(ptr, size, "UTF-8", "UTF-16BE") < 1) return "UTF-8";
1929     return "ISO-8859-1";
1930     case ESTLANGJA:
1931     for(i = 0; i < size - 3; i++){
1932     if(ptr[i] == 0x1b){
1933     i++;
1934     if(ptr[i] == '(' && strchr("BJHI", ptr[i+1])) return "ISO-2022-JP";
1935     if(ptr[i] == '$' && strchr("@B(", ptr[i+1])) return "ISO-2022-JP";
1936     }
1937     }
1938     if(est_enc_miss(ptr, size, "US-ASCII", "UTF-16BE") < 1) return "US-ASCII";
1939     if(est_enc_miss(ptr, size, "UTF-8", "UTF-16BE") < 1) return "UTF-8";
1940     hypo = NULL;
1941     cr = FALSE;
1942     for(i = 0; i < size; i++){
1943     if(ptr[i] == 0xd){
1944     cr = TRUE;
1945     break;
1946     }
1947     }
1948     if(cr){
1949     if((miss = est_enc_miss(ptr, size, "Shift_JIS", "EUC-JP")) < 1) return "Shift_JIS";
1950     if(!hypo && miss / (double)size <= ESTICALLWRAT) hypo = "Shift_JIS";
1951     if((miss = est_enc_miss(ptr, size, "EUC-JP", "UTF-16BE")) < 1) return "EUC-JP";
1952     if(!hypo && miss / (double)size <= ESTICALLWRAT) hypo = "EUC-JP";
1953     } else {
1954     if((miss = est_enc_miss(ptr, size, "EUC-JP", "UTF-16BE")) < 1) return "EUC-JP";
1955     if(!hypo && miss / (double)size <= ESTICALLWRAT) hypo = "EUC-JP";
1956     if((miss = est_enc_miss(ptr, size, "Shift_JIS", "EUC-JP")) < 1) return "Shift_JIS";
1957     if(!hypo && miss / (double)size <= ESTICALLWRAT) hypo = "Shift_JIS";
1958     }
1959     if((miss = est_enc_miss(ptr, size, "UTF-8", "UTF-16BE")) < 1) return "UTF-8";
1960     if(!hypo && miss / (double)size <= ESTICALLWRAT) hypo = "UTF-8";
1961     if((miss = est_enc_miss(ptr, size, "CP932", "UTF-16BE")) < 1) return "CP932";
1962     if(!hypo && miss / (double)size <= ESTICALLWRAT) hypo = "CP932";
1963     return hypo ? hypo : "ISO-8859-1";
1964     case ESTLANGZH:
1965     if(est_enc_miss(ptr, size, "US-ASCII", "UTF-16BE") < 1) return "US-ASCII";
1966     if(est_enc_miss(ptr, size, "UTF-8", "UTF-16BE") < 1) return "UTF-8";
1967     if(est_enc_miss(ptr, size, "EUC-CN", "UTF-16BE") < 1) return "EUC-CN";
1968     if(est_enc_miss(ptr, size, "BIG5", "UTF-16BE") < 1) return "BIG5";
1969     return "ISO-8859-1";
1970     case ESTLANGKO:
1971     if(est_enc_miss(ptr, size, "US-ASCII", "UTF-16BE") < 1) return "US-ASCII";
1972     if(est_enc_miss(ptr, size, "UTF-8", "UTF-16BE") < 1) return "UTF-8";
1973     if(est_enc_miss(ptr, size, "EUC-KR", "UTF-16BE") < 1) return "EUC-KR";
1974     return "ISO-8859-1";
1975     default:
1976     break;
1977     }
1978     return "ISO-8859-1";
1979     }
1980    
1981    
/* Convert a UTF-8 string into UTF-16BE.
   `ptr' specifies the pointer to a region of UTF-8.
   `size' specifies the size of the region.  It must not be negative.
   `sp' specifies the pointer to a variable to which the size of the result is assigned.
   The return value is the pointer to the allocated result, which is terminated with a zero
   byte and should be released with `free'.
   Sequences of four or more bytes cannot be expressed in two bytes and are replaced by `?'.
   Conversion stops at a truncated sequence or at a lead byte of 0xfe or 0xff.  Bytes from 0x80
   to 0xbf found in lead position are not rejected but decoded like a two-byte lead. */
char *est_uconv_in(const char *ptr, int size, int *sp){
  const unsigned char *rp, *ep;
  char *rbuf, *wp;
  assert(ptr && size >= 0 && sp);
  rp = (const unsigned char *)ptr;
  ep = rp + size;
  /* every input byte yields two output bytes at most */
  CB_MALLOC(rbuf, size * 2 + 1);
  wp = rbuf;
  while(rp < ep){
    if(*rp < 0x80){
      /* one byte: U+0000 to U+007F, DEL included */
      *(wp++) = 0x00;
      *(wp++) = *rp;
      rp += 1;
    } else if(*rp < 0xe0){
      /* two bytes: U+0080 to U+07FF, the lead byte 0xdf included */
      if(ep - rp < 2) break;
      *(wp++) = (rp[0] & 0x1f) >> 2;
      *(wp++) = (rp[0] << 6) | (rp[1] & 0x3f);
      rp += 2;
    } else if(*rp < 0xf0){
      /* three bytes: U+0800 to U+FFFF */
      if(ep - rp < 3) break;
      *(wp++) = (rp[0] << 4) | ((rp[1] & 0x3f) >> 2);
      *(wp++) = (rp[1] << 6) | (rp[2] & 0x3f);
      rp += 3;
    } else if(*rp < 0xf8){
      if(ep - rp < 4) break;
      *(wp++) = 0x00;
      *(wp++) = '?';
      rp += 4;
    } else if(*rp < 0xfc){
      if(ep - rp < 5) break;
      *(wp++) = 0x00;
      *(wp++) = '?';
      rp += 5;
    } else if(*rp < 0xfe){
      if(ep - rp < 6) break;
      *(wp++) = 0x00;
      *(wp++) = '?';
      rp += 6;
    } else {
      break;
    }
  }
  *wp = '\0';
  *sp = wp - rbuf;
  return rbuf;
}
2028    
2029    
/* Convert a UTF-16BE string into UTF-8.
   `ptr' specifies the pointer to a region of UTF-16BE.
   `size' specifies the size of the region.  It must not be negative.  A trailing odd byte is
   ignored.
   `sp' specifies the pointer to a variable to which the size of the result is assigned.  If it
   is `NULL', it is not used.
   The return value is the pointer to the allocated result, which is terminated with a zero
   byte and should be released with `free'.
   Every two-byte unit is encoded on its own; surrogate pairs are not combined. */
char *est_uconv_out(const char *ptr, int size, int *sp){
  const unsigned char *rp, *ep;
  char *rbuf, *wp;
  int c;
  assert(ptr && size >= 0);
  if(size % 2 != 0) size--;
  rp = (const unsigned char *)ptr;
  ep = rp + size;
  /* two input bytes yield three output bytes at most */
  CB_MALLOC(rbuf, size * 2 + 1);
  wp = rbuf;
  while(rp < ep){
    c = rp[0] * 0x100 + rp[1];
    if(c < 0x0080){
      *(wp++) = rp[1];
    } else if(c < 0x0800){
      /* the two-byte form holds eleven bits only, so it ends at U+07FF */
      *(wp++) = 0xc0 | (c >> 6);
      *(wp++) = 0x80 | (c & 0x3f);
    } else {
      *(wp++) = 0xe0 | ((c >> 12) & 0x0f);
      *(wp++) = 0x80 | ((c >> 6) & 0x3f);
      *(wp++) = 0x80 | (c & 0x3f);
    }
    rp += 2;
  }
  *wp = '\0';
  if(sp) *sp = wp - rbuf;
  return rbuf;
}
2058    
2059    
2060     /* Compress a serial object with ZLIB. */
2061     char *est_deflate(const char *ptr, int size, int *sp){
2062     z_stream zs;
2063     char *buf;
2064     unsigned char obuf[ESTIOBUFSIZ];
2065     int rv, asiz, bsiz, osiz;
2066     assert(ptr && sp);
2067     if(size < 0) size = strlen(ptr);
2068     zs.zalloc = Z_NULL;
2069     zs.zfree = Z_NULL;
2070     zs.opaque = Z_NULL;
2071     if(deflateInit(&zs, ESTZCOMPLEVEL) != Z_OK) return NULL;
2072     asiz = ESTIOBUFSIZ;
2073     CB_MALLOC(buf, asiz);
2074     bsiz = 0;
2075     zs.next_in = (unsigned char *)ptr;
2076     zs.avail_in = size;
2077     zs.next_out = obuf;
2078     zs.avail_out = ESTIOBUFSIZ;
2079     while((rv = deflate(&zs, Z_FINISH)) == Z_OK){
2080     osiz = ESTIOBUFSIZ - zs.avail_out;
2081     if(bsiz + osiz > asiz){
2082     asiz = asiz * 2 + osiz;
2083     CB_REALLOC(buf, asiz);
2084     }
2085     memcpy(buf + bsiz, obuf, osiz);
2086     bsiz += osiz;
2087     zs.next_out = obuf;
2088     zs.avail_out = ESTIOBUFSIZ;
2089     }
2090     if(rv != Z_STREAM_END){
2091     free(buf);
2092     deflateEnd(&zs);
2093     return NULL;
2094     }
2095     osiz = ESTIOBUFSIZ - zs.avail_out;
2096     if(bsiz + osiz > asiz){
2097     asiz = asiz * 2 + osiz;
2098     CB_REALLOC(buf, asiz);
2099     }
2100     memcpy(buf + bsiz, obuf, osiz);
2101     bsiz += osiz;
2102     *sp = bsiz;
2103     deflateEnd(&zs);
2104     return buf;
2105     }
2106    
2107    
2108     /* Decompress a serial object compressed with ZLIB. */
2109     char *est_inflate(const char *ptr, int size, int *sp){
2110     z_stream zs;
2111     char *buf;
2112     unsigned char obuf[ESTIOBUFSIZ];
2113     int rv, asiz, bsiz, osiz;
2114     assert(ptr && size >= 0 && sp);
2115     zs.zalloc = Z_NULL;
2116     zs.zfree = Z_NULL;
2117     zs.opaque = Z_NULL;
2118     if(inflateInit(&zs) != Z_OK) return NULL;
2119     asiz = ESTIOBUFSIZ;
2120     CB_MALLOC(buf, asiz);
2121     bsiz = 0;
2122     zs.next_in = (unsigned char *)ptr;
2123     zs.avail_in = size;
2124     zs.next_out = obuf;
2125     zs.avail_out = ESTIOBUFSIZ;
2126     while((rv = inflate(&zs, Z_NO_FLUSH)) == Z_OK){
2127     osiz = ESTIOBUFSIZ - zs.avail_out;
2128     if(bsiz + osiz >= asiz){
2129     asiz = asiz * 2 + osiz;
2130     CB_REALLOC(buf, asiz);
2131     }
2132     memcpy(buf + bsiz, obuf, osiz);
2133     bsiz += osiz;
2134     zs.next_out = obuf;
2135     zs.avail_out = ESTIOBUFSIZ;
2136     }
2137     if(rv != Z_STREAM_END){
2138     free(buf);
2139     inflateEnd(&zs);
2140     return NULL;
2141     }
2142     osiz = ESTIOBUFSIZ - zs.avail_out;
2143     if(bsiz + osiz >= asiz){
2144     asiz = asiz * 2 + osiz;
2145     CB_REALLOC(buf, asiz);
2146     }
2147     memcpy(buf + bsiz, obuf, osiz);
2148     bsiz += osiz;
2149     buf[bsiz] = '\0';
2150     if(sp) *sp = bsiz;
2151     inflateEnd(&zs);
2152     return buf;
2153     }
2154    
2155    
2156     /* Get the border string for draft data of documents. */
2157     const char *est_border_str(void){
2158     static int first = TRUE;
2159     static char border[ESTPATHBUFSIZ];
2160     int t, p;
2161     if(first){
2162     t = (int)(time(NULL) + est_random() * INT_MAX);
2163     p = (int)(getpid() + est_random() * INT_MAX);
2164     sprintf(border, "--------[%08X%08X]--------",
2165     dpouterhash((char *)&t, sizeof(int)), dpouterhash((char *)&p, sizeof(int)));
2166     first = FALSE;
2167     }
2168     return border;
2169     }
2170    
2171    
2172     /* Get the real random number. */
2173     double est_random(void){
2174     static int first = TRUE;
2175     int num;
2176     if(first && !est_random_ifp){
2177     if((est_random_ifp = fopen("/dev/urandom", "rb")) != NULL){
2178     atexit(est_random_fclose);
2179     } else {
2180     srand(getpid());
2181     }
2182     first = FALSE;
2183     }
2184     if(est_random_ifp){
2185     fread(&num, sizeof(int), 1, est_random_ifp);
2186     return (num & 0x7fffffff) / (double)0x7fffffff;
2187     }
2188     return rand() / (double)RAND_MAX;
2189     }
2190    
2191    
/* Get the random number in normal distribution.
   The return value is a number clamped to the range from 0.0 to 1.0.  It is built from two
   calls of `est_random': the product of a square-rooted logarithm and a cosine is shifted by
   6.0 and divided by 12.0. */
double est_random_nd(void){
  double radius, angle, d;
  radius = sqrt(-2 * log(1.0 - est_random()));
  angle = 3.1415926535 * 2 * est_random();
  d = (radius * cos(angle) + 6.0) / 12.0;
  if(d > 1.0){
    d = 1.0;
  } else if(d < 0.0){
    d = 0.0;
  }
  return d;
}
2200    
2201    
2202     /* Get an MD5 hash string of a key string. */
2203     char *est_make_crypt(const char *key){
2204     md5_state_t ms;
2205     char digest[32], str[64], *wp;
2206     int i;
2207     assert(key);
2208     md5_init(&ms);
2209     md5_append(&ms, (md5_byte_t *)key, strlen(key));
2210     md5_finish(&ms, (md5_byte_t *)digest);
2211     wp = str;
2212     for(i = 0; i < 16; i++){
2213     wp += sprintf(wp, "%02x", ((unsigned char *)digest)[i]);
2214     }
2215     return cbmemdup(str, -1);
2216     }
2217    
2218    
/* Check whether a key matches an MD5 hash string.
   `key' specifies a string to be checked.
   `hash' specifies a hash string as produced by `est_make_crypt'.
   The return value is true if the hash of the key equals `hash', else it is false. */
int est_match_crypt(const char *key, const char *hash){
  char *mine;
  int same;
  assert(key && hash);
  mine = est_make_crypt(key);
  same = strcmp(mine, hash) == 0;
  free(mine);
  return same;
}
2229    
2230    
2231     /* Get the hidden texts of a document object. */
2232     const char *est_doc_hidden_texts(ESTDOC *doc){
2233     const char *rv;
2234     assert(doc);
2235     rv = doc->attrs ? cbmapget(doc->attrs, "", 0, NULL) : NULL;
2236     return rv ? rv : "";
2237     }
2238    
2239    
2240     /* Get the phrase of a condition object. */
2241     const char *est_cond_phrase(ESTCOND *cond){
2242     assert(cond);
2243     return cond->phrase;
2244     }
2245    
2246    
2247     /* Get a list object of attribute expressions of a condition object. */
2248     const CBLIST *est_cond_attrs(ESTCOND *cond){
2249     assert(cond);
2250     return cond->attrs;
2251     }
2252    
2253    
2254     /* Get the order expression of a condition object. */
2255     const char *est_cond_order(ESTCOND *cond){
2256     assert(cond);
2257     return cond->order;
2258     }
2259    
2260    
2261     /* Get the maximum number of retrieval of a condition object. */
2262     int est_cond_max(ESTCOND *cond){
2263     assert(cond);
2264     return cond->max;
2265     }
2266    
2267    
2268     /* Get the options of a condition object. */
2269     int est_cond_options(ESTCOND *cond){
2270     assert(cond);
2271     return cond->opts;
2272     }
2273    
2274    
2275     /* Get the score of a document corresponding to a condition object. */
2276     int est_cond_score(ESTCOND *cond, int index){
2277     assert(cond);
2278     if(!cond->scores || index < 0 || index >= cond->snum) return -1;
2279     return cond->scores[index];
2280     }
2281    
2282    
2283     /* Set the error code of a database. */
2284     void est_db_set_ecode(ESTDB *db, int ecode){
2285     assert(db);
2286     db->ecode = ecode;
2287     }
2288    
2289    
2290     /* Edit attributes of a document object in a database. */
2291     int est_db_edit_doc(ESTDB *db, ESTDOC *doc){
2292     const char *uri;
2293     char *sbuf;
2294     int err, id, ssiz;
2295     assert(db && doc);
2296     if(!dpwritable(db->metadb)){
2297     db->ecode = ESTEACCES;
2298     return FALSE;
2299     }
2300     if(!doc->attrs || !(uri = cbmapget(doc->attrs, ESTDATTRURI, -1, NULL)) || doc->id < 1){
2301     db->ecode = ESTEINVAL;
2302     return FALSE;
2303     }
2304     if((id = est_db_uri_to_id(db, uri)) > 0 && id != doc->id){
2305     db->ecode = ESTEINVAL;
2306     return FALSE;
2307     }
2308     err = FALSE;
2309     sbuf = cbmapdump(doc->attrs, &ssiz);
2310     if(!crput(db->attrdb, (char *)&(doc->id), sizeof(int), sbuf, ssiz, CR_DOVER)){
2311     db->ecode = ESTEDB;
2312     db->fatal = TRUE;
2313     err = TRUE;
2314     }
2315     free(sbuf);
2316     if(db->spacc) cbmapout(db->spacc, (char *)&(doc->id), sizeof(int));
2317     return err ? FALSE : TRUE;
2318     }
2319    
2320    
2321     /* Add a piece of meta data to a database. */
2322     void est_db_add_meta(ESTDB *db, const char *name, const char *value){
2323     assert(db && name);
2324     if(!dpwritable(db->metadb)){
2325     db->ecode = ESTEACCES;
2326     return;
2327     }
2328     if(!db->metacc) est_db_prepare_meta(db);
2329     if(value){
2330     cbmapput(db->metacc, name, -1, value, -1, TRUE);
2331     } else {
2332     cbmapout(db->metacc, name, -1);
2333     }
2334     }
2335    
2336    
2337     /* Get a list of names of meta data of a database. */
2338     CBLIST *est_db_meta_names(ESTDB *db){
2339     assert(db);
2340     if(!db->metacc) est_db_prepare_meta(db);
2341     return cbmapkeys(db->metacc);
2342     }
2343    
2344    
2345     /* Get the value of a piece of meta data of a database. */
2346     char *est_db_meta(ESTDB *db, const char *name){
2347     const char *vbuf;
2348     int vsiz;
2349     assert(db && name);
2350     if(!db->metacc) est_db_prepare_meta(db);
2351     if(!(vbuf = cbmapget(db->metacc, name, -1, &vsiz))) return NULL;
2352     return cbmemdup(vbuf, vsiz);
2353     }
2354    
2355    
2356     /* Get the number of records in the cache memory of a database. */
2357     int est_db_cache_num(ESTDB *db){
2358     assert(db);
2359     return cbmaprnum(db->idxcc);
2360     }
2361    
2362    
2363     /* Set the callback function for database events. */
2364     void est_db_set_informer(ESTDB *db, void (*func)(const char *)){
2365     assert(db && func);
2366     db->cbinfo = func;
2367     est_db_inform(db, "status");
2368     }
2369    
2370    
2371     /* Set the callback function to create a vector of keywords of a document. */
2372     void est_db_set_vectorizer(ESTDB *db, CBMAP *(*func)(void *, int, void *), void *data){
2373     assert(db && func);
2374     db->cbvec = func;
2375     db->vecdata = data;
2376     }
2377    
2378    
2379     /* Fill the cache for keys for TF-IDF. */
2380     void est_db_fill_key_cache(ESTDB *db){
2381     char *kbuf, *msg;
2382     int i, ksiz, vsiz;
2383     assert(db);
2384     vlcurfirst(db->fwmdb);
2385     for(i = 0; (kbuf = vlcurkey(db->fwmdb, &ksiz)) != NULL; i++){
2386     vsiz = est_idx_vsiz(db->idxdb, kbuf, ksiz);
2387     cbmapput(db->keycc, kbuf, ksiz, (char *)&vsiz, sizeof(int), TRUE);
2388     free(kbuf);
2389     vlcurnext(db->fwmdb);
2390     if(i % ESTCCCBFREQ == 0){
2391     msg = cbsprintf("filling the key cache for TF-IDF (%d)", i + 1);
2392     est_db_inform(db, msg);
2393     free(msg);
2394     }
2395     }
2396     db->kcmnum = -1;
2397     }
2398    
2399    
2400     /* Make a directory. */
2401     int est_mkdir(const char *path){
2402     #if defined(_SYS_MSVC_) || defined(_SYS_MINGW_)
2403     return mkdir(path) == 0 ? TRUE : FALSE;
2404     #else
2405     assert(path);
2406     return mkdir(path, ESTDIRMODE) == 0 ? TRUE : FALSE;
2407     #endif
2408     }
2409    
2410    
2411     /* Remove a directory and its contents recursively. */
2412     int est_rmdir_rec(const char *path){
2413     CBLIST *files;
2414     const char *file;
2415     char pbuf[ESTPATHBUFSIZ];
2416     int i;
2417     assert(path);
2418     if((files = cbdirlist(path)) != NULL){
2419     for(i = 0; i < cblistnum(files); i++){
2420     file = cblistval(files, i, NULL);
2421     if(!strcmp(file, ESTCDIRSTR) || !strcmp(file, ESTPDIRSTR)) continue;
2422     sprintf(pbuf, "%s%c%s", path, ESTPATHCHR, file);
2423     if(unlink(pbuf) == -1) est_rmdir_rec(pbuf);
2424     }
2425     cblistclose(files);
2426     }
2427     return rmdir(path) == 0;
2428     }
2429    
2430    
2431     /* Get the canonicalized absolute pathname of a file. */
2432     char *est_realpath(const char *path){
2433     #if defined(_SYS_MSVC_) || defined(_SYS_MINGW_)
2434     char pbuf[ESTPATHBUFSIZ], *p;
2435     if(GetFullPathName(path, ESTPATHBUFSIZ, pbuf, &p) == 0) sprintf(pbuf, "%s", path);
2436     return cbmemdup(pbuf, -1);
2437     #else
2438     char pbuf[ESTPATHBUFSIZ*2];
2439     assert(path);
2440     if(!realpath(path, pbuf)) sprintf(pbuf, "%s", path);
2441     return cbmemdup(pbuf, -1);
2442     #endif
2443     }
2444    
2445    
/* Get the time of day in milliseconds.
   The return value is the time of day in milliseconds.  Except on MSVC and MinGW, it is the
   result of `gettimeofday', and 0.0 is returned when that call fails. */
double est_gettimeofday(void){
#if defined(_SYS_MSVC_) || defined(_SYS_MINGW_)
  SYSTEMTIME st;
  struct tm ts;
  GetLocalTime(&st);
  memset(&ts, 0, sizeof(struct tm));
  ts.tm_year = st.wYear - 1900;
  ts.tm_mon = st.wMonth - 1;
  ts.tm_mday = st.wDay;
  ts.tm_hour = st.wHour;
  ts.tm_min = st.wMinute;
  ts.tm_sec = st.wSecond;
  /* the fields hold local time, so `mktime' has to find out by itself whether daylight
     saving time is in effect */
  ts.tm_isdst = -1;
  return (double)mktime(&ts) * 1000 + (double)st.wMilliseconds;
#else
  struct timeval tv;
  /* the time zone argument is obsolete; only `NULL' has a specified behavior */
  if(gettimeofday(&tv, NULL) == -1) return 0.0;
  return (double)tv.tv_sec * 1000 + (double)tv.tv_usec / 1000;
#endif
}
2467    
2468    
/* Suspend execution for microsecond intervals.
   `usec' specifies the interval in microseconds.
   On MSVC and MinGW the interval is divided by 1000 and passed to `Sleep', so that the part
   below one millisecond is lost. */
void est_usleep(unsigned long usec){
#if defined(_SYS_MSVC_) || defined(_SYS_MINGW_)
  unsigned long msec;
  msec = usec / 1000;
  Sleep(msec);
#else
  unsigned long span;
  span = usec;
  usleep(span);
#endif
}
2477    
2478    
/* Send a signal to a process.
   `pid' specifies the ID of a target process.
   `sig' specifies a signal code.
   The return value is true if `kill' succeeds, else it is false.  On MSVC and MinGW nothing
   is sent and the return value is always false. */
int est_kill(int pid, int sig){
#if defined(_SYS_MSVC_) || defined(_SYS_MINGW_)
  return FALSE;
#else
  int rv;
  rv = kill(pid, sig);
  return rv == 0;
#endif
}
2487    
2488    
/* Get the media type of an extension.
   `ext' specifies the extension of a file path, with the leading period.  It is compared
   without regard to case.
   The return value is the media type of the extension, a string constant that must not be
   released.  "application/octet-stream" is returned for an unknown extension. */
const char *est_ext_type(const char *ext){
  /* pairs of an extension and its media type; the first pair of an extension wins, so the
     later entries of ".xhtml" and ".xht" never take effect */
  static const char *list[] = {
    ".txt", "text/plain", ".txt.en", "text/plain",
    ".txt.ja", "text/plain", ".asc", "text/plain",
    ".in", "text/plain", ".c", "text/plain",
    ".h", "text/plain", ".cc", "text/plain",
    ".java", "text/plain", ".sh", "text/plain",
    ".pl", "text/plain", ".py", "text/plain",
    ".rb", "text/plain", ".idl", "text/plain",
    ".csv", "text/plain", ".log", "text/plain",
    ".conf", "text/plain", ".rc", "text/plain",
    ".ini", "text/plain", ".html", "text/html",
    ".htm", "text/html", ".xhtml", "text/html",
    ".xht", "text/html", ".css", "text/css",
    ".js", "text/javascript", ".tsv", "text/tab-separated-values",
    ".eml", "message/rfc822", ".mime", "message/rfc822",
    ".mht", "message/rfc822", ".mhtml", "message/rfc822",
    ".sgml", "application/sgml", ".sgm", "application/sgml",
    ".xml", "application/xml", ".xsl", "application/xml",
    ".xslt", "application/xslt+xml", ".xhtml", "application/xhtml+xml",
    ".xht", "application/xhtml+xml", ".rdf", "application/rdf+xml",
    ".rss", "application/rss+xml", ".dtd", "application/xml-dtd",
    ".rtf", "application/rtf", ".pdf", "application/pdf",
    ".ps", "application/postscript", ".eps", "application/postscript",
    ".doc", "application/msword", ".xls", "application/vnd.ms-excel",
    ".ppt", "application/vnd.ms-powerpoint", ".xdw", "application/vnd.fujixerox.docuworks",
    ".swf", "application/x-shockwave-flash", ".zip", "application/zip",
    ".tar", "application/x-tar", ".gz", "application/x-gzip",
    ".bz2", "application/octet-stream", ".z", "application/octet-stream",
    ".lha", "application/octet-stream", ".lzh", "application/octet-stream",
    ".cab", "application/octet-stream", ".rar", "application/octet-stream",
    ".sit", "application/octet-stream", ".bin", "application/octet-stream",
    ".o", "application/octet-stream", ".so", "application/octet-stream",
    ".exe", "application/octet-stream", ".dll", "application/octet-stream",
    ".class", "application/octet-stream", ".png", "image/png",
    ".gif", "image/gif", ".jpg", "image/jpeg",
    ".jpeg", "image/jpeg", ".tif", "image/tiff",
    ".tiff", "image/tiff", ".bmp", "image/bmp",
    ".au", "audio/basic", ".snd", "audio/basic",
    ".mid", "audio/midi", ".midi", "audio/midi",
    ".mp2", "audio/mpeg", ".mp3", "audio/mpeg",
    ".wav", "audio/x-wav", ".mpg", "video/mpeg",
    ".mpeg", "video/mpeg", ".qt", "video/quicktime",
    ".mov", "video/quicktime", ".avi", "video/x-msvideo",
    NULL
  };
  int i;
  assert(ext);
  /* step over whole pairs, so that a media type is never taken for an extension */
  for(i = 0; list[i]; i += 2){
    if(!cbstricmp(ext, list[i])) return list[i+1];
  }
  return "application/octet-stream";
}
2543    
2544    
2545    
2546     /*************************************************************************************************
2547     * private objects
2548     *************************************************************************************************/
2549    
2550    
2551     /* Count the number of missing characters when converting.
2552     `ptr' specifies the pointer to a region.
2553     `size' specifies the size of the region.
2554     `icode' specifies the name of encoding of the input string.
2555     `ocode' specifies the name of encoding of the output string.
2556     The return value is the number of missing characters. */
2557     static int est_enc_miss(const char *ptr, int size, const char *icode, const char *ocode){
2558     iconv_t ic;
2559     char obuf[ESTICCHECKSIZ], *wp, *rp;
2560     size_t isiz, osiz;
2561     int miss;
2562     assert(ptr && size >= 0 && icode && ocode);
2563     isiz = size;
2564     if((ic = iconv_open(ocode, icode)) == (iconv_t)-1) return ESTICMISSMAX;
2565     miss = 0;
2566     rp = (char *)ptr;
2567     while(isiz > 0){
2568     osiz = ESTICCHECKSIZ;
2569     wp = obuf;
2570     if(iconv(ic, (void *)&rp, &isiz, &wp, &osiz) == -1){
2571     if(errno == EILSEQ || errno == EINVAL){
2572     rp++;
2573     isiz--;
2574     miss++;
2575     if(miss >= ESTICMISSMAX) break;
2576     } else {
2577     break;
2578     }
2579     }
2580     }
2581     if(iconv_close(ic) == -1) return ESTICMISSMAX;
2582     return miss;
2583     }
2584    
2585    
/* Normalize a text.
   `utext' specifies a text whose encoding is UTF-16BE.  It is rewritten in place.
   `size' specifies the size of the text in bytes.  A trailing odd byte is ignored.
   `sp' specifies the pointer to a variable to which the size of the result is assigned.
   Control characters (U+0000-0008, U+000E-001F) and the no-break, en, em, thin and fullwidth
   spaces become U+0020.  Fullwidth alphabets and digits become their ASCII counterparts.
   Halfwidth katakana and punctuation become the fullwidth forms, and a following halfwidth
   voiced (U+FF9E) or semi-voiced (U+FF9F) sound mark is merged into the preceding kana where
   the code below knows a precomposed form.  The result is never longer than the input. */
static void est_normalize_text(unsigned char *utext, int size, int *sp){
  int i, wi, hi, lo, ohi, olo, mark;
  assert(utext && size >= 0 && sp);
  wi = 0;
  for(i = 0; i < size - 1; i += 2){
    /* fetch the source unit before storing anything: while nothing has been merged yet,
       wi == i and the output slot is the very same memory as the input slot */
    hi = utext[i];
    lo = utext[i+1];
    ohi = hi;
    olo = lo;
    /* low byte of the next unit if it lies in the U+FFxx row, else 0 */
    mark = (i + 3 < size && utext[i+2] == 0xff) ? utext[i+3] : 0;
    if(hi == 0x00){
      /* control characters and no-break space */
      if(lo <= 0x08 || (lo >= 0x0e && lo <= 0x1f) || lo == 0xa0) olo = 0x20;
    } else if(hi == 0x20){
      /* en space, em space, thin space */
      if(lo == 0x02 || lo == 0x03 || lo == 0x09){
        ohi = 0x00;
        olo = 0x20;
      }
    } else if(hi == 0x30){
      /* fullwidth space */
      if(lo == 0x00){
        ohi = 0x00;
        olo = 0x20;
      }
    } else if(hi == 0xff){
      if(lo >= 0x21 && lo <= 0x3a){
        /* fullwidth alphabets */
        ohi = 0x00;
        olo = lo - 0x21 + 0x41;
      } else if(lo >= 0x41 && lo <= 0x5a){
        /* fullwidth small alphabets */
        ohi = 0x00;
        olo = lo - 0x41 + 0x61;
      } else if(lo >= 0x10 && lo <= 0x19){
        /* fullwidth numbers */
        ohi = 0x00;
        olo = lo - 0x10 + 0x30;
      } else if(lo >= 0x61 && lo <= 0x9d){
        /* halfwidth katakana block: everything maps into the U+30xx row */
        ohi = 0x30;
        if(lo == 0x61){
          /* halfwidth full stop */
          olo = 0x02;
        } else if(lo == 0x62){
          /* halfwidth left corner */
          olo = 0x0c;
        } else if(lo == 0x63){
          /* halfwidth right corner */
          olo = 0x0d;
        } else if(lo == 0x64){
          /* halfwidth comma */
          olo = 0x01;
        } else if(lo == 0x65){
          /* halfwidth middle dot */
          olo = 0xfb;
        } else if(lo == 0x66){
          /* halfwidth wo */
          olo = 0xf2;
        } else if(lo <= 0x6b){
          /* halfwidth small a-o */
          olo = (lo - 0x67) * 2 + 0xa1;
        } else if(lo <= 0x6e){
          /* halfwidth small ya-yo */
          olo = (lo - 0x6c) * 2 + 0xe3;
        } else if(lo == 0x6f){
          /* halfwidth small tu */
          olo = 0xc3;
        } else if(lo == 0x70){
          /* halfwidth prolonged mark */
          olo = 0xfc;
        } else if(lo <= 0x75){
          /* halfwidth a-o; u followed by the voiced mark becomes vu */
          olo = (lo - 0x71) * 2 + 0xa2;
          if(lo == 0x73 && mark == 0x9e){
            olo = 0xf4;
            i += 2;
          }
        } else if(lo <= 0x7a){
          /* halfwidth ka-ko */
          olo = (lo - 0x76) * 2 + 0xab;
          if(mark == 0x9e){
            olo += 1;
            i += 2;
          }
        } else if(lo <= 0x7f){
          /* halfwidth sa-so */
          olo = (lo - 0x7b) * 2 + 0xb5;
          if(mark == 0x9e){
            olo += 1;
            i += 2;
          }
        } else if(lo <= 0x84){
          /* halfwidth ta-to; the extra 1 skips the small tu slot */
          olo = (lo - 0x80) * 2 + 0xbf + (lo >= 0x82 ? 1 : 0);
          if(mark == 0x9e){
            olo += 1;
            i += 2;
          }
        } else if(lo <= 0x89){
          /* halfwidth na-no */
          olo = lo - 0x85 + 0xca;
        } else if(lo <= 0x8e){
          /* halfwidth ha-ho, with voiced or semi-voiced mark */
          olo = (lo - 0x8a) * 3 + 0xcf;
          if(mark == 0x9e){
            olo += 1;
            i += 2;
          } else if(mark == 0x9f){
            olo += 2;
            i += 2;
          }
        } else if(lo <= 0x93){
          /* halfwidth ma-mo */
          olo = lo - 0x8f + 0xde;
        } else if(lo <= 0x96){
          /* halfwidth ya-yo */
          olo = (lo - 0x94) * 2 + 0xe4;
        } else if(lo <= 0x9b){
          /* halfwidth ra-ro */
          olo = lo - 0x97 + 0xe9;
        } else if(lo == 0x9c){
          /* halfwidth wa */
          olo = 0xef;
        } else {
          /* halfwidth n (0x9d) */
          olo = 0xf3;
        }
      }
    }
    utext[wi] = (unsigned char)ohi;
    utext[wi+1] = (unsigned char)olo;
    wi += 2;
  }
  *sp = wi;
}
2753    
2754    
/* Canonicalize a text for search keys.
   `utext' specifies a text whose encoding is UTF-16BE.  It is rewritten in place and keeps
   its size.
   `size' specifies the size of the text in bytes.  A trailing odd byte is left untouched.
   `funcspc' specifies whether to allow functional space characters.  If it is false, every
   character below U+0020 is replaced by a space.
   Upper case letters of ASCII, Latin-1 Supplement, Latin Extended-A, Greek and Cyrillic are
   folded to lower case, and the characters U+FFF0-FFFF are replaced by a space. */
static void est_canonicalize_text(unsigned char *utext, int size, int funcspc){
  unsigned char *hp, *lp;
  int i;
  assert(utext && size >= 0);
  /* stop at size - 1 so that the low byte of every visited unit is inside the buffer */
  for(i = 0; i < size - 1; i += 2){
    hp = utext + i;
    lp = hp + 1;
    switch(*hp){
    case 0x00:
      if(*lp >= 'A' && *lp <= 'Z'){
        /* ascii */
        *lp += 'a' - 'A';
      } else if((*lp >= 0xc0 && *lp <= 0xd6) || (*lp >= 0xd8 && *lp <= 0xde)){
        /* latin-1 supplement */
        *lp += 0x20;
      } else if(!funcspc && *lp < ' '){
        /* functional spaces */
        *lp = ' ';
      }
      break;
    case 0x01:
      if((*lp <= 0x36 && *lp % 2 == 0) ||
         (*lp >= 0x39 && *lp <= 0x47 && *lp % 2 == 1) ||
         (*lp >= 0x4a && *lp <= 0x76 && *lp % 2 == 0) ||
         (*lp >= 0x79 && *lp <= 0x7d && *lp % 2 == 1)){
        /* latin extended-a: the small letter directly follows the capital one */
        *lp += 0x1;
      } else if(*lp == 0x78){
        /* y with umlaut: its small form lives in latin-1 */
        *hp = 0x00;
        *lp = 0xff;
      }
      break;
    case 0x03:
      if(*lp >= 0x91 && *lp <= 0xa9){
        /* greek */
        *lp += 0x20;
      }
      break;
    case 0x04:
      if(*lp >= 0x10 && *lp <= 0x2f){
        /* cyrillic */
        *lp += 0x20;
      } else if(*lp <= 0x0f){
        /* cyrillic with mark */
        *lp += 0x50;
      }
      break;
    case 0xff:
      /* the test must look at the low byte: testing the high byte again was always true
         and wiped out the whole U+FFxx row */
      if(*lp >= 0xf0){
        /* special */
        *hp = 0x00;
        *lp = ' ';
      }
      break;
    default:
      break;
    }
  }
}
2808    
2809    
2810     /* Categorize a character.
2811     `c' specifies the UCS number of a character.
2812     The return value is the category of the character. */
2813     static int est_char_category(int c){
2814     /* ascii space */
2815     if(c <= 0x0020) return ESTSPACECHR;
2816     /* ascii alnum */
2817     if((c >= 0x0030 && c <= 0x0039) || (c >= 0x0041 && c <= 0x005a) ||
2818     (c >= 0x0061 && c <= 0x007a)) return ESTWESTALPH;
2819     /* latin */
2820     if((c >= 0x00c0 && c <= 0x00ff && c != 0x00d7 && c != 0x00f7) || (c >= 0x0100 && c <= 0x017f))
2821     return ESTWESTALPH;
2822     /* arabic and syrian */
2823     if(c >= 0x0600 && c <= 0x08ff) return ESTEASTALPH;
2824     /* south and south east asia */
2825     if((c >= 0x0900 && c <= 0x109f) || (c >= 0x1700 && c <= 0x1cff)) return ESTEASTALPH;
2826     /* cjk */
2827     if((c >= 0x1100 && c <= 0x11ff) || (c >= 0x2e80 && c <= 0xd7af) ||
2828     (c >= 0xf900 && c <= 0xfaff) || (c >= 0xff00 && c <= 0xffef)) return ESTEASTALPH;
2829     /* asian presentation forms */
2830     if((c >= 0xfb50 && c <= 0xfdff) || (c >= 0xfe30 && c <= 0xfe4f) ||
2831     (c >= 0xfe70 && c <= 0xfeff)) return ESTEASTALPH;
2832     /* others */
2833     return ESTDELIMCHR;
2834     }
2835    
2836    
2837     /* Categorize a character for perfect N-gram analyzer.
2838     `c' specifies the UCS number of a character.
2839     The return value is the category of the character. */
2840     static int est_char_category_perfng(int c){
2841     if(c <= 0x0020) return ESTSPACECHR;
2842     return ESTEASTALPH;
2843     }
2844    
2845    
2846     /* Convert a simplified phrase into complete form.
2847     `sphrase' specifies a simplified phrase.
2848     The return value is the complete form of the phrase. */
2849     static char *est_phrase_from_thumb(const char *sphrase){
2850     CBDATUM *datum;
2851     const char *oper, *rp;
2852     unsigned char *utext;
2853     char *rtext;
2854     int size, quote;
2855     assert(sphrase);
2856     datum = cbdatumopen("", 0);
2857     utext = (unsigned char *)est_uconv_in(sphrase, strlen(sphrase), &size);
2858     est_normalize_text(utext, size, &size);
2859     est_canonicalize_text(utext, size, FALSE);
2860     rtext = est_uconv_out((char *)utext, size, NULL);
2861     cbstrsqzspc(rtext);
2862     quote = FALSE;
2863     oper = NULL;
2864     for(rp = rtext; *rp != '\0'; rp++){
2865     if(*rp == '"'){
2866     if(oper){
2867     cbdatumcat(datum, oper, -1);
2868     oper = NULL;
2869     }
2870     quote = !quote;
2871     continue;
2872     }
2873     if(quote){
2874     cbdatumcat(datum, rp, 1);
2875     continue;
2876     }
2877     switch(*rp){
2878     case ' ':
2879     if(!oper) oper = " AND ";
2880     break;
2881     case '&':
2882     oper = " AND ";
2883     break;
2884     case '|':
2885     oper = " OR ";
2886     break;
2887     case '!':
2888     oper = " ANDNOT ";
2889     break;
2890     default:
2891     if(oper){
2892     cbdatumcat(datum, oper, -1);
2893     oper = NULL;
2894     }
2895     cbdatumcat(datum, rp, 1);
2896     }
2897     }
2898     free(rtext);
2899     free(utext);
2900     return cbdatumtomalloc(datum, NULL);
2901     }
2902    
2903    
2904     /* Add a string to a snippet.
2905     `rtext' specifies a raw text.
2906     `ctext' specifies a canonicalized text.
2907     `size' specifies the size of the raw text and the canonicalized text.
2908     `awsiz' specifies the size of allowance for matching words.
2909     `res' specifies a datum object for the result.
2910     `rwords' specifies a list object of raw words. */
2911     static void est_snippet_add_text(const unsigned char *rtext, const unsigned char *ctext,
2912     int size, int awsiz, CBDATUM *res, const CBLIST *rwords){
2913     const unsigned char *rword;
2914     char *orig;
2915     int i, j, bi, rwsiz, step, osiz;
2916     bi = 0;
2917     for(i = 0; i < size; i += 2){
2918     for(j = 0; j < CB_LISTNUM(rwords); j++){
2919     rword = (unsigned char *)CB_LISTVAL2(rwords, j, &rwsiz);
2920     if((step = est_str_fwmatch_wide(ctext + i, size + awsiz - i, rword, rwsiz)) > 0){
2921     if(i - bi > 0){
2922     orig = est_uconv_out((char *)rtext + bi, i - bi, &osiz);
2923     cbdatumcat(res, orig, osiz);
2924     cbdatumcat(res, "\n", 1);
2925     free(orig);
2926     }
2927     orig = est_uconv_out((char *)rtext + i, step, &osiz);
2928     cbdatumcat(res, orig, osiz);
2929     free(orig);
2930     cbdatumcat(res, "\t", 1);
2931     orig = est_uconv_out((char *)rword, rwsiz, &osiz);
2932     cbdatumcat(res, orig, osiz);
2933     free(orig);
2934     cbdatumcat(res, "\n", 1);
2935     bi = i + step;
2936     i = bi - 2;
2937     break;
2938     }
2939     }
2940     }
2941     if(i - bi > 0){
2942     orig = est_uconv_out((char *)rtext + bi, i - bi, &osiz);
2943     cbdatumcat(res, orig, osiz);
2944     cbdatumcat(res, "\n", 1);
2945     free(orig);
2946     }
2947     }
2948    
2949    
/* Check whether a string begins with a key.
   `str' specifies a target string whose encoding is UTF-16BE.
   `size' specifies the size of the target string in bytes.
   `key' specifies a key string whose encoding is UTF-16BE.
   `ksiz' specifies the size of the key string in bytes.
   The return value is the size in bytes of the leading region of the target string that
   corresponds to the key, or 0 if the target string does not begin with the key.
   Characters up to U+0020 are skipped in both strings, except that a target string starting
   with such a character never matches.  Only whole two-byte units are examined, so a
   trailing odd byte of either string is ignored. */
static int est_str_fwmatch_wide(const unsigned char *str, int size,
                                const unsigned char *key, int ksiz){
  int si, ki;
  assert(str && size >= 0 && key && ksiz >= 0);
  if(size < 2 || ksiz < 2 || (str[0] == 0x0 && str[1] <= 0x20)) return 0;
  si = 0;
  ki = 0;
  /* both bounds require a complete unit so that the `+ 1' accesses stay inside the buffers */
  while(ki + 1 < ksiz){
    if(si + 1 >= size) return 0;
    if(str[si] == 0x0 && str[si+1] <= 0x20){
      si += 2;
      continue;
    }
    if(key[ki] == 0x0 && key[ki+1] <= 0x20){
      ki += 2;
      continue;
    }
    if(str[si] != key[ki] || str[si+1] != key[ki+1]) return 0;
    si += 2;
    ki += 2;
  }
  return si;
}
2981    
2982    
2983     /* Open the inverted index.
2984     `name' specifies the name of a directory.
2985     `omode' specifies an open mode of Villa.
2986     `dnum' specifies the number of database files.
2987     The return value is a database object of the database. */
2988     static ESTIDX *est_idx_open(const char *name, int omode, int dnum){
2989     ESTIDX *idx;
2990     CBLIST *files;
2991     char path[ESTPATHBUFSIZ];
2992     int i;
2993     assert(name && dnum > 0);
2994     if(dnum > ESTIDXDMAX) dnum = ESTIDXDMAX;
2995     CB_MALLOC(idx, sizeof(ESTIDX));
2996     if((omode & VL_OCREAT) && !est_mkdir(name) && errno != EEXIST) return NULL;
2997     if((omode & VL_OTRUNC) && (files = cbdirlist(name)) != NULL){
2998     for(i = 0; i < CB_LISTNUM(files); i++){
2999     sprintf(path, "%s%c%s", name, ESTPATHCHR, CB_LISTVAL(files, i, NULL));
3000     unlink(path);
3001     }
3002     cblistclose(files);
3003     }
3004     for(i = 0; i < dnum; i++){
3005     sprintf(path, "%s%c%04d", name, ESTPATHCHR, i + 1);
3006     if(!(idx->dbs[i] = vlopen(path, omode, VL_CMPLEX))){
3007     while(--i >= 0){
3008     vlclose(idx->dbs[i]);
3009     }
3010     return NULL;
3011     }
3012     }
3013     idx->name = cbmemdup(name, -1);
3014     idx->omode = omode;
3015     idx->dnum = dnum;
3016     idx->cdb = idx->dbs[dnum-1];
3017     return idx;
3018     }
3019    
3020    
3021     /* Close the inverted index.
3022     `idx' specifies an object of the inverted index.
3023     The return value is true if success, else it is false. */
3024     static int est_idx_close(ESTIDX *idx){
3025     int i, err;
3026     assert(idx);
3027     err = FALSE;
3028     for(i = 0; i < idx->dnum; i++){
3029     if(!vlclose(idx->dbs[i])) err = TRUE;
3030     }
3031     free(idx->name);
3032     free(idx);
3033     return err ? FALSE : TRUE;
3034     }
3035    
3036    
3037     /* Set the tuning parameters of the inverted index.
3038     `idx' specifies an object of the inverted index.
3039     Other parameters are same with `vlsettuning' of Villa. */
3040     static void est_idx_set_tuning(ESTIDX *idx, int lrecmax, int nidxmax, int lcnum, int ncnum){
3041     int i;
3042     assert(idx);
3043     for(i = 0; i < idx->dnum; i++){
3044     vlsettuning(idx->dbs[i], lrecmax, nidxmax, lcnum, ncnum);
3045     }
3046     }
3047    
3048    
3049     /* Increment the inverted index.
3050     `idx' specifies an object of the inverted index. */
3051     static void est_idx_increment(ESTIDX *idx){
3052     char path[ESTPATHBUFSIZ];
3053     if(idx->dnum >= ESTIDXDMAX){
3054     est_idx_set_current(idx);
3055     return;
3056     }
3057     sprintf(path, "%s%c%04d", idx->name, ESTPATHCHR, idx->dnum + 1);
3058     if((idx->dbs[idx->dnum] = vlopen(path, idx->omode | VL_OCREAT | VL_OTRUNC, VL_CMPLEX)) != NULL){
3059     idx->cdb = idx->dbs[idx->dnum];
3060     idx->dnum++;
3061     }
3062     }
3063    
3064    
3065     /* Add a record to the inverted index.
3066     `idx' specifies an object of the inverted index.
3067     `word' specifies a word.
3068     `vbuf' specifies the pointer to the value of a record.
3069     `vsiz' specifies the size of the value.
3070     The return value is true if success, else it is false. */
3071     static int est_idx_add(ESTIDX *idx, const char *word, int wsiz, const char *vbuf, int vsiz){
3072     assert(idx && word && wsiz >= 0 && vbuf && vsiz >= 0);
3073     return vlput(idx->cdb, word, wsiz, vbuf, vsiz, VL_DDUP);
3074     }
3075    
3076    
3077     /* Remove a record from the inverted index.
3078     `idx' specifies an object of the inverted index.
3079     `word' specifies a word.
3080     `wsiz' specifies the size of the word.
3081     The return value is true if success, else it is false. Even if no item correspongs, it is
3082     success. */
3083     static int est_idx_out(ESTIDX *idx, const char *word, int wsiz){
3084     int i, err;
3085     assert(idx && word && wsiz >= 0);
3086     err = FALSE;
3087     for(i = 0; i < idx->dnum; i++){
3088     if(!vloutlist(idx->dbs[i], word, wsiz) && dpecode != DP_ENOITEM) err = TRUE;
3089     }
3090     return err ? FALSE : TRUE;
3091     }
3092    
3093    
3094     /* Get a record from the inverted index.
3095     `idx' specifies an object of the inverted index.
3096     `word' specifies a word.
3097     `wsiz' specifies the size of the word.
3098     `sp' specifies the pointer to a variable to which the size of the region of the return value
3099     is assigned.
3100     The return value is the pointer to the region of the value of the corresponding record.
3101     if no item correspongs, empty region is returned. */
3102     static char *est_idx_get(ESTIDX *idx, const char *word, int wsiz, int *sp){
3103     CBDATUM *datum;
3104     char *vbuf;
3105     int i, vsiz;
3106     assert(idx && word && wsiz >= 0 && sp);
3107     datum = cbdatumopen("", 0);
3108     for(i = 0; i < idx->dnum; i++){
3109     if(!(vbuf = vlgetcat(idx->dbs[i], word, wsiz, &vsiz))) continue;
3110     cbdatumcat(datum, vbuf, vsiz);
3111     free(vbuf);
3112     }
3113     return cbdatumtomalloc(datum, sp);
3114     }
3115    
3116    
3117     /* Get the size of the value of a record in the inverted index.
3118     `idx' specifies an object of the inverted index.
3119     `word' specifies a word.
3120     `wsiz' specifies the size of the word.
3121     The return value is the size of the value of the corresponding record.
3122     if no item correspongs, 0 is returned. */
3123     static int est_idx_vsiz(ESTIDX *idx, const char *word, int wsiz){
3124     char *vbuf;
3125     int i, sum, vsiz;
3126     assert(idx && word && wsiz >= 0);
3127     sum = 0;
3128     for(i = 0; i < idx->dnum; i++){
3129     if(!(vbuf = vlgetcat(idx->dbs[i], word, wsiz, &vsiz))) continue;
3130     sum += vsiz;
3131     free(vbuf);
3132     }
3133     return sum;
3134     }
3135    
3136    
3137     /* Get the number of division of the inverted index.
3138     `idx' specifies an object of the inverted index.
3139     The return value is the number of division of the inverted index. */
3140     static int est_idx_num(ESTIDX *idx){
3141     assert(idx);
3142     return idx->dnum;
3143     }
3144    
3145    
3146     /* Get the size of the inverted index.
3147     `idx' specifies an object of the inverted index.
3148     The return value is the size of the inverted index. */
3149     static int est_idx_size(ESTIDX *idx){
3150     int i, size;
3151     assert(idx);
3152     size = 0;
3153     for(i = 0; i < idx->dnum; i++){
3154     size += vlfsiz(idx->dbs[i]);
3155     }
3156     return size;
3157     }
3158    
3159    
3160     /* Syncronize the inverted index.
3161     `idx' specifies an object of the inverted index.
3162     The return value is the size of the inverted index. */
3163     static int est_idx_sync(ESTIDX *idx){
3164     int i;
3165     assert(idx);
3166     for(i = 0; i < idx->dnum; i++){
3167     if(!vlsync(idx->dbs[i])) return FALSE;
3168     }
3169     return TRUE;
3170     }
3171    
3172    
3173     /* Optimize the inverted index.
3174     `idx' specifies an object of the inverted index.
3175     The return value is the size of the inverted index. */
3176     static int est_idx_optimize(ESTIDX *idx){
3177     int i;
3178     assert(idx);
3179     for(i = 0; i < idx->dnum; i++){
3180     if(!vloptimize(idx->dbs[i])) return FALSE;
3181     }
3182     return TRUE;
3183     }
3184    
3185    
3186     /* Set the current database to the smallest one in the inverted index.
3187     `idx' specifies an object of the inverted index. */
3188     static void est_idx_set_current(ESTIDX *idx){
3189     int i, size, min;
3190     assert(idx);
3191     min = vlfsiz(idx->cdb);
3192     for(i = 0; i < idx->dnum; i++){
3193     if((size = vlfsiz(idx->dbs[i])) < min){
3194     idx->cdb = idx->dbs[i];
3195     min = size;
3196     }
3197     }
3198     }
3199    
3200    
3201     /* Write meta data to the database.
3202     `db' specifies a database object.
3203     The return value is true if success, else it is false. */
3204     static int est_db_write_meta(ESTDB *db){
3205     char vbuf[ESTNUMBUFSIZ], *sbuf;
3206     int err, ssiz;
3207     assert(db);
3208     err = FALSE;
3209     sprintf(vbuf, "%d", est_idx_num(db->idxdb));
3210     if(!dpput(db->metadb, ESTKEYIDXNUM, -1, vbuf, -1, DP_DOVER)) err = TRUE;
3211     sprintf(vbuf, "%d", db->dseq);
3212     if(!dpput(db->metadb, ESTKEYDSEQ, -1, vbuf, -1, DP_DOVER)) err = TRUE;
3213     sprintf(vbuf, "%d", db->dnum);
3214     if(!dpput(db->metadb, ESTKEYDNUM, -1, vbuf, -1, DP_DOVER)) err = TRUE;
3215     sprintf(vbuf, "%d", db->amode);
3216     if(!dpput(db->metadb, ESTKEYAMODE, -1, vbuf, -1, DP_DOVER)) err = TRUE;
3217     if(db->metacc){
3218     sbuf = cbmapdump(db->metacc, &ssiz);
3219     if(!dpput(db->metadb, ESTKEYMETA, -1, sbuf, ssiz, DP_DOVER)) err = TRUE;
3220     free(sbuf);
3221     }
3222     if(err){
3223     db->ecode = ESTEDB;
3224     db->fatal = TRUE;
3225     }
3226     return err ? FALSE : TRUE;
3227     }
3228    
3229    
3230     /* Call the callback function of a database.
3231     `db' specifies a database object.
3232     `info' specifies an extra message. */
3233     static void est_db_inform(ESTDB *db, const char *info){
3234     char *msg;
3235     assert(db);
3236     if(!db->cbinfo) return;
3237     msg = cbsprintf("%s: name=%s dnum=%d wnum=%d fsiz=%.0f crnum=%d csiz=%.0f",
3238     info, db->name, db->dnum, vlrnum(db->fwmdb), (double)est_db_size(db),
3239     cbmaprnum(db->idxcc), (double)est_db_used_cache_size(db));
3240     db->cbinfo(msg);
3241     free(msg);
3242     }
3243    
3244    
3245     /* Get the size of used cache region.
3246     `db' specifies a database object.
3247     The return value is the size of used cache region. */
3248     static int est_db_used_cache_size(ESTDB *db){
3249     assert(db);
3250     return (db->icsiz + cbmaprnum(db->idxcc) * (sizeof(CBMAPDATUM) + ESTWORDAVGLEN)) * ESTMEMIRATIO;
3251     }
3252    
3253    
3254     /* Prepare cache for meta data.
3255     `db' specifies a database object. */
3256     static void est_db_prepare_meta(ESTDB *db){
3257     char *sbuf;
3258     int ssiz;
3259     assert(db);
3260     if((sbuf = dpget(db->metadb, ESTKEYMETA, -1, 0, -1, &ssiz)) != NULL){
3261     db->metacc = cbmapload(sbuf, ssiz);
3262     free(sbuf);
3263     } else {
3264     db->metacc = cbmapopenex(ESTMINIBNUM);
3265     }
3266     }
3267    
3268    
3269     /* Create a list of terms for search.
3270     `phrase' specifies a search phrase.
3271     The return value is a list object of the terms of the phrase. */
3272     static CBLIST *est_phrase_terms(const char *phrase){
3273     CBLIST *terms, *elems;
3274     CBDATUM *datum;
3275     const char *elem;
3276     char *tbuf, *pbuf;
3277     int i, tsiz, psiz, lw;
3278     assert(phrase);
3279     terms = cblistopen();
3280     tbuf = est_uconv_in(phrase, strlen(phrase), &tsiz);
3281     est_normalize_text((unsigned char *)tbuf, tsiz, &tsiz);
3282     pbuf = est_uconv_out(tbuf, tsiz, &psiz);
3283     elems = cbsplit(pbuf, psiz, "\a\b\t\n\v\f\r ");
3284     datum = cbdatumopen("", 0);
3285     lw = FALSE;
3286     for(i = 0; i < CB_LISTNUM(elems); i++){
3287     elem = CB_LISTVAL(elems, i, NULL);
3288     if(elem[0] == '\0') continue;
3289     if(!strcmp(elem, ESTOPUNION)){
3290     if(CB_DATUMSIZE(datum) < 1) continue;
3291     if(lw) cbdatumcat(datum, "\t", -1);
3292     lw = FALSE;
3293     } else if(!strcmp(elem, ESTOPISECT) || !strcmp(elem, ESTOPDIFF)){
3294     if(CB_DATUMSIZE(datum) < 1) continue;
3295     cblistpush(terms, CB_DATUMPTR(datum), CB_DATUMSIZE(datum));
3296     cbdatumsetsize(datum, 0);
3297     cblistpush(terms, elem, -1);
3298     lw = FALSE;
3299     } else {
3300     if(CB_DATUMSIZE(datum) > 0 && lw) cbdatumcat(datum, " ", 1);
3301     cbdatumcat(datum, elem, -1);
3302     lw = TRUE;
3303     }
3304     }
3305     if(CB_DATUMSIZE(datum) > 0) cblistpush(terms, CB_DATUMPTR(datum), CB_DATUMSIZE(datum));
3306     cbdatumclose(datum);
3307     cblistclose(elems);
3308     free(pbuf);
3309     free(tbuf);
3310     for(i = 0; i < CB_LISTNUM(terms); i++){
3311     elem = CB_LISTVAL(terms, i, NULL);
3312     if(!strcmp(elem, ESTOPUVSET) || !strcmp(elem, ESTOPISECT) ||
3313     !strcmp(elem, ESTOPDIFF)) continue;
3314     tbuf = est_uconv_in(elem, strlen(elem), &tsiz);
3315     est_canonicalize_text((unsigned char *)tbuf, tsiz, TRUE);
3316     pbuf = est_uconv_out(tbuf, tsiz, &psiz);
3317     cbstrtrim(pbuf);
3318     cblistover(terms, i, pbuf, -1);
3319     free(pbuf);
3320     free(tbuf);
3321     }
3322     for(i = CB_LISTNUM(terms) - 1; i >= 0; i--){
3323     elem = CB_LISTVAL(terms, i, NULL);
3324     if(strcmp(elem, ESTOPISECT) && strcmp(elem, ESTOPDIFF)) break;
3325     free(cblistpop(terms, NULL));
3326     }
3327     return terms;
3328     }
3329    
3330    
3331     /* Compare two scores by each ID.
3332     `ap' specifies the pointer to one score.
3333     `bp' specifies the pointer to the other score.
3334     The return value is negative if one is small, positive if one is big, 0 if both are equal. */
3335     static int est_score_compare_by_id(const void *ap, const void *bp){
3336     assert(ap && bp);
3337     return ((ESTSCORE *)ap)->id - ((ESTSCORE *)bp)->id;
3338     }
3339    
3340    
3341     /* Compare two scores by each score point.
3342     `ap' specifies the pointer to one score.
3343     `bp' specifies the pointer to the other score.
3344     The return value is negative if one is small, positive if one is big, 0 if both are equal. */
3345     static int est_score_compare_by_score(const void *ap, const void *bp){
3346     assert(ap && bp);
3347     return ((ESTSCORE *)bp)->score - ((ESTSCORE *)ap)->score;
3348     }
3349    
3350    
3351     /* Compare two scores by attributes of strings for ascending order.
3352     `ap' specifies the pointer to one score.
3353     `bp' specifies the pointer to the other score.
3354     The return value is negative if one is small, positive if one is big, 0 if both are equal. */
3355     static int est_score_compare_by_str_asc(const void *ap, const void *bp){
3356     assert(ap && bp);
3357     return strcmp(((ESTSCORE *)ap)->value, ((ESTSCORE *)bp)->value);
3358     }
3359    
3360    
3361     /* Compare two scores by attributes of strings for descending order.
3362     `ap' specifies the pointer to one score.
3363     `bp' specifies the pointer to the other score.
3364     The return value is negative if one is small, positive if one is big, 0 if both are equal. */
3365     static int est_score_compare_by_str_desc(const void *ap, const void *bp){
3366     assert(ap && bp);
3367     return strcmp(((ESTSCORE *)bp)->value, ((ESTSCORE *)ap)->value);
3368     }
3369    
3370    
3371     /* Compare two scores by attributes of numbers for ascending order.
3372     `ap' specifies the pointer to one score.
3373     `bp' specifies the pointer to the other score.
3374     The return value is negative if one is small, positive if one is big, 0 if both are equal. */
3375     static int est_score_compare_by_num_asc(const void *ap, const void *bp){
3376     assert(ap && bp);
3377     return (time_t)((ESTSCORE *)ap)->value - (time_t)((ESTSCORE *)bp)->value;
3378     }
3379    
3380    
3381     /* Compare two scores by attributes of numbers for descending order.
3382     `ap' specifies the pointer to one score.
3383     `bp' specifies the pointer to the other score.
3384     The return value is negative if one is small, positive if one is big, 0 if both are equal. */
3385     static int est_score_compare_by_num_desc(const void *ap, const void *bp){
3386     assert(ap && bp);
3387     return (time_t)((ESTSCORE *)bp)->value - (time_t)((ESTSCORE *)ap)->value;
3388     }
3389    
3390    
3391     /* Get the universal set of documents in a database.
3392     `db' specifies a database object.
3393     `nump' specifies the pointer to which the number of elements in the result is assigned.
3394     `hints' specifies a list object. If it is `NULL', it is not used.
3395     `add' specifies whether the result to be treated in union or difference.
3396     The return value is an array whose elements are ID numbers of corresponding documents. */
3397     static ESTSCORE *est_search_uvset(ESTDB *db, int *nump, CBMAP *hints, int add){
3398     ESTSCORE *scores;
3399     char *vbuf, numbuf[ESTNUMBUFSIZ];
3400     int snum, smax;
3401     assert(db && nump);
3402     smax = ESTALLOCUNIT;
3403     CB_MALLOC(scores, smax * sizeof(ESTSCORE));
3404     snum = 0;
3405     vlcurfirst(db->listdb);
3406     while((vbuf = vlcurval(db->listdb, NULL)) != NULL){
3407     if(snum >= smax){
3408     smax *= 2;
3409     CB_REALLOC(scores, smax * sizeof(ESTSCORE));
3410     }
3411     scores[snum].id = atoi(vbuf);
3412     scores[snum].score = 0;
3413     snum++;
3414     free(vbuf);
3415     vlcurnext(db->listdb);
3416     }
3417     *nump = snum;
3418     if(hints){
3419     sprintf(numbuf, "%d", snum * (add ? 1 : -1));
3420     cbmapput(hints, ESTOPUVSET, -1, numbuf, -1, FALSE);
3421     }
3422     return scores;
3423     }
3424    
3425    
3426     /* Expand a word to words which begins with it.
3427     `db' specifies a database object.
3428     `word' specifies a word.
3429     `list' specifies a list object to contain the results. */
3430     static void est_expand_word(ESTDB *db, const char *word, CBLIST *list){
3431     char *kbuf;
3432     int ksiz;
3433     assert(db && word && list);
3434     vlcurjump(db->fwmdb, word, -1, VL_JFORWARD);
3435     while((kbuf = vlcurkey(db->fwmdb, &ksiz)) != NULL){
3436     if(!cbstrfwmatch(kbuf, word)){
3437     free(kbuf);
3438     break;
3439     }
3440     cblistpushbuf(list, kbuf, ksiz);
3441     vlcurnext(db->fwmdb);
3442     }
3443     }
3444    
3445    
3446     /* Get a corresponding set of documents in a database.
3447     `db' specifies a database object.
3448     `term' specifies a union term.  It is split on tab characters and every word is searched.
3449     `gstep' specifies number of steps of N-gram.
3450     `nump' specifies the pointer to which the number of elements in the result is assigned.
3451     `hints' specifies a map object which receives, keyed by each word, the number of its hits as a decimal string.  If it is `NULL', it is not used.
3452     `add' specifies whether the result to be treated in union or difference.  In this function it only selects the sign of the number stored into `hints'.
3453     The return value is an array of score elements allocated with `CB_MALLOC', ordered by `est_score_compare_by_id' and holding one element per document ID. */
3454     static ESTSCORE *est_search_union(ESTDB *db, const char *term, int gstep,
3455     int *nump, CBMAP *hints, int add){
3456     ESTSCORE *scores, *tscores;
3457     CBLIST *words, *grams;
3458     const char *word, *gram, *rp, *fnext, *snext, *cbuf;
3459     char *vbuf, numbuf[ESTNUMBUFSIZ];
3460     int i, j, k, snum, smax, single, tsmax, tsnum, vsiz, gcnum, gsiz, csiz, wgstep, nnum;
3461     int mfsiz, mssiz, mfhash, mshash, tfhash, tshash, id, score, hit, hnum;
3462     assert(db && term && gstep > 0 && nump);
/* `scores' accumulates the hits of all words; it is doubled whenever it gets full */
3463     smax = ESTALLOCUNIT;
3464     CB_MALLOC(scores, smax * sizeof(ESTSCORE));
3465     snum = 0;
3466     words = cbsplit(term, -1, "\t");
3467     for(i = 0; i < CB_LISTNUM(words); i++){
3468     word = CB_LISTVAL(words, i, NULL);
3469     grams = cblistopen();
/* break the word into grams with the routine selected by `db->amode' */
3470     switch(db->amode){
3471     case ESTAMPERFNG:
3472     est_break_text_perfng(word, grams, TRUE, FALSE);
3473     break;
3474     default:
3475     est_break_text(word, grams, TRUE, FALSE);
3476     break;
3477     }
/* when no gram was produced, use the keys collected by `est_expand_word' instead;
   `single' then makes every record of those keys count as a hit */
3478     single = FALSE;
3479     if(CB_LISTNUM(grams) < 1){
3480     est_expand_word(db, word, grams);
3481     single = TRUE;
3482     }
/* `tscores' collects the hits of this word only */
3483     tsmax = ESTALLOCUNIT;
3484     CB_MALLOC(tscores, tsmax * sizeof(ESTSCORE));
3485     tsnum = 0;
3486     gcnum = 0; /* number of grams actually consulted */
/* `wgstep' is the stride used to walk the gram list */
3487     wgstep = CB_LISTNUM(grams) > 2 || gstep > 2 ? gstep : 1;
3488     if(((unsigned char *)word)[0] <= 0xdf && gstep <= 2) wgstep = 1;
3489     for(j = 0; j < CB_LISTNUM(grams); j += wgstep){
3490     gcnum++;
3491     gram = CB_LISTVAL2(grams, j, &gsiz);
/* hashes of the next and the second next gram; 0xff means there is no such gram */
3492     fnext = j < CB_LISTNUM(grams) - 1 ? CB_LISTVAL2(grams, j + 1, &mfsiz) : NULL;
3493     snext = j < CB_LISTNUM(grams) - 2 ? CB_LISTVAL2(grams, j + 2, &mssiz) : NULL;
3494     mfhash = fnext ? dpinnerhash(fnext, mfsiz) % ESTJHASHNUM + 1: 0xff;
3495     mshash = snext ? dpouterhash(snext, mssiz) % ESTJHASHNUM + 1: 0xff;
/* fetch the record of the gram and append the data held for it in the `db->idxcc' map */
3496     vbuf = est_idx_get(db->idxdb, gram, gsiz, &vsiz);
3497     if((cbuf = cbmapget(db->idxcc, gram, gsiz, &csiz)) != NULL){
3498     if(vbuf){
3499     CB_REALLOC(vbuf, vsiz + csiz + 100);
3500     memcpy(vbuf + vsiz, cbuf, csiz);
3501     vsiz += csiz;
3502     } else {
3503     vbuf = cbmemdup(cbuf, csiz);
3504     vsiz = csiz;
3505     }
3506     }
3507     if(!vbuf) continue;
/* the buffer is read as a sequence of entries: an `int' ID, one score byte, then pairs of
   hash bytes; a 0x00 byte after a pair closes the entry */
3508     rp = vbuf;
3509     while(rp < vbuf + vsiz){
3510     memcpy(&id, rp, sizeof(int));
3511     rp += sizeof(int);
3512     score = *(unsigned char *)rp;
3513     rp++;
3514     hit = mfhash == 0xff && mshash == 0xff; /* nothing to verify when no gram follows */
3515     while(rp < vbuf + vsiz){
3516     tfhash = *(unsigned char *)rp;
3517     rp++;
3518     tshash = *(unsigned char *)rp;
3519     rp++;
/* the entry hits when one of its pairs agrees with the hashes of the following grams */
3520     if((mfhash == 0xff || mfhash == tfhash) && (mshash == 0xff || mshash == tshash))
3521     hit = TRUE;
3522     if(*(unsigned char *)rp == 0x00){
3523     rp++;
3524     break;
3525     }
3526     }
3527     if(hit || single){
3528     if(tsnum >= tsmax){
3529     tsmax *= 2;
3530     CB_REALLOC(tscores, tsmax * sizeof(ESTSCORE));
3531     }
3532     tscores[tsnum].id = id;
3533     tscores[tsnum].score = score * 100;
3534     tsnum++;
3535     }
3536     }
3537     free(vbuf);
3538     }
/* more than one gram was consulted: sort the hits, then keep an ID only if it was hit at
   least `gcnum' times (or always when `single'), storing the average of its scores */
3539     if(gcnum > 1){
3540     qsort(tscores, tsnum, sizeof(ESTSCORE), est_score_compare_by_id);
3541     nnum = 0;
3542     for(j = 0; j < tsnum; j++){
3543     id = tscores[j].id;
3544     score = tscores[j].score;
3545     hnum = 1;
3546     for(k = j + 1; k < tsnum && tscores[k].id == id; k++){
3547     score += tscores[k].score;
3548     hnum++;
3549     }
3550     if(hnum >= gcnum || single){
3551     tscores[nnum].id = id;
3552     tscores[nnum].score = score / hnum;
3553     nnum++;
3554     }
3555     j = k - 1; /* continue after the run of elements sharing this ID */
3556     }
3557     tsnum = nnum;
3558     }
/* report the number of hits of this word, negated when `add' is false */
3559     if(hints){
3560     sprintf(numbuf, "%d", tsnum * (add ? 1 : -1));
3561     cbmapput(hints, word, -1, numbuf, -1, FALSE);
3562     }
/* append the hits of this word to the overall result */
3563     for(j = 0; j < tsnum; j++){
3564     if(snum >= smax){
3565     smax *= 2;
3566     CB_REALLOC(scores, smax * sizeof(ESTSCORE));
3567     }
3568     scores[snum].id = tscores[j].id;
3569     scores[snum].score = tscores[j].score;
3570     snum++;
3571     }
3572     free(tscores);
3573     cblistclose(grams);
3574     }
3575     cblistclose(words);
/* merge the hits of all words: one element per ID carrying the average of its scores */
3576     qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_id);
3577     nnum = 0;
3578     for(i = 0; i < snum; i++){
3579     id = scores[i].id;
3580     score = scores[i].score;
3581     hnum = 1;
3582     for(j = i + 1; j < snum && scores[j].id == id; j++){
3583     score += scores[j].score;
3584     hnum++;
3585     }
3586     scores[nnum].id = id;
3587     scores[nnum].score = score / hnum;
3588     nnum++;
3589     i = j - 1; /* continue after the run of elements sharing this ID */
3590     }
3591     *nump = nnum;
3592     return scores;
3593     }
3594    
3595    
3596     /* Narrow and sort scores of search candidates.
3597     `db' specifies a database object.
3598     `attrs' specifies a list object of narrowing attributes.  Each element is an expression made of an attribute name, an operator and a value separated by spaces.  If it is `NULL', no narrowing is performed.
3599     `order' specifies an expression for sorting: an attribute name optionally followed by a space and an order type.  Without a type, `ESTORDSTRA' is used.  If it is `NULL', no sorting is performed.
3600     `scores' specifies an array of scores of search candidates.  It is compacted and sorted in place.
3601     `snum' specifies the number of the array.
3602     The return value is the new number of the array.
         NOTE(review): after sorting by `ESTORDSTRA' or `ESTORDSTRD' the `value' members are freed at the end but not reset, so they must not be read by the caller -- confirm against callers. */
3603     static int est_narrow_scores(ESTDB *db, const CBLIST *attrs, const char *order,
3604     ESTSCORE *scores, int snum){
3605     ESTCATTR *list;
3606     const char *otype, *cbuf, *rp, *pv, *ibuf;
3607     unsigned char *utmp;
3608     char *oname, *wp, *mbuf, *vbuf;
3609     int i, j, k, ci, oi, anum, tsiz, nnum, csiz, msiz, miss, vsiz, num, isiz, onlen;
3610     time_t tval;
3611     assert(db && scores && snum >= 0);
3612     ci = -1; /* index in `list' of the condition on the cached attribute, if any */
3613     oi = -1; /* index in `list' of the condition on the ordering attribute, if any */
3614     oname = NULL;
3615     otype = NULL;
/* split the ordering expression into the attribute name and the order type;
   `otype' points into the `oname' buffer when a type is given */
3616     if(order){
3617     oname = cbmemdup(order, -1);
3618     cbstrtrim(oname);
3619     otype = ESTORDSTRA;
3620     if((wp = strchr(oname, ' ')) != NULL){
3621     *wp = '\0';
3622     rp = wp + 1;
3623     while(*rp == ' '){
3624     rp++;
3625     }
3626     otype = rp;
3627     }
3628     }
3629     if(attrs){
3630     anum = CB_LISTNUM(attrs);
/* presumably the extra byte keeps the requested size non-zero for an empty list */
3631     CB_MALLOC(list, sizeof(ESTCATTR) * anum + 1);
/* first pass: cut every expression into name, operator and value */
3632     for(i = 0; i < anum; i++){
3633     list[i].name = NULL;
3634     list[i].oper = NULL;
3635     list[i].val = NULL;
3636     rp = CB_LISTVAL(attrs, i, NULL);
3637     while(*rp > 0 && *rp <= ' '){
3638     rp++;
3639     }
3640     if((pv = strchr(rp, ' ')) != NULL){
3641     list[i].nsiz = pv - rp;
3642     list[i].name = cbmemdup(rp, list[i].nsiz);
3643     rp = pv;
3644     while(*rp > 0 && *rp <= ' '){
3645     rp++;
3646     }
3647     if((pv = strchr(rp, ' ')) != NULL){
3648     list[i].oper = cbmemdup(rp, pv - rp);
3649     rp = pv;
3650     while(*rp > 0 && *rp <= ' '){
3651     rp++;
3652     }
3653     list[i].vsiz = strlen(rp);
3654     list[i].val = cbmemdup(rp, list[i].vsiz);
3655     } else {
3656     list[i].oper = cbmemdup(rp, -1);
3657     }
3658     } else {
3659     list[i].nsiz = strlen(rp);
3660     list[i].name = cbmemdup(rp, list[i].nsiz);
3661     }
/* missing parts become empty strings so that later code never sees `NULL' */
3662     if(!list[i].oper){
3663     list[i].oper = cbmemdup("", 0);
3664     }
3665     if(!list[i].val){
3666     list[i].vsiz = 0;
3667     list[i].val = cbmemdup("", 0);
3668     }
3669     }
/* second pass: interpret the operator.  A leading `!' negates the condition, a following
   `I' or `i' prepares a normalized copy of the value in `sval', and the rest is mapped to
   one of the operator constants (`NULL' if unknown) */
3670     for(i = 0; i < anum; i++){
3671     rp = list[i].oper;
3672     if(*rp == '!'){
3673     list[i].sign = FALSE;
3674     rp++;
3675     } else {
3676     list[i].sign = TRUE;
3677     }
3678     if(*rp == 'I' || *rp == 'i'){
3679     utmp = (unsigned char *)est_uconv_in(list[i].val, list[i].vsiz, &tsiz);
3680     est_normalize_text(utmp, tsiz, &tsiz);
3681     est_canonicalize_text(utmp, tsiz, FALSE);
3682     list[i].sval = (char *)est_uconv_out((char *)utmp, tsiz, &(list[i].ssiz));
3683     free(utmp);
3684     rp++;
3685     } else {
3686     list[i].sval = NULL;
3687     list[i].ssiz = 0;
3688     }
3689     list[i].num = cbstrmktime(list[i].val); /* value in the form used by numeric operators */
3690     if(!cbstricmp(rp, ESTOPSTREQ)){
3691     list[i].cop = ESTOPSTREQ;
3692     } else if(!cbstricmp(rp, ESTOPSTRNE)){
3693     list[i].cop = ESTOPSTRNE;
3694     } else if(!cbstricmp(rp, ESTOPSTRINC)){
3695     list[i].cop = ESTOPSTRINC;
3696     } else if(!cbstricmp(rp, ESTOPSTRBW)){
3697     list[i].cop = ESTOPSTRBW;
3698     } else if(!cbstricmp(rp, ESTOPSTREW)){
3699     list[i].cop = ESTOPSTREW;
3700     } else if(!cbstricmp(rp, ESTOPNUMEQ)){
3701     list[i].cop = ESTOPNUMEQ;
3702     } else if(!cbstricmp(rp, ESTOPNUMNE)){
3703     list[i].cop = ESTOPNUMNE;
3704     } else if(!cbstricmp(rp, ESTOPNUMGT)){
3705     list[i].cop = ESTOPNUMGT;
3706     } else if(!cbstricmp(rp, ESTOPNUMGE)){
3707     list[i].cop = ESTOPNUMGE;
3708     } else if(!cbstricmp(rp, ESTOPNUMLT)){
3709     list[i].cop = ESTOPNUMLT;
3710     } else if(!cbstricmp(rp, ESTOPNUMLE)){
3711     list[i].cop = ESTOPNUMLE;
3712     } else {
3713     list[i].cop = NULL;
3714     }
3715     }
/* find the first condition on the attribute named `db->scname', whose values are kept
   in the `db->spacc' map */
3716     if(db->spacc){
3717     for(i = 0; i < anum; i++){
3718     if(!strcmp(list[i].name, db->scname)){
3719     ci = i;
3720     break;
3721     }
3722     }
3723     }
/* find the first condition on the ordering attribute, so that its value can be reused */
3724     if(oname){
3725     for(i = 0; i < anum; i++){
3726     if(!strcmp(list[i].name, oname)){
3727     oi = i;
3728     break;
3729     }
3730     }
3731     }
/* filter the candidates; survivors are moved to the front of `scores' */
3732     nnum = 0;
3733     for(i = 0; i < snum; i++){
3734     scores[i].value = NULL;
3735     if(ci >= 0){
3736     if((cbuf = cbmapget(db->spacc, (char *)&(scores[i].id), sizeof(int), &csiz)) != NULL)
3737     cbmapmove(db->spacc, (char *)&(scores[i].id), sizeof(int), FALSE);
3738     } else {
3739     cbuf = NULL;
3740     csiz = 0;
3741     }
/* the stored value alone suffices only when it serves the single condition; otherwise the
   attribute record is read from `db->attrdb'.  A candidate with neither is dropped */
3742     mbuf = NULL;
3743     if((cbuf && anum == 1) ||
3744     (mbuf = crget(db->attrdb, (char *)&(scores[i].id), sizeof(int), 0, -1, &msiz)) != NULL){
3745     miss = FALSE;
3746     for(j = 0; !miss && j < anum; j++){
3747     if(list[j].nsiz < 1) continue;
/* a stored value made of one NUL byte stands for "the document has no such attribute" */
3748     if(mbuf){
3749     vbuf = cbmaploadone(mbuf, msiz, list[j].name, list[j].nsiz, &vsiz);
3750     } else if(csiz != 1 || cbuf[0] != '\0'){
3751     vbuf = cbmemdup(cbuf, csiz);
3752     vsiz = csiz;
3753     } else {
3754     vbuf = NULL;
3755     }
/* an empty operator only requires the attribute to exist */
3756     if(list[j].oper[0] == '\0'){
3757     if(!vbuf) miss = TRUE;
3758     } else {
3759     if(!vbuf){
3760     vbuf = cbmemdup("", 0);
3761     vsiz = 0;
3762     }
3763     if(!est_match_attr(vbuf, vsiz, list[j].cop, list[j].sign, list[j].val, list[j].vsiz,
3764     list[j].sval, list[j].ssiz, list[j].num))
3765     miss = TRUE;
3766     }
/* store the value just loaded; once the map holds more than `db->scmnum' records, remove
   `db->scmnum * 0.1 + 1' records taken from the start of the iteration */
3767     if(j == ci && !cbuf){
3768     if(vbuf){
3769     cbmapput(db->spacc, (char *)&(scores[i].id), sizeof(int), vbuf, vsiz, FALSE);
3770     } else {
3771     cbmapput(db->spacc, (char *)&(scores[i].id), sizeof(int), "", 1, FALSE);
3772     }
3773     if(cbmaprnum(db->spacc) > db->scmnum){
3774     num = db->scmnum * 0.1 + 1;
3775     cbmapiterinit(db->spacc);
3776     for(k = 0; k < num && (ibuf = cbmapiternext(db->spacc, &isiz)) != NULL; k++){
3777     cbmapout(db->spacc, ibuf, isiz);
3778     }
3779     }
3780     }
/* keep the value of the ordering attribute for the sorting phase */
3781     if(j == oi){
3782     scores[i].value = vbuf;
3783     } else {
3784     free(vbuf);
3785     }
3786     }
3787     if(miss){
3788     free(scores[i].value);
3789     } else {
3790     scores[nnum++] = scores[i];
3791     }
3792     }
3793     free(mbuf);
3794     }
3795     snum = nnum;
3796     for(i = 0; i < anum; i++){
3797     free(list[i].sval);
3798     free(list[i].val);
3799     free(list[i].oper);
3800     free(list[i].name);
3801     }
3802     free(list);
3803     } else {
3804     for(i = 0; i < snum; i++){
3805     scores[i].value = NULL;
3806     }
3807     }
/* sorting phase: make sure every element has the value of the ordering attribute */
3808     if(oname){
3809     ci = db->spacc && !strcmp(oname, db->scname); /* from here `ci' is a flag: ordering attribute is the cached one */
3810     onlen = strlen(oname);
3811     for(i = 0; i < snum; i++){
3812     if(scores[i].value) continue; /* already obtained while narrowing */
3813     if(ci && (cbuf = cbmapget(db->spacc, (char *)&(scores[i].id), sizeof(int), &csiz)) != NULL){
3814     cbmapmove(db->spacc, (char *)&(scores[i].id), sizeof(int), FALSE);
3815     if(csiz == 1 && cbuf[0] == '\0'){
3816     scores[i].value = cbmemdup("", 0);
3817     } else {
3818     scores[i].value = cbmemdup(cbuf, csiz);
3819     }
3820     continue;
3821     }
/* otherwise read the attribute record; a missing attribute or record gives an empty string */
3822     if((mbuf = crget(db->attrdb, (char *)&(scores[i].id), sizeof(int), 0, -1, &msiz)) != NULL){
3823     if((vbuf = cbmaploadone(mbuf, msiz, oname, onlen, &vsiz)) != NULL){
3824     if(ci) cbmapput(db->spacc, (char *)&(scores[i].id), sizeof(int), vbuf, vsiz, FALSE);
3825     scores[i].value = vbuf;
3826     } else {
3827     if(ci) cbmapput(db->spacc, (char *)&(scores[i].id), sizeof(int), "", 1, FALSE);
3828     scores[i].value = cbmemdup("", 0);
3829     }
3830     if(ci && cbmaprnum(db->spacc) > db->scmnum){
3831     num = db->scmnum * 0.1 + 1;
3832     cbmapiterinit(db->spacc);
3833     for(j = 0; j < num && (ibuf = cbmapiternext(db->spacc, &isiz)) != NULL; j++){
3834     cbmapout(db->spacc, ibuf, isiz);
3835     }
3836     }
3837     free(mbuf);
3838     } else {
3839     scores[i].value = cbmemdup("", 0);
3840     }
3841     }
/* sort by the order type; an unknown type leaves the order unchanged.  For the numeric
   types the string is replaced by the result of `cbstrmktime' stored in the pointer
   member itself, which is reset to `NULL' after sorting */
3842     if(!cbstricmp(otype, ESTORDSTRA)){
3843     qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_str_asc);
3844     } else if(!cbstricmp(otype, ESTORDSTRD)){
3845     qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_str_desc);
3846     } else if(!cbstricmp(otype, ESTORDNUMA)){
3847     for(i = 0; i < snum; i++){
3848     tval = cbstrmktime(scores[i].value);
3849     free(scores[i].value);
3850     scores[i].value = (void *)tval;
3851     }
3852     qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_num_asc);
3853     for(i = 0; i < snum; i++){
3854     scores[i].value = NULL;
3855     }
3856     } else if(!cbstricmp(otype, ESTORDNUMD)){
3857     for(i = 0; i < snum; i++){
3858     tval = cbstrmktime(scores[i].value);
3859     free(scores[i].value);
3860     scores[i].value = (void *)tval;
3861     }
3862     qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_num_desc);
3863     for(i = 0; i < snum; i++){
3864     scores[i].value = NULL;
3865     }
3866     }
/* release the remaining strings (the pointers themselves are left as they are) */
3867     for(i = 0; i < snum; i++){
3868     free(scores[i].value);
3869     }
3870     free(oname);
3871     }
3872     return snum;
3873     }
3874    
3875    
3876     /* Check whether a score matches an attribute condition.
3877     `tval' specifies the target value;
3878     `tsiz' specifies the size of the target value
3879     `oval' specifies the operation value;
3880     `osiz' specifies the size of the operation value
3881     `sval' specifies the operation value of small cases;
3882     `ssiz' specifies the size of the operation value of small cases.
3883     `onum' specifies the numeric value.
3884     The return value is true if it does match, else it is false. */
3885     static int est_match_attr(const char *tval, int tsiz, const char *cop, int sign,
3886     const char *oval, int osiz, const char *sval, int ssiz, int onum){
3887     unsigned char *eval;
3888     char *cval;
3889     int csiz, esiz, hit;
3890     assert(tval && tsiz >= 0 && oval && osiz >= 0);
3891     cval = NULL;
3892     if(sval){
3893     eval = (unsigned char *)est_uconv_in(tval, tsiz, &esiz);
3894     est_normalize_text(eval, esiz, &esiz);
3895     est_canonicalize_text(eval, esiz, FALSE);
3896     cval = (char *)est_uconv_out((char *)eval, esiz, &csiz);
3897     free(eval);
3898     tval = cval;
3899     tsiz = csiz;
3900     oval = sval;
3901     osiz = ssiz;
3902     }
3903     if(cop == ESTOPSTREQ){
3904     hit = !strcmp(tval, oval);
3905     } else if(cop == ESTOPSTRNE){
3906     hit = strcmp(tval, oval) != 0;
3907     } else if(cop == ESTOPSTRINC){
3908     hit = strstr(tval, oval) != NULL;
3909     } else if(cop == ESTOPSTRBW){
3910     hit = cbstrfwmatch(tval, oval);
3911     } else if(cop == ESTOPSTREW){
3912     hit = cbstrbwmatch(tval, oval);
3913     } else if(cop == ESTOPNUMEQ){
3914     hit = cbstrmktime(tval) == onum;
3915     } else if(cop == ESTOPNUMNE){
3916     hit = cbstrmktime(tval) != onum;
3917     } else if(cop == ESTOPNUMGT){
3918     hit = cbstrmktime(tval) > onum;
3919     } else if(cop == ESTOPNUMGE){
3920     hit = cbstrmktime(tval) >= onum;
3921     } else if(cop == ESTOPNUMLT){
3922     hit = cbstrmktime(tval) < onum;
3923     } else if(cop == ESTOPNUMLE){
3924     hit = cbstrmktime(tval) <= onum;
3925     } else {
3926     hit = FALSE;
3927     }
3928     free(cval);
3929     return sign ? hit : !hit;
3930     }
3931    
3932    
3933     /* Compare two keywords by scores in descending order.
3934     `ap' specifies the pointer to one keyword.
3935     `bp' specifies the pointer to the other keyword.
3936     The return value is negative if one is small, positive if one is big, 0 if both are equal. */
3937     static int est_keysc_compare(const void *ap, const void *bp){
3938     assert(ap && bp);
3939     return ((ESTKEYSC *)bp)->pt - ((ESTKEYSC *)ap)->pt;
3940     }
3941    
3942    
3943     /* Get a similar set of documents in a database.
3944     `db' specifies a database object.
3945     `svmap' specifies a map object of a seed vector.
3946     `nump' specifies the pointer to which the number of elements in the result is assigned.
3947     `knum' specifies the number of keywords to get candidates.
3948     `unum' specifies the number of adopted documents for a keyword.
3949     `tfidf' specifies whether to perform TF-IDF tuning.
3950     `nmin' specifies the minimum value for narrowing.
3951     The return value is an array whose elements are ID numbers of similar documents. */
3952     static ESTSCORE *est_search_similar(ESTDB *db, CBMAP *svmap, int *nump,
3953     int knum, int unum, int tfidf, double nmin){
3954     ESTSCORE *scores, *tscores;
3955     CBMAP *tvmap;
3956     const char *word;
3957     int i, j, vnum, snum, tmax, tsnum, nnum, lid, *svec, *tvec;
3958     double dval;
3959     assert(db && svmap && nump && knum >= 0 && unum >= 0 && nmin >= 0.0);
3960     CB_MALLOC(scores, sizeof(ESTSCORE) * unum * knum);
3961     snum = 0;
3962     if((vnum = cbmaprnum(svmap)) < 1) vnum = 1;
3963     cbmapiterinit(svmap);
3964     tmax = unum;
3965     for(i = 0; i < knum && (word = cbmapiternext(svmap, NULL)) != NULL; i++){
3966     tscores = est_search_union(db, word, 1, &tsnum, NULL, TRUE);
3967     qsort(tscores, tsnum, sizeof(ESTSCORE), est_score_compare_by_score);
3968     for(j = 0; j < tmax && j < tsnum; j++){
3969     scores[snum].id = tscores[j].id;
3970     scores[snum].score = tscores[j].score;
3971     snum++;
3972     }
3973     free(tscores);
3974     tmax -= unum / knum / 1.25;
3975     }
3976     qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_id);
3977     nnum = 0;
3978     lid = -1;
3979     CB_MALLOC(svec, vnum * sizeof(int));
3980     CB_MALLOC(tvec, vnum * sizeof(int));
3981     est_set_svec(svmap, svec, vnum);
3982     for(i = 0; i < snum; i++){
3983     if(scores[i].id != lid){
3984     tvmap = NULL;
3985     if(db->cbvec) tvmap = db->cbvec(db, scores[i].id, db->vecdata);
3986     if(!tvmap) tvmap = est_get_tvmap(db, scores[i].id, vnum, tfidf);
3987     if(tvmap){
3988     est_set_tvec(svmap, tvmap, tvec, vnum);
3989     if((dval = est_vec_cos(svec, tvec, vnum)) >= nmin){
3990     scores[nnum].id = scores[i].id;
3991     scores[nnum].score = (int)(dval * 10000);
3992     if(scores[nnum].score == 9999) scores[nnum].score = 10000;
3993     nnum++;
3994     }
3995     cbmapclose(tvmap);
3996     }
3997     }
3998     lid = scores[i].id;
3999     }
4000     free(tvec);
4001     free(svec);
4002     snum = nnum;
4003     *nump = snum;
4004     return scores;
4005     }
4006    
4007    
4008     /* Create a map object of a vector for similar search from a phrase.
4009     `phrase' specifies a search phrase for similar search.
4010     The return value is a map object of the seed vector. */
4011     static CBMAP *est_phrase_vector(const char *phrase){
4012     CBMAP *svmap;
4013     CBLIST *list;
4014     const char *pv, *rp;
4015     char *utext, *rtext;
4016     int i, num, len, size;
4017     svmap = cbmapopenex(ESTMINIBNUM);
4018     list = cblistopen();
4019     while(*phrase != '\0'){
4020     if(*phrase == ESTOPWITH[0] && cbstrfwmatch(phrase, ESTOPWITH)){
4021     phrase += strlen(ESTOPWITH);
4022     pv = phrase;
4023     while(*phrase != '\0'){
4024     if(*phrase <= ' ' && cbstrfwmatch(phrase + 1, ESTOPWITH)){
4025     phrase++;
4026     break;
4027     }
4028     phrase++;
4029     }
4030     cblistpush(list, pv, phrase - pv);
4031     } else {
4032     phrase++;
4033     }
4034     }
4035     for(i = 0; i < CB_LISTNUM(list); i++){
4036     pv = CB_LISTVAL(list, i, NULL);
4037     while(*pv > '\0' && *pv <= ' '){
4038     pv++;
4039     }
4040     num = strtol(pv, (char **)&rp, 10);
4041     if(rp && (len = rp - pv) > 0 && num >= 0){
4042     utext = est_uconv_in(rp, strlen(rp), &size);
4043     est_normalize_text((unsigned char *)utext, size, &size);
4044     est_canonicalize_text((unsigned char *)utext, size, FALSE);
4045     rtext = est_uconv_out(utext, size, NULL);
4046     cbstrsqzspc(rtext);
4047     if(rtext[0] != '\0') cbmapput(svmap, rtext, -1, pv, len, FALSE);
4048     free(rtext);
4049     free(utext);
4050     }
4051     }
4052     cblistclose(list);
4053     return svmap;
4054     }
4055    
4056    
4057     /* Get the target vector of a document dynamically.
4058     `db' specifies a database object.
4059     `id' specifies the ID of a document.
4060     `vnum' specifies the number of dimensions of the vector.
4061     `tfidf' specifies whether to perform TF-IDF tuning.
4062     The return value is a map object of the target vector. */
4063     static CBMAP *est_get_tvmap(ESTDB *db, int id, int vnum, int tfidf){
4064     ESTDOC *doc;
4065     CBMAP *tvmap;
4066     assert(db && id > 0);
4067     if(!(doc = est_db_get_doc(db, id, 0))) return NULL;
4068     tvmap = est_db_etch_doc(tfidf ? db : NULL, doc, vnum);
4069     est_doc_delete(doc);
4070     return tvmap;
4071     }
4072    
4073    
4074     /* Set a seed vector from a map object.
4075     `svmap' specifies a map object of a seed vector.
4076     `svec' specifies a vector object.
4077     `vnum' specifies the number of dimensions of the vector. */
4078     static void est_set_svec(CBMAP *svmap, int *svec, int vnum){
4079     const char *kbuf;
4080     int i, ksiz;
4081     assert(svmap && svec && vnum > 0);
4082     cbmapiterinit(svmap);
4083     for(i = 0; i < vnum; i++){
4084     if((kbuf = cbmapiternext(svmap, &ksiz)) != NULL){
4085     svec[i] = atoi(cbmapget(svmap, kbuf, ksiz, NULL));
4086     } else {
4087     svec[i] = 0;
4088     }
4089     }
4090     }
4091    
4092    
4093     /* Set a target vector from a map object.
4094     `svmap' specifies a map object of a seed vector.
4095     `tvmap' specifies a map object of a target vector.
4096     `tvec' specifies a vector object.
4097     `vnum' specifies the number of dimensions of the vector. */
4098     static void est_set_tvec(CBMAP *svmap, CBMAP *tvmap, int *tvec, int vnum){
4099     const char *kbuf, *vbuf;
4100     int i, ksiz;
4101     assert(svmap && tvmap && tvec && vnum > 0);
4102     cbmapiterinit(svmap);
4103     for(i = 0; i < vnum; i++){
4104     if((kbuf = cbmapiternext(svmap, &ksiz)) != NULL){
4105     vbuf = cbmapget(tvmap, kbuf, ksiz, NULL);
4106     tvec[i] = vbuf ? atoi(vbuf) : 0;
4107     } else {
4108     tvec[i] = 0;
4109     }
4110     }
4111     }
4112    
4113    
4114     /* Get the absolute of a vector.
4115     `vec' specifies a vector object.
4116     `vnum' specifies the number of dimensions of the vector.
4117     The return value is the absolute of the vector. */
4118     static double est_vec_abs(const int *vec, int vnum){
4119     double rv;
4120     int i;
4121     assert(vec && vnum >= 0);
4122     rv = 0;
4123     for(i = 0; i < vnum; i++){
4124     rv += (double)vec[i] * (double)vec[i];
4125     }
4126     return sqrt(rv);
4127     }
4128    
4129    
/* Get the inner product of two vectors.
   `avec' specifies a vector object.
   `bvec' specifies the other vector object.
   `vnum' specifies the number of dimensions of the vector.
   The return value is the inner product of two vectors, that is, the sum of the products
   of the elements at the same position. */
static double est_vec_iprod(const int *avec, const int *bvec, int vnum){
  double sum;
  int idx;
  assert(avec && bvec && vnum >= 0);
  sum = 0;
  idx = 0;
  while(idx < vnum){
    sum += (double)avec[idx] * (double)bvec[idx];
    idx++;
  }
  return sum;
}
4145    
4146    
/* Get the cosine of the angle of two vectors.
   `avec' specifies a vector object.
   `bvec' specifies the other vector object.
   `vnum' specifies the number of dimensions of the vector.
   The return value is the cosine of the angle of two vectors.  It is 0.0 if the cosine is
   negative or if either vector has no magnitude. */
static double est_vec_cos(const int *avec, const int *bvec, int vnum){
  double denom, rv;
  assert(avec && bvec && vnum >= 0);
  denom = est_vec_abs(avec, vnum) * est_vec_abs(bvec, vnum);
  /* the angle is undefined for a zero vector: answer 0.0 rather than divide by zero */
  if(!(denom > 0.0)) return 0.0;
  rv = est_vec_iprod(avec, bvec, vnum) / denom;
  return rv > 0.0 ? rv : 0.0;
}
4159    
4160    
4161     /* Close the handle to the file of random number generator. */
4162     static void est_random_fclose(void){
4163     if(est_random_ifp) fclose(est_random_ifp);
4164     }
4165    
4166    
4167    
4168     /* END OF FILE */

  ViewVC Help
Powered by ViewVC 1.1.26