1 |
/************************************************************************************************* |
2 |
* The core API of Hyper Estraier |
3 |
* Copyright (C) 2004-2005 Mikio Hirabayashi |
4 |
* This file is part of Hyper Estraier. |
5 |
* Hyper Estraier is free software; you can redistribute it and/or modify it under the terms of |
6 |
* the GNU Lesser General Public License as published by the Free Software Foundation; either |
7 |
* version 2.1 of the License or any later version. Hyper Estraier is distributed in the hope |
8 |
* that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of |
9 |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public |
10 |
* License for more details. |
11 |
* You should have received a copy of the GNU Lesser General Public License along with Hyper |
12 |
* Estraier; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, |
13 |
* Boston, MA 02111-1307 USA. |
14 |
*************************************************************************************************/ |
15 |
|
16 |
|
17 |
#ifndef _ESTRAIER_H /* duplication check */ |
18 |
#define _ESTRAIER_H |
19 |
|
20 |
#if defined(__cplusplus) /* export for C++ */ |
21 |
extern "C" { |
22 |
#endif |
23 |
|
24 |
|
25 |
|
26 |
/************************************************************************************************* |
27 |
* common settings |
28 |
*************************************************************************************************/ |
29 |
|
30 |
|
31 |
/* version of Hyper Estraier */ |
32 |
extern const char *est_version; |
33 |
|
34 |
|
35 |
|
36 |
/************************************************************************************************* |
37 |
* underlying headers |
38 |
*************************************************************************************************/ |
39 |
|
40 |
|
41 |
#include <depot.h> |
42 |
#include <curia.h> |
43 |
#include <cabin.h> |
44 |
#include <villa.h> |
45 |
#include <stdlib.h> |
46 |
|
47 |
|
48 |
|
49 |
/************************************************************************************************* |
50 |
* API for document |
51 |
*************************************************************************************************/ |
52 |
|
53 |
|
54 |
/* Names of the predefined document attributes.  Every name begins with `@'. */
#define ESTDATTRID "@id" /* name of the attribute of ID */ |
55 |
#define ESTDATTRURI "@uri" /* name of the attribute of URI (est_db_put_doc expects a document to have the URI attribute) */ |
56 |
#define ESTDATTRCDATE "@cdate" /* name of the attribute of creation date */ |
57 |
#define ESTDATTRMDATE "@mdate" /* name of the attribute of modification date */ |
58 |
#define ESTDATTRTITLE "@title" /* name of the attribute of title */ |
59 |
#define ESTDATTRAUTHOR "@author" /* name of the attribute of author */ |
60 |
#define ESTDATTRTYPE "@type" /* name of the attribute of content type */ |
61 |
#define ESTDATTRLANG "@lang" /* name of the attribute of language */ |
62 |
#define ESTDATTRSIZE "@size" /* name of the attribute of entity size */ |
63 |
|
64 |
/* A document: an ID number, a map of named attributes and a list of text sentences.
   NOTE(review): there is no dedicated member for hidden text added with
   `est_doc_add_hidden_text'; where it is kept is not visible here -- confirm. */
typedef struct { /* type of structure for a document */ |
65 |
int id; /* identification number (`est_doc_id' reports -1 for an unregistered document) */ |
66 |
CBMAP *attrs; /* map of attributes (attribute name to value) */ |
67 |
CBLIST *dtexts; /* list of shown text (sentences of the body text) */ |
68 |
} ESTDOC; |
69 |
|
70 |
|
71 |
/* Create a document object. |
72 |
The return value is an object of a document. */ |
73 |
ESTDOC *est_doc_new(void); |
74 |
|
75 |
|
76 |
/* Create a document object made from draft data. |
77 |
`draft' specifies a string of draft data. |
78 |
The return value is an object of a document. */ |
79 |
ESTDOC *est_doc_new_from_draft(const char *draft); |
80 |
|
81 |
|
82 |
/* Destroy a document object. |
83 |
`doc' specifies a document object. */ |
84 |
void est_doc_delete(ESTDOC *doc); |
85 |
|
86 |
|
87 |
/* Add an attribute to a document object. |
88 |
`doc' specifies a document object. |
89 |
`name' specifies the name of an attribute. |
90 |
`value' specifies the value of the attribute. If it is `NULL', the attribute is removed. */ |
91 |
void est_doc_add_attr(ESTDOC *doc, const char *name, const char *value); |
92 |
|
93 |
|
94 |
/* Add a sentence of text to a document object. |
95 |
`doc' specifies a document object. |
96 |
`text' specifies a sentence of text. */ |
97 |
void est_doc_add_text(ESTDOC *doc, const char *text); |
98 |
|
99 |
|
100 |
/* Add a hidden sentence to a document object. |
101 |
`doc' specifies a document object. |
102 |
`text' specifies a hidden sentence. */ |
103 |
void est_doc_add_hidden_text(ESTDOC *doc, const char *text); |
104 |
|
105 |
|
106 |
/* Get the ID number of a document object. |
107 |
`doc' specifies a document object. |
108 |
The return value is the ID number of the document object. If the object has not been |
109 |
registered, -1 is returned. */ |
110 |
int est_doc_id(ESTDOC *doc); |
111 |
|
112 |
|
113 |
/* Get a list of attribute names of a document object. |
114 |
`doc' specifies a document object. |
115 |
The return value is a new list object of attribute names of the document object. Because |
116 |
the object of the return value is opened with the function `cblistopen', it should be closed |
117 |
with the function `cblistclose' if it is no longer in use. */ |
118 |
CBLIST *est_doc_attr_names(ESTDOC *doc); |
119 |
|
120 |
|
121 |
/* Get the value of an attribute of a document object. |
122 |
`doc' specifies a document object. |
123 |
`name' specifies the name of an attribute. |
124 |
The return value is the value of the attribute or `NULL' if it does not exist. The life |
125 |
duration of the returned string is synchronous with the one of the document object. */ |
126 |
const char *est_doc_attr(ESTDOC *doc, const char *name); |
127 |
|
128 |
|
129 |
/* Get a list of sentences of the text of a document object. |
130 |
`doc' specifies a document object. |
131 |
The return value is a list object of sentences of the text of the document object. The life |
132 |
duration of the returned object is synchronous with the one of the document object. */ |
133 |
const CBLIST *est_doc_texts(ESTDOC *doc); |
134 |
|
135 |
|
136 |
/* Concatenate sentences of the text of a document object. |
137 |
`doc' specifies a document object. |
138 |
The return value is concatenated sentences of the document object. Because the region of the |
139 |
return value is allocated with the `malloc' call, it should be released with the `free' call |
140 |
if it is no longer in use. */ |
141 |
char *est_doc_cat_texts(ESTDOC *doc); |
142 |
|
143 |
|
144 |
/* Dump draft data of a document object. |
145 |
`doc' specifies a document object. |
146 |
The return value is draft data of the document object. Because the region of the return value |
147 |
is allocated with the `malloc' call, it should be released with the `free' call if it is no |
148 |
longer in use. */ |
149 |
char *est_doc_dump_draft(ESTDOC *doc); |
150 |
|
151 |
|
152 |
/* Make a snippet of the body text of a document object. |
153 |
`doc' specifies a document object. |
154 |
`words' specifies a list object of words to be highlighted. |
155 |
`wwidth' specifies whole width of the result. |
156 |
`hwidth' specifies width of strings picked up from the beginning of the text. |
157 |
`awidth' specifies width of strings picked up around each highlighted word. |
158 |
The return value is a snippet string of the body text of the document object. There are tab |
159 |
separated values. Each line is a string to be shown. Though most lines have only one field, |
160 |
some lines have two fields. If the second field exists, the first field is to be shown with |
161 |
highlighted, and the second field means its normalized form. Because the region of the |
162 |
return value is allocated with the `malloc' call, it should be released with the `free' call |
163 |
if it is no longer in use. */ |
164 |
char *est_doc_make_snippet(ESTDOC *doc, const CBLIST *words, int wwidth, int hwidth, int awidth); |
165 |
|
166 |
|
167 |
/* Check whether the text of a document object includes every specified word. |
168 |
`doc' specifies a document object. |
169 |
`words' specifies a list object of words to be checked. |
170 |
The return value is true if every specified word is found, else it is false. */ |
171 |
int est_doc_scan_words(ESTDOC *doc, const CBLIST *words); |
172 |
|
173 |
|
174 |
|
175 |
/************************************************************************************************* |
176 |
* API for search conditions |
177 |
*************************************************************************************************/ |
178 |
|
179 |
|
180 |
/* Special markers.  NOTE(review): presumably recognized inside a search phrase -- confirm
   against the implementation. */
#define ESTOPUVSET "[UVSET]" /* universal set */ |
181 |
#define ESTOPSIMILAR "[SIMILAR]" /* similarity search */ |
182 |
|
183 |
#define ESTOPUNION "OR" /* union (disjunction) */ |
184 |
#define ESTOPISECT "AND" /* intersection (conjunction) */ |
185 |
#define ESTOPDIFF "ANDNOT" /* difference (intersection with negation) */ |
186 |
#define ESTOPWITH "WITH" /* delimiter for elements */ |
187 |
|
188 |
/* Comparison operators for attributes.  NOTE(review): presumably the operator part of the
   expression given to `est_cond_add_attr' -- confirm. */
#define ESTOPSTREQ "STREQ" /* string is equal */ |
189 |
#define ESTOPSTRNE "STRNE" /* string is not equal */ |
190 |
#define ESTOPSTRINC "STRINC" /* string is included in */ |
191 |
#define ESTOPSTRBW "STRBW" /* string begins with */ |
192 |
#define ESTOPSTREW "STREW" /* string ends with */ |
193 |
#define ESTOPNUMEQ "NUMEQ" /* number or date is equal */ |
194 |
#define ESTOPNUMNE "NUMNE" /* number or date is not equal */ |
195 |
#define ESTOPNUMGT "NUMGT" /* number or date is greater than */ |
196 |
#define ESTOPNUMGE "NUMGE" /* number or date is greater than or equal to */ |
197 |
#define ESTOPNUMLT "NUMLT" /* number or date is less than */ |
198 |
#define ESTOPNUMLE "NUMLE" /* number or date is less than or equal to */ |
199 |
#define ESTOPREGEX "REGEX" /* string matches regular expressions */ |
200 |
|
201 |
/* Ordering operators.  NOTE(review): presumably used in the expression given to
   `est_cond_set_order', whose default order is by score descending -- confirm. */
#define ESTORDSTRA "STRA" /* strings in ascending order */ |
202 |
#define ESTORDSTRD "STRD" /* strings in descending order */ |
203 |
#define ESTORDNUMA "NUMA" /* numbers in ascending order */ |
204 |
#define ESTORDNUMD "NUMD" /* numbers in descending order */ |
205 |
|
206 |
/* Search conditions, created with `est_cond_new' and filled in by the `est_cond_set_*' and
   `est_cond_add_attr' functions.
   NOTE(review): `gstep', `tfidf', `simple' and `scfb' presumably mirror the `ESTCOND*' option
   bits given to `est_cond_set_options' -- confirm against the implementation. */
typedef struct { /* type of structure for search conditions */ |
207 |
char *phrase; /* search phrase */ |
208 |
int gstep; /* step of N-gram */ |
209 |
int tfidf; /* whether with TF-IDF tuning */ |
210 |
int simple; /* whether with the simplified phrase */ |
211 |
CBLIST *attrs; /* conditions with attributes (list of attribute expressions) */ |
212 |
char *order; /* sorting order */ |
213 |
int max; /* maximum number of retrieval */ |
214 |
int scfb; /* whether to feed back scores */ |
215 |
int *scores; /* array of scores */ |
216 |
int snum; /* number of elements of the score array */ |
217 |
int opts; /* options for preservation */ |
218 |
} ESTCOND; |
219 |
|
220 |
/* Bit flags for the `options' parameter of `est_cond_set_options'; they can be combined by
   bitwise or. */
enum { /* enumeration for options */ |
221 |
ESTCONDSURE = 1 << 0, /* check every N-gram key */ |
222 |
ESTCONDUSU = 1 << 1, /* check N-gram keys skipping by one (the default) */ |
223 |
ESTCONDFAST = 1 << 2, /* check N-gram keys skipping by two */ |
224 |
ESTCONDAGIT = 1 << 3, /* check N-gram keys skipping by three */ |
225 |
ESTCONDNOIDF = 1 << 4, /* without TF-IDF tuning */ |
226 |
ESTCONDSIMPLE = 1 << 10, /* with the simplified phrase */ |
227 |
ESTCONDSCFB = 1 << 30 /* feed back scores (for debug) */ |
228 |
}; |
229 |
|
230 |
|
231 |
/* Create a condition object. |
232 |
The return value is an object of search conditions. */ |
233 |
ESTCOND *est_cond_new(void); |
234 |
|
235 |
|
236 |
/* Destroy a condition object. |
237 |
`cond' specifies a condition object. */ |
238 |
void est_cond_delete(ESTCOND *cond); |
239 |
|
240 |
|
241 |
/* Set the search phrase to a condition object. |
242 |
`cond' specifies a condition object. |
243 |
`phrase' specifies a search phrase. */ |
244 |
void est_cond_set_phrase(ESTCOND *cond, const char *phrase); |
245 |
|
246 |
|
247 |
/* Add an expression for an attribute to a condition object. |
248 |
`cond' specifies a condition object. |
249 |
`expr' specifies an expression for an attribute. */ |
250 |
void est_cond_add_attr(ESTCOND *cond, const char *expr); |
251 |
|
252 |
|
253 |
/* Set the order of a condition object. |
254 |
`cond' specifies a condition object. |
255 |
`expr' specifies an expression for the order. By default, the order is by score descending. */ |
256 |
void est_cond_set_order(ESTCOND *cond, const char *expr); |
257 |
|
258 |
|
259 |
/* Set the maximum number of retrieval of a condition object. |
260 |
`cond' specifies a condition object. |
261 |
`max' specifies the maximum number of retrieval. By default, the number of retrieval is not |
262 |
limited. */ |
263 |
void est_cond_set_max(ESTCOND *cond, int max); |
264 |
|
265 |
|
266 |
/* Set options of retrieval of a condition object. |
267 |
`cond' specifies a condition object. |
268 |
`options' specifies options: `ESTCONDSURE' specifies that it checks every N-gram key, |
269 |
`ESTCONDUSU', which is the default, specifies that it checks N-gram keys with skipping one |
270 |
key, `ESTCONDFAST' skips two keys, `ESTCONDAGIT' skips three keys, `ESTCONDNOIDF' specifies |
271 |
not to perform TF-IDF tuning, `ESTCONDSIMPLE' specifies to use simplified phrase. Each option |
272 |
can be specified at the same time by bitwise or. If keys are skipped, though search speed is |
273 |
improved, the relevance ratio grows less. */ |
274 |
void est_cond_set_options(ESTCOND *cond, int options); |
275 |
|
276 |
|
277 |
|
278 |
/************************************************************************************************* |
279 |
* API for database |
280 |
*************************************************************************************************/ |
281 |
|
282 |
|
283 |
#define ESTIDXDMAX 16 /* max number of the inverted index (capacity of the `dbs' array of ESTIDX) */ |
284 |
|
285 |
typedef struct { /* type of structure for the inverted index */ |
286 |
char *name; /* name of the database */ |
287 |
int omode; /* open mode */ |
288 |
VILLA *dbs[ESTIDXDMAX]; /* database handles */ |
289 |
int dnum; /* number of division (NOTE(review): presumably how many entries of `dbs' are in use -- confirm) */ |
290 |
VILLA *cdb; /* current database handle */ |
291 |
} ESTIDX; |
292 |
|
293 |
/* A database object as returned by `est_db_open': handles of the underlying databases, the
   error state, document counters and the caches together with their limits. */
typedef struct { /* type of structure for a database object */ |
294 |
char *name; /* name of the database */ |
295 |
DEPOT *metadb; /* handle of the meta database */ |
296 |
ESTIDX *idxdb; /* handles of the inverted indexes */ |
297 |
VILLA *fwmdb; /* handle of the database for forward matching */ |
298 |
CURIA *attrdb; /* handle of the database for attributes */ |
299 |
CURIA *textdb; /* handle of the database for texts */ |
300 |
VILLA *listdb; /* handle of the database for document list */ |
301 |
int ecode; /* last happened error code (one of the ESTE* constants) */ |
302 |
int fatal; /* whether to have a fatal error */ |
303 |
int dseq; /* sequence for document IDs */ |
304 |
int dnum; /* number of the documents */ |
305 |
int amode; /* mode of text analyzer */ |
306 |
CBMAP *idxcc; /* cache for the inverted index */ |
307 |
size_t icsiz; /* power of the cache (NOTE(review): meaning of "power" is unclear; possibly the current size of the index cache -- confirm) */ |
308 |
size_t icmax; /* max size of the cache */ |
309 |
CBMAP *outcc; /* cache for deleted documents */ |
310 |
CBMAP *keycc; /* cache for keys for TF-IDF */ |
311 |
int kcmnum; /* max number of the key cache */ |
312 |
CBMAP *attrcc; /* cache for attributes */ |
313 |
int acmnum; /* max number of the attribute cache */ |
314 |
CBMAP *textcc; /* cache for texts */ |
315 |
int tcmnum; /* max number of the text cache */ |
316 |
CBMAP *spacc; /* special cache for attributes */ |
317 |
int scmnum; /* max number of the special cache */ |
318 |
char *scname; /* name of the attribute for the special cache */ |
319 |
void (*cbinfo)(const char *); /* callback function to inform of events */ |
320 |
CBMAP *(*cbvec)(void *, int, void *); /* callback function to create a vector */ |
321 |
void *vecdata; /* arbitrary object for the vectorizer */ |
322 |
CBMAP *metacc; /* cache for meta data */ |
323 |
} ESTDB; |
324 |
|
325 |
/* Error codes, as taken by `est_err_msg' and reported by `est_db_error'.  The enumerators
   without an initializer count up from 0, so ESTENOERR is 0. */
enum { /* enumeration for error codes */ |
326 |
ESTENOERR, /* no error */ |
327 |
ESTEINVAL, /* invalid argument */ |
328 |
ESTEACCES, /* access forbidden */ |
329 |
ESTELOCK, /* lock failure */ |
330 |
ESTEDB, /* database problem */ |
331 |
ESTEIO, /* I/O problem */ |
332 |
ESTENOITEM, /* no item */ |
333 |
ESTEMISC = 9999 /* miscellaneous */ |
334 |
}; |
335 |
|
336 |
/* Bit flags for the `omode' parameter of `est_db_open'; they can be combined by bitwise or. */
enum { /* enumeration for open modes */ |
337 |
ESTDBREADER = 1 << 0, /* open as a reader */ |
338 |
ESTDBWRITER = 1 << 1, /* open as a writer */ |
339 |
ESTDBCREAT = 1 << 2, /* a writer creating (a new database is created if none exists) */ |
340 |
ESTDBTRUNC = 1 << 3, /* a writer truncating (a new database is created even if one exists) */ |
341 |
ESTDBNOLCK = 1 << 4, /* open without locking (the application is responsible for exclusion control) */ |
342 |
ESTDBLCKNB = 1 << 5, /* lock without blocking */ |
343 |
ESTDBPERFNG = 1 << 6 /* use perfect N-gram analyzer (added to ESTDBCREAT) */ |
344 |
}; |
345 |
|
346 |
/* Flag for the `options' parameter of `est_db_put_doc'. */
enum { /* enumeration for options of document registration */ |
347 |
ESTPDCLEAN = 1 << 0 /* clean up dispensable regions (of the overwritten document) */ |
348 |
}; |
349 |
|
350 |
/* Flag for the `options' parameter of `est_db_out_doc'. */
enum { /* enumeration for options of document deletion */ |
351 |
ESTODCLEAN = 1 << 0 /* clean up dispensable regions (of the deleted document) */ |
352 |
}; |
353 |
|
354 |
/* Bit flags for the `options' parameter of `est_db_optimize'; the two can be combined by
   bitwise or. */
enum { /* enumeration for options of optimization */ |
355 |
ESTOPTNOPURGE = 1 << 0, /* omit purging dispensable region of deleted documents */ |
356 |
ESTOPTNODBOPT = 1 << 1 /* omit optimization of the database files */ |
357 |
}; |
358 |
|
359 |
/* Bit flags for the `options' parameter of `est_db_get_doc'; the two can be combined by
   bitwise or. */
enum { /* enumeration for options of document retrieval */ |
360 |
ESTGDNOATTR = 1 << 0, /* no attributes (attributes are ignored) */ |
361 |
ESTGDNOTEXT = 1 << 1 /* no text (the body text is ignored) */ |
362 |
}; |
363 |
|
364 |
|
365 |
/* Get the string of an error code. |
366 |
`ecode' specifies an error code. |
367 |
The return value is the string of the error code. */ |
368 |
const char *est_err_msg(int ecode); |
369 |
|
370 |
|
371 |
/* Open a database. |
372 |
`name' specifies the name of a database directory. |
373 |
`omode' specifies open modes: `ESTDBWRITER' as a writer, `ESTDBREADER' as a reader. If the |
374 |
mode is `ESTDBWRITER', the following may be added by bitwise or: `ESTDBCREAT', which means it |
375 |
creates a new database if not exist, `ESTDBTRUNC', which means it creates a new database |
376 |
regardless if one exists. Both of `ESTDBREADER' and `ESTDBWRITER' can be added to by |
377 |
bitwise or: `ESTDBNOLCK', which means it opens a database file without file locking, or |
378 |
`ESTDBLCKNB', which means locking is performed without blocking. If `ESTDBNOLCK' is used, |
379 |
the application is responsible for exclusion control. `ESTDBCREAT' can be added to by bitwise |
380 |
or: `ESTDBPERFNG', which means N-gram analysis is performed against European text also. |
381 |
`ecp' specifies the pointer to a variable to which the error code is assigned. |
382 |
The return value is a database object of the database or `NULL' if failure. */ |
383 |
ESTDB *est_db_open(const char *name, int omode, int *ecp); |
384 |
|
385 |
|
386 |
/* Close a database. |
387 |
`db' specifies a database object. |
388 |
`ecp' specifies the pointer to a variable to which the error code is assigned. |
389 |
The return value is true if success, else it is false. */ |
390 |
int est_db_close(ESTDB *db, int *ecp); |
391 |
|
392 |
|
393 |
/* Get the last happened error code of a database. |
394 |
`db' specifies a database object. |
395 |
The return value is the last happened error code of the database. */ |
396 |
int est_db_error(ESTDB *db); |
397 |
|
398 |
|
399 |
/* Check whether a database has a fatal error. |
400 |
`db' specifies a database object. |
401 |
The return value is true if the database has a fatal error, else it is false. */ |
402 |
int est_db_fatal(ESTDB *db); |
403 |
|
404 |
|
405 |
/* Flush index words in the cache of a database. |
406 |
`db' specifies a database object connected as a writer. |
407 |
`max' specifies the maximum number of words to be flushed. If it is not more than zero, all |
408 |
words are flushed. |
409 |
The return value is true if success, else it is false. */ |
410 |
int est_db_flush(ESTDB *db, int max); |
411 |
|
412 |
|
413 |
/* Synchronize updating contents of a database. |
414 |
`db' specifies a database object connected as a writer. |
415 |
The return value is true if success, else it is false. */ |
416 |
int est_db_sync(ESTDB *db); |
417 |
|
418 |
|
419 |
/* Optimize a database. |
420 |
`db' specifies a database object connected as a writer. |
421 |
`options' specifies options: `ESTOPTNOPURGE' to omit purging dispensable region of deleted |
422 |
documents, `ESTOPTNODBOPT' to omit optimization of the database files. The two can be |
423 |
specified at the same time by bitwise or. |
424 |
The return value is true if success, else it is false. */ |
425 |
int est_db_optimize(ESTDB *db, int options); |
426 |
|
427 |
|
428 |
/* Add a document to a database. |
429 |
`db' specifies a database object connected as a writer. |
430 |
`doc' specifies a document object. The document object should have the URI attribute. |
431 |
`options' specifies options: `ESTPDCLEAN' to clean up dispensable regions of the overwritten |
432 |
document. |
433 |
The return value is true if success, else it is false. |
434 |
If the URI attribute is the same as that of an existing document in the database, the existing one is |
435 |
deleted. */ |
436 |
int est_db_put_doc(ESTDB *db, ESTDOC *doc, int options); |
437 |
|
438 |
|
439 |
/* Remove a document from a database. |
440 |
`db' specifies a database object connected as a writer. |
441 |
`id' specifies the ID number of a registered document. |
442 |
`options' specifies options: `ESTODCLEAN' to clean up dispensable regions of the deleted |
443 |
document. |
444 |
The return value is true if success, else it is false. */ |
445 |
int est_db_out_doc(ESTDB *db, int id, int options); |
446 |
|
447 |
|
448 |
/* Retrieve a document in a database. |
449 |
`db' specifies a database object. |
450 |
`id' specifies the ID number of a registered document. |
451 |
`options' specifies options: `ESTGDNOATTR' to ignore attributes, `ESTGDNOTEXT' to ignore |
452 |
the body text. The two can be specified at the same time by bitwise or. |
453 |
The return value is a document object. On error, `NULL' is returned. */ |
454 |
ESTDOC *est_db_get_doc(ESTDB *db, int id, int options); |
455 |
|
456 |
|
457 |
/* Retrieve the value of an attribute of a document in a database. |
458 |
`db' specifies a database object. |
459 |
`id' specifies the ID number of a registered document. |
460 |
`name' specifies the name of an attribute. |
461 |
The return value is the value of the attribute or `NULL' if it does not exist. Because the |
462 |
region of the return value is allocated with the `malloc' call, it should be released with |
463 |
the `free' call if it is no longer in use. */ |
464 |
char *est_db_get_doc_attr(ESTDB *db, int id, const char *name); |
465 |
|
466 |
|
467 |
/* Get the ID of a document specified by URI. |
468 |
`db' specifies a database object. |
469 |
`uri' specifies the URI of a registered document. |
470 |
The return value is the ID of the document. On error, -1 is returned. */ |
471 |
int est_db_uri_to_id(ESTDB *db, const char *uri); |
472 |
|
473 |
|
474 |
/* Extract keywords of a document object. |
475 |
`db' specifies a database object for TF-IDF tuning. If it is `NULL', it is not used. |
476 |
`doc' specifies a document object. |
477 |
`max' specifies the maximum number of keywords to be extracted. |
478 |
The return value is a new map object of keywords and their scores in decimal string. Because |
479 |
the object of the return value is opened with the function `cbmapopen', it should be closed |
480 |
with the function `cbmapclose' if it is no longer in use. */ |
481 |
CBMAP *est_db_etch_doc(ESTDB *db, ESTDOC *doc, int max); |
482 |
|
483 |
|
484 |
/* Initialize the iterator of a database. |
485 |
`db' specifies a database object. |
486 |
The return value is true if success, else it is false. */ |
487 |
int est_db_iter_init(ESTDB *db); |
488 |
|
489 |
|
490 |
/* Get the next ID of the iterator of a database. |
491 |
`db' specifies a database object. |
492 |
The return value is the next ID. If there is no more document, 0 is returned. On error, |
493 |
-1 is returned. */ |
494 |
int est_db_iter_next(ESTDB *db); |
495 |
|
496 |
|
497 |
/* Get the name of a database. |
498 |
`db' specifies a database object. |
499 |
The return value is the name of the database. The life duration of the returned string is |
500 |
synchronous with the one of the database object. */ |
501 |
const char *est_db_name(ESTDB *db); |
502 |
|
503 |
|
504 |
/* Get the number of documents in a database. |
505 |
`db' specifies a database object. |
506 |
The return value is the number of documents in the database. */ |
507 |
int est_db_doc_num(ESTDB *db); |
508 |
|
509 |
|
510 |
/* Get the number of unique words in a database. |
511 |
`db' specifies a database object. |
512 |
The return value is the number of unique words in the database. */ |
513 |
int est_db_word_num(ESTDB *db); |
514 |
|
515 |
|
516 |
/* Get the size of a database. |
517 |
`db' specifies a database object. |
518 |
The return value is the size of the database. */ |
519 |
double est_db_size(ESTDB *db); |
520 |
|
521 |
|
522 |
/* Search documents corresponding to a condition for a database. |
523 |
`db' specifies a database object. |
524 |
`cond' specifies a condition object. |
525 |
`nump' specifies the pointer to a variable to which the number of elements in the result is |
526 |
assigned. |
527 |
`hints' specifies a map object into which the number of documents corresponding to each word |
528 |
is stored. If a word is in a negative condition, the number is negative. The element whose |
529 |
key is an empty string specifies the number of whole result. If it is `NULL', it is not used. |
530 |
The return value is an array whose elements are ID numbers of corresponding documents. |
531 |
This function never fails. Even if no document corresponds or an error occurs, an empty |
532 |
array is returned. Because the region of the return value is allocated with the `malloc' |
533 |
call, it should be released with the `free' call if it is no longer in use. */ |
534 |
int *est_db_search(ESTDB *db, ESTCOND *cond, int *nump, CBMAP *hints); |
535 |
|
536 |
|
537 |
/* Set the maximum size of the cache memory of a database. |
538 |
`db' specifies a database object. |
539 |
`size' specifies the maximum size of the index cache. By default, it is 64MB. If it is not |
540 |
more than 0, the current size is not changed. |
541 |
`anum' specifies the maximum number of cached records for document attributes. By default, it |
542 |
is 8192. If it is not more than 0, the current size is not changed. |
543 |
`tnum' specifies the maximum number of cached records for document texts. By default, it is |
544 |
1024. If it is not more than 0, the current size is not changed. */ |
545 |
void est_db_set_cache_size(ESTDB *db, size_t size, int anum, int tnum); |
546 |
|
547 |
|
548 |
/* Set the special cache for narrowing and sorting with document attributes. |
549 |
`db' specifies a database object. |
550 |
`name' specifies the name of a document attribute. |
551 |
`num' specifies the maximum number of cached records. */ |
552 |
void est_db_set_special_cache(ESTDB *db, const char *name, int num); |
553 |
|
554 |
|
555 |
|
556 |
/************************************************************************************************* |
557 |
* features for experts |
558 |
*************************************************************************************************/ |
559 |
|
560 |
|
561 |
/* Version identifiers.  NOTE(review): presumably the version string of the package, the
   version number of the library and the version string of the protocol respectively --
   confirm.  Identifiers that begin with an underscore followed by an uppercase letter are
   reserved for the implementation in C; they are kept as they are because other code may
   refer to them. */
#define _EST_VERSION "0.5.3" |
562 |
#define _EST_LIBVER 200 |
563 |
#define _EST_PROTVER "0.9" |
564 |
|
565 |
/* Language identifiers, used as the `plang' (preferred language) parameter of `est_enc_name'. */
enum { /* enumeration for languages */ |
566 |
ESTLANGEN, /* English */ |
567 |
ESTLANGJA, /* Japanese */ |
568 |
ESTLANGZH, /* Chinese */ |
569 |
ESTLANGKO, /* Korean */ |
570 |
ESTLANGMISC /* miscellaneous */ |
571 |
}; |
572 |
|
573 |
|
574 |
/* Break a sentence of text and extract words. |
575 |
`text' specifies a sentence of text. |
576 |
`list' specifies a list object to which extracted words are added. |
577 |
`norm' specifies whether to normalize the text. |
578 |
`tail' specifies whether to pick up oddness N-gram at the end. */ |
579 |
void est_break_text(const char *text, CBLIST *list, int norm, int tail); |
580 |
|
581 |
|
582 |
/* Break a sentence of text and extract words using perfect N-gram analyzer. |
583 |
`text' specifies a sentence of text. |
584 |
`list' specifies a list object to which extracted words are added. |
585 |
`norm' specifies whether to normalize the text. |
586 |
`tail' specifies whether to pick up oddness N-gram at the end. */ |
587 |
void est_break_text_perfng(const char *text, CBLIST *list, int norm, int tail); |
588 |
|
589 |
|
590 |
/* Convert the character encoding of a string. |
591 |
`ptr' specifies the pointer to a region. |
592 |
`size' specifies the size of the region. If it is negative, the size is assigned with |
593 |
`strlen(ptr)'. |
594 |
`icode' specifies the name of encoding of the input string. |
595 |
`ocode' specifies the name of encoding of the output string. |
596 |
`sp' specifies the pointer to a variable to which the size of the region of the return |
597 |
value is assigned. If it is `NULL', it is not used. |
598 |
`mp' specifies the pointer to a variable to which the number of missing characters by failure |
599 |
of conversion is assigned. If it is `NULL', it is not used. |
600 |
If successful, the return value is the pointer to the result object, else, it is `NULL'. |
601 |
Because an additional zero code is appended at the end of the region of the return value, |
602 |
the return value can be treated as a character string. Because the region of the return |
603 |
value is allocated with the `malloc' call, it should be released with the `free' call if it |
604 |
is no longer in use. */ |
605 |
char *est_iconv(const char *ptr, int size, const char *icode, const char *ocode, |
606 |
int *sp, int *mp); |
607 |
|
608 |
|
609 |
/* Detect the encoding of a string automatically. |
610 |
`ptr' specifies the pointer to a region. |
611 |
`size' specifies the size of the region. If it is negative, the size is assigned with |
612 |
`strlen(ptr)'. |
613 |
`plang' specifies a preferred language. As for now, `ESTLANGEN', `ESTLANGJA', `ESTLANGZH', |
614 |
and `ESTLANGKO' are supported. |
615 |
The return value is the string of the encoding name of the string. */ |
616 |
const char *est_enc_name(const char *ptr, int size, int plang); |
617 |
|
618 |
|
619 |
/* Convert a UTF-8 string into UTF-16BE. |
620 |
`ptr' specifies the pointer to a region. |
621 |
`size' specifies the size of the region. |
622 |
`sp' specifies the pointer to a variable to which the size of the region of the return |
623 |
value is assigned. |
624 |
The return value is the pointer to the result object. Because an additional zero code is |
625 |
appended at the end of the region of the return value, the return value can be treated as a |
626 |
character string. Because the region of the return value is allocated with the `malloc' call, |
627 |
it should be released with the `free' call if it is no longer in use. */ |
628 |
char *est_uconv_in(const char *ptr, int size, int *sp); |
629 |
|
630 |
|
631 |
/* Convert a UTF-16BE string into UTF-8. |
632 |
`ptr' specifies the pointer to a region. |
633 |
`size' specifies the size of the region. |
634 |
`sp' specifies the pointer to a variable to which the size of the region of the return |
635 |
value is assigned. If it is `NULL', it is not used. |
636 |
The return value is the pointer to the result object. Because an additional zero code is |
637 |
appended at the end of the region of the return value, the return value can be treated as a |
638 |
character string. Because the region of the return value is allocated with the `malloc' call, |
639 |
it should be released with the `free' call if it is no longer in use. */ |
640 |
char *est_uconv_out(const char *ptr, int size, int *sp); |
641 |
|
642 |
|
643 |
/* Compress a serial object with ZLIB. |
644 |
`ptr' specifies the pointer to a region. |
645 |
`size' specifies the size of the region. If it is negative, the size is assigned with |
646 |
`strlen(ptr)'. |
647 |
`sp' specifies the pointer to a variable to which the size of the region of the return |
648 |
value is assigned. |
649 |
If successful, the return value is the pointer to the result object, else, it is `NULL'. |
650 |
Because the region of the return value is allocated with the `malloc' call, it should be |
651 |
released with the `free' call if it is no longer in use. */ |
652 |
char *est_deflate(const char *ptr, int size, int *sp); |
653 |
|
654 |
|
655 |
/* Decompress a serial object compressed with ZLIB. |
656 |
`ptr' specifies the pointer to a region. |
657 |
`size' specifies the size of the region. |
658 |
`sp' specifies the pointer to a variable to which the size of the region of the return |
659 |
value is assigned. If it is `NULL', it is not used. |
660 |
If successful, the return value is the pointer to the result object, else, it is `NULL'. |
661 |
Because an additional zero code is appended at the end of the region of the return value, |
662 |
the return value can be treated as a character string. Because the region of the return |
663 |
value is allocated with the `malloc' call, it should be released with the `free' call if it |
664 |
is no longer in use. */ |
665 |
char *est_inflate(const char *ptr, int size, int *sp); |
666 |
|
667 |
|
668 |
/* Get the border string for draft data of documents. |
669 |
The return value is the border string for draft data of documents. */ |
670 |
const char *est_border_str(void); |
671 |
|
672 |
|
673 |
/* Get the real random number. |
674 |
The return value is the real random number between 0.0 and 1.0. */ |
675 |
double est_random(void); |
676 |
|
677 |
|
678 |
/* Get the random number in normal distribution. |
679 |
The return value is the random number in normal distribution between 0.0 and 1.0. */ |
680 |
double est_random_nd(void); |
681 |
|
682 |
|
683 |
/* Get an MD5 hash string of a key string. |
684 |
`key' specifies a string to be encrypted. |
685 |
The return value is an MD5 hash string of the key string. Because the region of the return |
686 |
value is allocated with the `malloc' call, it should be released with the `free' call if it |
687 |
is no longer in use. */ |
688 |
char *est_make_crypt(const char *key); |
689 |
|
690 |
|
691 |
/* Check whether a key matches an MD5 hash string. |
692 |
`key' specifies a string to be checked. |
693 |
`hash' specifies an MD5 hash string. |
694 |
The return value is true if the key matches the hash string, else it is false. */ |
695 |
int est_match_crypt(const char *key, const char *hash); |
696 |
|
697 |
|
698 |
/* Get the hidden texts of a document object. |
699 |
`doc' specifies a document object. |
700 |
The return value is concatenated sentences of the hidden text of the document object. The |
701 |
life duration of the returned string is synchronous with the one of the document object. */ |
702 |
const char *est_doc_hidden_texts(ESTDOC *doc); |
703 |
|
704 |
|
705 |
/* Get the phrase of a condition object. |
706 |
`cond' specifies a condition object. |
707 |
The return value is the phrase of the condition object or `NULL' if it is not specified. The |
708 |
life duration of the returned string is synchronous with the one of the condition object. */ |
709 |
const char *est_cond_phrase(ESTCOND *cond); |
710 |
|
711 |
|
712 |
/* Get a list object of attribute expressions of a condition object. |
713 |
`cond' specifies a condition object. |
714 |
The return value is a list object of attribute expressions of the condition object or `NULL' if |
715 |
it is not specified. The life duration of the returned object is synchronous with the one of |
716 |
the condition object. */ |
717 |
const CBLIST *est_cond_attrs(ESTCOND *cond); |
718 |
|
719 |
|
720 |
/* Get the order expression of a condition object. |
721 |
`cond' specifies a condition object. |
722 |
The return value is the order expression of the condition object or `NULL' if it is not |
723 |
specified. The life duration of the returned string is synchronous with the one of the |
724 |
condition object. */ |
725 |
const char *est_cond_order(ESTCOND *cond); |
726 |
|
727 |
|
728 |
/* Get the maximum number of retrieval of a condition object. |
729 |
`cond' specifies a condition object. |
730 |
The return value is the maximum number of retrieval of the condition object or -1 if it is not |
731 |
specified. */ |
732 |
int est_cond_max(ESTCOND *cond); |
733 |
|
734 |
|
735 |
/* Get the options of a condition object. |
736 |
`cond' specifies a condition object. |
737 |
The return value is the options of the condition object. */ |
738 |
int est_cond_options(ESTCOND *cond); |
739 |
|
740 |
|
741 |
/* Get the score of a document corresponding to a condition object. |
742 |
`cond' specifies a condition object. |
743 |
`index' specifies the index of an element of the result array of `est_db_search'. |
744 |
The return value is the score of the element or -1 if the index is out of bounds. */ |
745 |
int est_cond_score(ESTCOND *cond, int index); |
746 |
|
747 |
|
748 |
/* Set the error code of a database. |
749 |
`db' specifies a database object. |
750 |
`ecode' specifies an error code to set. */ |
751 |
void est_db_set_ecode(ESTDB *db, int ecode); |
752 |
|
753 |
|
754 |
/* Edit attributes of a document object in a database. |
755 |
`db' specifies a database object connected as a writer. |
756 |
`doc' specifies a document object. |
757 |
The return value is true if success, else it is false. */ |
758 |
int est_db_edit_doc(ESTDB *db, ESTDOC *doc); |
759 |
|
760 |
|
761 |
/* Add a piece of meta data to a database. |
762 |
`db' specifies a database object connected as a writer. |
763 |
`name' specifies the name of a piece of meta data. |
764 |
`value' specifies the value of the meta data. If it is `NULL', the meta data is removed. */ |
765 |
void est_db_add_meta(ESTDB *db, const char *name, const char *value); |
766 |
|
767 |
|
768 |
/* Get a list of names of meta data of a database. |
769 |
`db' specifies a database object. |
770 |
The return value is a new list object of meta data names of the database. Because the |
771 |
object of the return value is opened with the function `cblistopen', it should be closed with |
772 |
the function `cblistclose' if it is no longer in use. */ |
773 |
CBLIST *est_db_meta_names(ESTDB *db); |
774 |
|
775 |
|
776 |
/* Get the value of a piece of meta data of a database. |
777 |
`db' specifies a database object. |
778 |
`name' specifies the name of a piece of meta data. |
779 |
The return value is the value of the meta data or `NULL' if it does not exist. Because the |
780 |
region of the return value is allocated with the `malloc' call, it should be released with |
781 |
the `free' call if it is no longer in use. */ |
782 |
char *est_db_meta(ESTDB *db, const char *name); |
783 |
|
784 |
|
785 |
/* Get the number of records in the cache memory of a database. |
786 |
`db' specifies a database object. |
787 |
The return value is the number of records in the cache memory of the database. */ |
788 |
int est_db_cache_num(ESTDB *db); |
789 |
|
790 |
|
791 |
/* Set the callback function to inform of database events. |
792 |
`db' specifies a database object. |
793 |
`func' specifies the pointer to a function. The argument of the callback specifies a message |
794 |
of each event. */ |
795 |
void est_db_set_informer(ESTDB *db, void (*func)(const char *)); |
796 |
|
797 |
|
798 |
/* Set the callback function to create a vector of keywords of a document. |
799 |
`db' specifies a database object. |
800 |
`func' specifies the pointer to a function. The arguments of the callback specify the |
801 |
database object, the ID of a document, and an arbitrary pointer. The return value of the |
802 |
callback is a new map object conforming to the return value of `est_db_etch_doc'. |
803 |
`data' specifies the pointer to an object given as the third argument of the callback. */ |
804 |
void est_db_set_vectorizer(ESTDB *db, CBMAP *(*func)(void *, int, void *), void *data); |
805 |
|
806 |
|
807 |
/* Fill the cache for keys for TF-IDF. |
808 |
`db' specifies a database object. */ |
809 |
void est_db_fill_key_cache(ESTDB *db); |
810 |
|
811 |
|
812 |
/* Make a directory. |
813 |
`path' specifies the path of a new directory. |
814 |
The return value is true if success, else it is false. */ |
815 |
int est_mkdir(const char *path); |
816 |
|
817 |
|
818 |
/* Remove a directory and its contents recursively. |
819 |
`path' specifies the path of a directory. |
820 |
The return value is true if success, else it is false. */ |
821 |
int est_rmdir_rec(const char *path); |
822 |
|
823 |
|
824 |
/* Get the canonicalized absolute pathname of a file. |
825 |
`path' specifies the path of a file. |
826 |
The return value is the canonicalized absolute pathname of a file. Because the region of the |
827 |
return value is allocated with the `malloc' call, it should be released with the `free' call |
828 |
if it is no longer in use. */ |
829 |
char *est_realpath(const char *path); |
830 |
|
831 |
|
832 |
/* Get the time of day in milliseconds. |
833 |
The return value is the time of day in milliseconds. */ |
834 |
double est_gettimeofday(void); |
835 |
|
836 |
|
837 |
/* Suspend execution for microsecond intervals. |
838 |
`usec' specifies microseconds to sleep for. */ |
839 |
void est_usleep(unsigned long usec); |
840 |
|
841 |
|
842 |
/* Send a signal to a process. |
843 |
`pid' specifies the PID of a target process. |
844 |
`sig' specifies a signal code. |
845 |
The return value is true if success, else it is false. */ |
846 |
int est_kill(int pid, int sig); |
847 |
|
848 |
|
849 |
/* Get the media type of an extension. |
850 |
`ext' specifies the extension of a file path. |
851 |
The return value is the media type of the extension. */ |
852 |
const char *est_ext_type(const char *ext); |
853 |
|
854 |
|
855 |
|
856 |
#if defined(__cplusplus) /* export for C++ */ |
857 |
} |
858 |
#endif |
859 |
|
860 |
#endif /* duplication check */ |
861 |
|
862 |
|
863 |
/* END OF FILE */ |