/[hyperestraier]/upstream/0.5.0/estraier.h
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /upstream/0.5.0/estraier.h

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1 - (hide annotations)
Fri Jul 29 21:52:03 2005 UTC (18 years, 10 months ago) by dpavlin
File MIME type: text/plain
File size: 39406 byte(s)
import of HyperEstraier 0.5.0

1 dpavlin 1 /*************************************************************************************************
2     * The core API of Hyper Estraier
3     * Copyright (C) 2004-2005 Mikio Hirabayashi
4     * This file is part of Hyper Estraier.
5     * Hyper Estraier is free software; you can redistribute it and/or modify it under the terms of
6     * the GNU Lesser General Public License as published by the Free Software Foundation; either
7     * version 2.1 of the License or any later version. Hyper Estraier is distributed in the hope
8     * that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
9     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
10     * License for more details.
11     * You should have received a copy of the GNU Lesser General Public License along with Hyper
12     * Estraier; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
13     * Boston, MA 02111-1307 USA.
14     *************************************************************************************************/
15    
16    
17     #ifndef _ESTRAIER_H /* duplication check */
18     #define _ESTRAIER_H
19    
20     #if defined(__cplusplus) /* export for C++ */
21     extern "C" {
22     #endif
23    
24    
25    
26     /*************************************************************************************************
27     * common settings
28     *************************************************************************************************/
29    
30    
31     /* version of Hyper Estraier */
32     extern const char *est_version;
33    
34    
35    
36     /*************************************************************************************************
37     * underlying headers
38     *************************************************************************************************/
39    
40    
41     #include <depot.h>
42     #include <curia.h>
43     #include <cabin.h>
44     #include <villa.h>
45     #include <stdlib.h>
46    
47    
48    
49     /*************************************************************************************************
50     * API for document
51     *************************************************************************************************/
52    
53    
54     #define ESTDATTRID "@id" /* name of the attribute of ID */
55     #define ESTDATTRURI "@uri" /* name of the attribute of URI */
56     #define ESTDATTRCDATE "@cdate" /* name of the attribute of creation date */
57     #define ESTDATTRMDATE "@mdate" /* name of the attribute of modification date */
58     #define ESTDATTRTITLE "@title" /* name of the attribute of title */
59     #define ESTDATTRAUTHOR "@author" /* name of the attribute of author */
60     #define ESTDATTRTYPE "@type" /* name of the attribute of content type */
61     #define ESTDATTRLANG "@lang" /* name of the attribute of language */
62     #define ESTDATTRSIZE "@size" /* name of the attribute of entity size */
63    
64     typedef struct { /* type of structure for a document */
65     int id; /* identification number */
66     CBMAP *attrs; /* map of attributes */
67     CBLIST *dtexts; /* list of shown text */
68     } ESTDOC;
69    
70    
71     /* Create a document object.
72     The return value is an object of a document. */
73     ESTDOC *est_doc_new(void);
74    
75    
76     /* Create a document object made from draft data.
77     `draft' specifies a string of draft data.
78     The return value is an object of a document. */
79     ESTDOC *est_doc_new_from_draft(const char *draft);
80    
81    
82     /* Destroy a document object.
83     `doc' specifies a document object. */
84     void est_doc_delete(ESTDOC *doc);
85    
86    
87     /* Add an attribute to a document object.
88     `doc' specifies a document object.
89     `name' specifies the name of an attribute.
90     `value' specifies the value of the attribute. If it is `NULL', the attribute is removed. */
91     void est_doc_add_attr(ESTDOC *doc, const char *name, const char *value);
92    
93    
94     /* Add a sentence of text to a document object.
95     `doc' specifies a document object.
96     `text' specifies a sentence of text. */
97     void est_doc_add_text(ESTDOC *doc, const char *text);
98    
99    
100     /* Add a hidden sentence to a document object.
101     `doc' specifies a document object.
102     `text' specifies a hidden sentence. */
103     void est_doc_add_hidden_text(ESTDOC *doc, const char *text);
104    
105    
106     /* Get the ID number of a document object.
107     `doc' specifies a document object.
108     The return value is the ID number of the document object. If the object has never been
109     registered, -1 is returned. */
110     int est_doc_id(ESTDOC *doc);
111    
112    
113     /* Get a list of attribute names of a document object.
114     `doc' specifies a document object.
115     The return value is a new list object of attribute names of the document object. Because
116     the object of the return value is opened with the function `cblistopen', it should be closed
117     with the function `cblistclose' if it is no longer in use. */
118     CBLIST *est_doc_attr_names(ESTDOC *doc);
119    
120    
121     /* Get the value of an attribute of a document object.
122     `doc' specifies a document object.
123     `name' specifies the name of an attribute.
124     The return value is the value of the attribute or `NULL' if it does not exist. The life
125     duration of the returned string is synchronous with the one of the document object. */
126     const char *est_doc_attr(ESTDOC *doc, const char *name);
127    
128    
129     /* Get a list of sentences of the text of a document object.
130     `doc' specifies a document object.
131     The return value is a list object of sentences of the text of the document object. The life
132     duration of the returned object is synchronous with the one of the document object. */
133     const CBLIST *est_doc_texts(ESTDOC *doc);
134    
135    
136     /* Concatenate sentences of the text of a document object.
137     `doc' specifies a document object.
138     The return value is concatenated sentences of a document object. Because the region of the
139     return value is allocated with the `malloc' call, it should be released with the `free' call
140     if it is no longer in use. */
141     char *est_doc_cat_texts(ESTDOC *doc);
142    
143    
144     /* Dump draft data of a document object.
145     `doc' specifies a document object.
146     The return value is draft data of a document object. Because the region of the return value
147     is allocated with the `malloc' call, it should be released with the `free' call if it is no
148     longer in use. */
149     char *est_doc_dump_draft(ESTDOC *doc);
150    
151    
152     /* Make a snippet of the body text of a document object.
153     `doc' specifies a document object.
154     `words' specifies a list object of words to be highlighted.
155     `wwidth' specifies whole width of the result.
156     `hwidth' specifies width of strings picked up from the beginning of the text.
157     `awidth' specifies width of strings picked up around each highlighted word.
158     The return value is a snippet string of the body text of a document object. The format is tab
159     separated values. Each line is a string to be shown. Though most lines have only one field,
160     some lines have two fields. If the second field exists, the first field is to be shown
161     highlighted, and the second field means its normalized form. Because the region of the
162     return value is allocated with the `malloc' call, it should be released with the `free' call
163     if it is no longer in use. */
164     char *est_doc_make_snippet(ESTDOC *doc, const CBLIST *words, int wwidth, int hwidth, int awidth);
165    
166    
167     /* Check whether the text of a document object includes all of the specified words.
168     `doc' specifies a document object.
169     `words' specifies a list object of words to be checked.
170     The return value is true if every specified word is found, else it is false. */
171     int est_doc_scan_words(ESTDOC *doc, const CBLIST *words);
172    
173    
174    
175     /*************************************************************************************************
176     * API for search conditions
177     *************************************************************************************************/
178    
179    
180     #define ESTOPUVSET "[UVSET]" /* universal set */
181     #define ESTOPSIMILAR "[SIMILAR]" /* similarity search */
182    
183     #define ESTOPUNION "OR" /* union (disjunction) */
184     #define ESTOPISECT "AND" /* intersection (conjunction) */
185     #define ESTOPDIFF "ANDNOT" /* difference (intersection with negation) */
186     #define ESTOPWITH "WITH" /* delimiter for elements */
187    
188     #define ESTOPSTREQ "STREQ" /* string is equal */
189     #define ESTOPSTRNE "STRNE" /* string is not equal */
190     #define ESTOPSTRINC "STRINC" /* string is included in */
191     #define ESTOPSTRBW "STRBW" /* string begins with */
192     #define ESTOPSTREW "STREW" /* string ends with */
193     #define ESTOPNUMEQ "NUMEQ" /* number or date is equal */
194     #define ESTOPNUMNE "NUMNE" /* number or date is not equal */
195     #define ESTOPNUMGT "NUMGT" /* number or date is greater than */
196     #define ESTOPNUMGE "NUMGE" /* number or date is greater than or equal to */
197     #define ESTOPNUMLT "NUMLT" /* number or date is less than */
198     #define ESTOPNUMLE "NUMLE" /* number or date is less than or equal to */
199     #define ESTOPREGEX "REGEX" /* string matches regular expressions */
200    
201     #define ESTORDSTRA "STRA" /* strings in ascending order */
202     #define ESTORDSTRD "STRD" /* strings in descending order */
203     #define ESTORDNUMA "NUMA" /* numbers in ascending order */
204     #define ESTORDNUMD "NUMD" /* numbers in descending order */
205    
206     typedef struct { /* type of structure for search conditions */
207     char *phrase; /* search phrase */
208     int gstep; /* step of N-gram */
209     int tfidf; /* whether with TF-IDF tuning */
210     int simple; /* whether with the simplified phrase */
211     CBLIST *attrs; /* conditions with attributes */
212     char *order; /* sorting order */
213     int max; /* maximum number of retrieval */
214     int scfb; /* whether to feed back scores */
215     int *scores; /* array of scores */
216     int snum; /* number of elements of the score array */
217     int opts; /* options for preservation */
218     } ESTCOND;
219    
220     enum { /* enumeration for options */
221     ESTCONDSURE = 1 << 0, /* check every N-gram key */
222     ESTCONDUSU = 1 << 1, /* check N-gram keys skipping by one */
223     ESTCONDFAST = 1 << 2, /* check N-gram keys skipping by two */
224     ESTCONDAGIT = 1 << 3, /* check N-gram keys skipping by three */
225     ESTCONDNOIDF = 1 << 4, /* without TF-IDF tuning */
226     ESTCONDSIMPLE = 1 << 10, /* with the simplified phrase */
227     ESTCONDSCFB = 1 << 30 /* feed back scores (for debug) */
228     };
229    
230    
231     /* Create a condition object.
232     The return value is an object of search conditions. */
233     ESTCOND *est_cond_new(void);
234    
235    
236     /* Destroy a condition object.
237     `cond' specifies a condition object. */
238     void est_cond_delete(ESTCOND *cond);
239    
240    
241     /* Set a search phrase to a condition object.
242     `cond' specifies a condition object.
243     `phrase' specifies a search phrase. */
244     void est_cond_set_phrase(ESTCOND *cond, const char *phrase);
245    
246    
247     /* Add an expression for an attribute to a condition object.
248     `cond' specifies a condition object.
249     `expr' specifies an expression for an attribute. */
250     void est_cond_add_attr(ESTCOND *cond, const char *expr);
251    
252    
253     /* Set the order of a condition object.
254     `cond' specifies a condition object.
255     `expr' specifies an expression for the order.
256     By default, the order is by score descending. */
257     void est_cond_set_order(ESTCOND *cond, const char *expr);
258    
259    
260     /* Set the maximum number of retrieval of a condition object.
261     `cond' specifies a condition object.
262     `max' specifies the maximum number of retrieval.
263     By default, the number of retrieval is not limited. */
264     void est_cond_set_max(ESTCOND *cond, int max);
265    
266    
267     /* Set options of retrieval of a condition object.
268     `cond' specifies a condition object.
269     `options' specifies options: `ESTCONDSURE' specifies that it checks every N-gram key,
270     `ESTCONDUSU', which is the default, specifies that it checks N-gram keys with skipping one
271     key, `ESTCONDFAST' skips two keys, `ESTCONDAGIT' skips three keys, `ESTCONDNOIDF' specifies
272     not to perform TF-IDF tuning, `ESTCONDSIMPLE' specifies to use simplified phrase. Each option
273     can be specified at the same time by bitwise or. If keys are skipped, though search speed is
274     improved, the relevance ratio grows less. */
275     void est_cond_set_options(ESTCOND *cond, int options);
276    
277    
278    
279     /*************************************************************************************************
280     * API for database
281     *************************************************************************************************/
282    
283    
284     #define ESTIDXDMAX 16 /* max number of the inverted index */
285    
286     typedef struct { /* type of structure for the inverted index */
287     char *name; /* name of the database */
288     int omode; /* open mode */
289     VILLA *dbs[ESTIDXDMAX]; /* database handles */
290     int dnum; /* number of division */
291     VILLA *cdb; /* current database handle */
292     } ESTIDX;
293    
294     typedef struct { /* type of structure for a database object */
295     char *name; /* name of the database */
296     DEPOT *metadb; /* handle of the meta database */
297     ESTIDX *idxdb; /* handles of the inverted indexes */
298     VILLA *fwmdb; /* handle of the database for forward matching */
299     CURIA *attrdb; /* handle of the database for attributes */
300     CURIA *textdb; /* handle of the database for texts */
301     VILLA *listdb; /* handle of the database for document list */
302     int ecode; /* last happened error code */
303     int fatal; /* whether to have a fatal error */
304     int dseq; /* sequence for document IDs */
305     int dnum; /* number of the documents */
306     int amode; /* mode of text analyzer */
307     CBMAP *idxcc; /* cache for the inverted index */
308     size_t icsiz; /* power of the cache */
309     size_t icmax; /* max size of the cache */
310     CBMAP *outcc; /* cache for deleted documents */
311     CBMAP *keycc; /* cache for keys for TF-IDF */
312     int kcmnum; /* max number of the key cache */
313     CBMAP *attrcc; /* cache for attributes */
314     int acmnum; /* max number of the attribute cache */
315     CBMAP *textcc; /* cache for texts */
316     int tcmnum; /* max number of the text cache */
317     CBMAP *spacc; /* special cache for attributes */
318     int scmnum; /* max number of the special cache */
319     char *scname; /* name of the attribute for the special cache */
320     void (*cbinfo)(const char *); /* callback function to inform of events */
321     CBMAP *(*cbvec)(void *, int, void *); /* callback function to create a vector */
322     void *vecdata; /* arbitrary object for the vectorizer */
323     CBMAP *metacc; /* cache for meta data */
324     } ESTDB;
325    
326     enum { /* enumeration for error codes */
327     ESTENOERR, /* no error */
328     ESTEINVAL, /* invalid argument */
329     ESTEACCES, /* access forbidden */
330     ESTELOCK, /* lock failure */
331     ESTEDB, /* database problem */
332     ESTEIO, /* I/O problem */
333     ESTENOITEM, /* no item */
334     ESTEMISC = 9999 /* miscellaneous */
335     };
336    
337     enum { /* enumeration for open modes */
338     ESTDBREADER = 1 << 0, /* open as a reader */
339     ESTDBWRITER = 1 << 1, /* open as a writer */
340     ESTDBCREAT = 1 << 2, /* a writer creating */
341     ESTDBTRUNC = 1 << 3, /* a writer truncating */
342     ESTDBNOLCK = 1 << 4, /* open without locking */
343     ESTDBLCKNB = 1 << 5, /* lock without blocking */
344     ESTDBPERFNG = 1 << 6 /* use perfect N-gram analyzer */
345     };
346    
347     enum { /* enumeration for options of document registration */
348     ESTPDCLEAN = 1 << 0 /* clean up dispensable regions */
349     };
350    
351     enum { /* enumeration for options of document deletion */
352     ESTODCLEAN = 1 << 0 /* clean up dispensable regions */
353     };
354    
355     enum { /* enumeration for options of optimization */
356     ESTOPTNOPURGE = 1 << 0, /* omit purging dispensable region of deleted */
357     ESTOPTNODBOPT = 1 << 1 /* omit optimization of the database files */
358     };
359    
360     enum { /* enumeration for options of document retrieval */
361     ESTGDNOATTR = 1 << 0, /* no attributes */
362     ESTGDNOTEXT = 1 << 1 /* no text */
363     };
364    
365    
366     /* Get the string of an error code.
367     `ecode' specifies an error code.
368     The return value is the string of the error code. */
369     const char *est_err_msg(int ecode);
370    
371    
372     /* Open a database.
373     `name' specifies the name of a database directory.
374     `omode' specifies open modes: `ESTDBWRITER' as a writer, `ESTDBREADER' as a reader. If the
375     mode is `ESTDBWRITER', the following may be added by bitwise or: `ESTDBCREAT', which means it
376     creates a new database if it does not exist, `ESTDBTRUNC', which means it creates a new database
377     regardless of whether one exists. Both of `ESTDBREADER' and `ESTDBWRITER' can be added to by
378     bitwise or: `ESTDBNOLCK', which means it opens a database file without file locking, or
379     `ESTDBLCKNB', which means locking is performed without blocking. If `ESTDBNOLCK' is used,
380     the application is responsible for exclusion control. `ESTDBCREAT' can be added to by bitwise
381     or: `ESTDBPERFNG', which means N-gram analysis is performed against European text also.
382     `ecp' specifies the pointer to a variable to which the error code is assigned.
383     The return value is a database object of the database or `NULL' if failure. */
384     ESTDB *est_db_open(const char *name, int omode, int *ecp);
385    
386    
387     /* Close a database.
388     `db' specifies a database object.
389     `ecp' specifies the pointer to a variable to which the error code is assigned.
390     The return value is true if success, else it is false. */
391     int est_db_close(ESTDB *db, int *ecp);
392    
393    
394     /* Get the last happened error code of a database.
395     `db' specifies a database object.
396     The return value is the last happened error code of the database. */
397     int est_db_error(ESTDB *db);
398    
399    
400     /* Check whether a database has a fatal error.
401     `db' specifies a database object.
402     The return value is true if the database has a fatal error, else it is false. */
403     int est_db_fatal(ESTDB *db);
404    
405    
406     /* Flush index words in the cache of a database.
407     `db' specifies a database object connected as a writer.
408     `max' specifies the maximum number of words to be flushed. If it is not more than zero, all
409     words are flushed.
410     The return value is true if success, else it is false. */
411     int est_db_flush(ESTDB *db, int max);
412    
413    
414     /* Synchronize updating contents of a database.
415     `db' specifies a database object connected as a writer.
416     The return value is true if success, else it is false. */
417     int est_db_sync(ESTDB *db);
418    
419    
420     /* Optimize a database.
421     `db' specifies a database object connected as a writer.
422     `options' specifies options: `ESTOPTNOPURGE' to omit purging dispensable region of deleted
423     documents, `ESTOPTNODBOPT' to omit optimization of the database files. The two can be
424     specified at the same time by bitwise or.
425     The return value is true if success, else it is false. */
426     int est_db_optimize(ESTDB *db, int options);
427    
428    
429     /* Add a document to a database.
430     `db' specifies a database object connected as a writer.
431     `doc' specifies a document object. The document object should have the URI attribute.
432     `options' specifies options: `ESTPDCLEAN' to clean up dispensable regions of the overwritten
433     document.
434     The return value is true if success, else it is false.
435     If the URI attribute is same with an existing document in the database, the existing one is
436     deleted. */
437     int est_db_put_doc(ESTDB *db, ESTDOC *doc, int options);
438    
439    
440     /* Remove a document from a database.
441     `db' specifies a database object connected as a writer.
442     `id' specifies the ID number of a registered document.
443     `options' specifies options: `ESTODCLEAN' to clean up dispensable regions of the deleted
444     document.
445     The return value is true if success, else it is false. */
446     int est_db_out_doc(ESTDB *db, int id, int options);
447    
448    
449     /* Retrieve a document in a database.
450     `db' specifies a database object.
451     `id' specifies the ID number of a registered document.
452     `options' specifies options: `ESTGDNOATTR' to ignore attributes, `ESTGDNOTEXT' to ignore
453     the body text. The two can be specified at the same time by bitwise or.
454     The return value is a document object. On error, `NULL' is returned. */
455     ESTDOC *est_db_get_doc(ESTDB *db, int id, int options);
456    
457    
458     /* Retrieve the value of an attribute of a document in a database.
459     `db' specifies a database object.
460     `id' specifies the ID number of a registered document.
461     `name' specifies the name of an attribute.
462     The return value is the value of the attribute or `NULL' if it does not exist. Because the
463     region of the return value is allocated with the `malloc' call, it should be released with
464     the `free' call if it is no longer in use. */
465     char *est_db_get_doc_attr(ESTDB *db, int id, const char *name);
466    
467    
468     /* Get the ID of a document specified by URI.
469     `db' specifies a database object.
470     `uri' specifies the URI of a registered document.
471     The return value is the ID of the document. On error, -1 is returned. */
472     int est_db_uri_to_id(ESTDB *db, const char *uri);
473    
474    
475     /* Extract keywords of a document object.
476     `db' specifies a database object for TF-IDF tuning. If it is `NULL', it is not used.
477     `doc' specifies a document object.
478     `max' specifies the maximum number of keywords to be extracted.
479     The return value is a new map object of keywords and their scores in decimal string. Because
480     the object of the return value is opened with the function `cbmapopen', it should be closed
481     with the function `cbmapclose' if it is no longer in use. */
482     CBMAP *est_db_etch_doc(ESTDB *db, ESTDOC *doc, int max);
483    
484    
485     /* Initialize the iterator of a database.
486     `db' specifies a database object.
487     The return value is true if success, else it is false. */
488     int est_db_iter_init(ESTDB *db);
489    
490    
491     /* Get the next ID of the iterator of a database.
492     `db' specifies a database object.
493     The return value is the next ID. If there is no more document, 0 is returned. On error,
494     -1 is returned. */
495     int est_db_iter_next(ESTDB *db);
496    
497    
498     /* Get the name of a database.
499     `db' specifies a database object.
500     The return value is the name of the database. The life duration of the returned string is
501     synchronous with the one of the database object. */
502     const char *est_db_name(ESTDB *db);
503    
504    
505     /* Get the number of documents in a database.
506     `db' specifies a database object.
507     The return value is the number of documents in the database. */
508     int est_db_doc_num(ESTDB *db);
509    
510    
511     /* Get the number of unique words in a database.
512     `db' specifies a database object.
513     The return value is the number of unique words in the database. */
514     int est_db_word_num(ESTDB *db);
515    
516    
517     /* Get the size of a database.
518     `db' specifies a database object.
519     The return value is the size of the database. */
520     double est_db_size(ESTDB *db);
521    
522    
523     /* Search documents corresponding to a condition for a database.
524     `db' specifies a database object.
525     `cond' specifies a condition object.
526     `nump' specifies the pointer to a variable to which the number of elements in the result is
527     assigned.
528     `hints' specifies a map object into which the number of documents corresponding to each word
529     is stored. If a word is in a negative condition, the number is negative. The element whose
530     key is an empty string specifies the number of whole result. If it is `NULL', it is not used.
531     The return value is an array whose elements are ID numbers of corresponding documents.
532     This function never fails. Even if no document corresponds or an error occurs, an empty
533     array is returned. Because the region of the return value is allocated with the `malloc'
534     call, it should be released with the `free' call if it is no longer in use. */
535     int *est_db_search(ESTDB *db, ESTCOND *cond, int *nump, CBMAP *hints);
536    
537    
538     /* Set the maximum size of the cache memory of a database.
539     `db' specifies a database object.
540     `size' specifies the maximum size of the index cache. By default, it is 64MB. If it is not
541     more than 0, the current size is not changed.
542     `anum' specifies the maximum number of cached records for document attributes. By default, it
543     is 8192. If it is not more than 0, the current size is not changed.
544     `tnum' specifies the maximum number of cached records for document texts. By default, it is
545     1024. If it is not more than 0, the current size is not changed. */
546     void est_db_set_cache_size(ESTDB *db, size_t size, int anum, int tnum);
547    
548    
549     /* Set the special cache for narrowing and sorting with document attributes.
550     `db' specifies a database object.
551     `name' specifies the name of a document attribute.
552     `num' specifies the maximum number of cached records. */
553     void est_db_set_special_cache(ESTDB *db, const char *name, int num);
554    
555    
556    
557     /*************************************************************************************************
558     * features for experts
559     *************************************************************************************************/
560    
561    
562     #define _EST_VERSION "0.4.0"
563     #define _EST_LIBVER 200
564     #define _EST_PROTVER "0.9"
565    
566     enum { /* enumeration for languages */
567     ESTLANGEN, /* English */
568     ESTLANGJA, /* Japanese */
569     ESTLANGZH, /* Chinese */
570     ESTLANGKO, /* Korean */
571     ESTLANGMISC /* miscellaneous */
572     };
573    
574    
575     /* Break a sentence of text and extract words.
576     `text' specifies a sentence of text.
577     `list' specifies a list object to which extracted words are added.
578     `norm' specifies whether to normalize the text.
579     `tail' specifies whether to pick up oddness N-gram at the end. */
580     void est_break_text(const char *text, CBLIST *list, int norm, int tail);
581    
582    
583     /* Break a sentence of text and extract words using perfect N-gram analyzer.
584     `text' specifies a sentence of text.
585     `list' specifies a list object to which extracted words are added.
586     `norm' specifies whether to normalize the text.
587     `tail' specifies whether to pick up oddness N-gram at the end. */
588     void est_break_text_perfng(const char *text, CBLIST *list, int norm, int tail);
589    
590    
591     /* Convert the character encoding of a string.
592     `ptr' specifies the pointer to a region.
593     `size' specifies the size of the region. If it is negative, the size is assigned with
594     `strlen(ptr)'.
595     `icode' specifies the name of encoding of the input string.
596     `ocode' specifies the name of encoding of the output string.
597     `sp' specifies the pointer to a variable to which the size of the region of the return
598     value is assigned. If it is `NULL', it is not used.
599     `mp' specifies the pointer to a variable to which the number of missing characters by failure
600     of conversion is assigned. If it is `NULL', it is not used.
601     If successful, the return value is the pointer to the result object, else, it is `NULL'.
602     Because an additional zero code is appended at the end of the region of the return value,
603     the return value can be treated as a character string. Because the region of the return
604     value is allocated with the `malloc' call, it should be released with the `free' call if it
605     is no longer in use. */
606     char *est_iconv(const char *ptr, int size, const char *icode, const char *ocode,
607     int *sp, int *mp);
608    
609    
610     /* Detect the encoding of a string automatically.
611     `ptr' specifies the pointer to a region.
612     `size' specifies the size of the region. If it is negative, the size is assigned with
613     `strlen(ptr)'.
614     `plang' specifies a preferred language. As for now, `ESTLANGEN', `ESTLANGJA', `ESTLANGZH',
615     and `ESTLANGKO' are supported.
616     The return value is the string of the encoding name of the string. */
617     const char *est_enc_name(const char *ptr, int size, int plang);
618    
619    
620     /* Convert a UTF-8 string into UTF-16BE.
621     `ptr' specifies the pointer to a region.
622     `size' specifies the size of the region.
623     `sp' specifies the pointer to a variable to which the size of the region of the return
624     value is assigned.
625     The return value is the pointer to the result object. Because an additional zero code is
626     appended at the end of the region of the return value, the return value can be treated as a
627     character string. Because the region of the return value is allocated with the `malloc' call,
628     it should be released with the `free' call if it is no longer in use. */
629     char *est_uconv_in(const char *ptr, int size, int *sp);
630    
631    
632     /* Convert a UTF-16BE string into UTF-8.
633     `ptr' specifies the pointer to a region.
634     `size' specifies the size of the region.
635     `sp' specifies the pointer to a variable to which the size of the region of the return
636     value is assigned. If it is `NULL', it is not used.
637     The return value is the pointer to the result object. Because an additional zero code is
638     appended at the end of the region of the return value, the return value can be treated as a
639     character string. Because the region of the return value is allocated with the `malloc' call,
640     it should be released with the `free' call if it is no longer in use. */
641     char *est_uconv_out(const char *ptr, int size, int *sp);
642    
643    
644     /* Compress a serial object with ZLIB.
645     `ptr' specifies the pointer to a region.
646     `size' specifies the size of the region. If it is negative, the size is assigned with
647     `strlen(ptr)'.
648     `sp' specifies the pointer to a variable to which the size of the region of the return
649     value is assigned.
650     If successful, the return value is the pointer to the result object, else, it is `NULL'.
651     Because the region of the return value is allocated with the `malloc' call, it should be
652     released with the `free' call if it is no longer in use. */
653     char *est_deflate(const char *ptr, int size, int *sp);
654    
655    
656     /* Decompress a serial object compressed with ZLIB.
657     `ptr' specifies the pointer to a region.
658     `size' specifies the size of the region.
659     `sp' specifies the pointer to a variable to which the size of the region of the return
660     value is assigned. If it is `NULL', it is not used.
661     If successful, the return value is the pointer to the result object, else, it is `NULL'.
662     Because an additional zero code is appended at the end of the region of the return value,
663     the return value can be treated as a character string. Because the region of the return
664     value is allocated with the `malloc' call, it should be released with the `free' call if it
665     is no longer in use. */
666     char *est_inflate(const char *ptr, int size, int *sp);
667    
668    
669     /* Get the border string for draft data of documents.
670     The return value is the border string for draft data of documents. */
671     const char *est_border_str(void);
672    
673    
674     /* Get the real random number.
675     The return value is the real random number between 0.0 and 1.0. */
676     double est_random(void);
677    
678    
679     /* Get the random number in normal distribution.
680     The return value is the random number in normal distribution between 0.0 and 1.0. */
681     double est_random_nd(void);
682    
683    
684     /* Get an MD5 hash string of a key string.
685     `key' specifies a string to be encrypted.
686     The return value is an MD5 hash string of the key string. Because the region of the return
687     value is allocated with the `malloc' call, it should be released with the `free' call if it
688     is no longer in use. */
689     char *est_make_crypt(const char *key);
690    
691    
692     /* Check whether a key matches an MD5 hash string.
693     `key' specifies a string to be checked.
694     `hash' specifies an MD5 hash string.
695     The return value is true if the key matches the hash string, else it is false. */
696     int est_match_crypt(const char *key, const char *hash);
697    
698    
699     /* Get the hidden texts of a document object.
700     `doc' specifies a document object.
701     The return value is concatenated sentences of the hidden text of the document object. The
702     life duration of the returned string is synchronous with the one of the document object. */
703     const char *est_doc_hidden_texts(ESTDOC *doc);
704    
705    
706     /* Get the phrase of a condition object.
707     `cond' specifies a condition object.
708     The return value is the phrase of a condition object or `NULL' if it is not specified. The
709     life duration of the returned string is synchronous with the one of the condition object. */
710     const char *est_cond_phrase(ESTCOND *cond);
711    
712    
713     /* Get a list object of attribute expressions of a condition object.
714     `cond' specifies a condition object.
715     The return value is a list object of attribute expressions of a condition object or `NULL' if
716     it is not specified. The life duration of the returned object is synchronous with the one of
717     the condition object. */
718     const CBLIST *est_cond_attrs(ESTCOND *cond);
719    
720    
721     /* Get the order expression of a condition object.
722     `cond' specifies a condition object.
723     The return value is the order expression of a condition object or `NULL' if it is not
724     specified. The life duration of the returned string is synchronous with the one of the
725     condition object. */
726     const char *est_cond_order(ESTCOND *cond);
727    
728    
729     /* Get the maximum number of retrieval of a condition object.
730     `cond' specifies a condition object.
731     The return value is the maximum number of retrieval of a condition object or -1 if it is not
732     specified. */
733     int est_cond_max(ESTCOND *cond);
734    
735    
736     /* Get the options of a condition object.
737     `cond' specifies a condition object.
738     The return value is the options of a condition object. */
739     int est_cond_options(ESTCOND *cond);
740    
741    
742     /* Get the score of a document corresponding to a condition object.
743     `cond' specifies a condition object.
744     `index' specifies the index of an element of the result array of `est_db_search'.
745     The return value is the score of the element or -1 if the index is out of bounds. */
746     int est_cond_score(ESTCOND *cond, int index);
747    
748    
749     /* Set the error code of a database.
750     `db' specifies a database object.
751     `ecode' specifies an error code to set. */
752     void est_db_set_ecode(ESTDB *db, int ecode);
753    
754    
755     /* Edit attributes of a document object in a database.
756     `db' specifies a database object connected as a writer.
757     `doc' specifies a document object.
758     The return value is true if success, else it is false. */
759     int est_db_edit_doc(ESTDB *db, ESTDOC *doc);
760    
761    
762     /* Add a piece of meta data to a database.
763     `db' specifies a database object connected as a writer.
764     `name' specifies the name of a piece of meta data.
765     `value' specifies the value of the meta data. If it is `NULL', the meta data is removed. */
766     void est_db_add_meta(ESTDB *db, const char *name, const char *value);
767    
768    
769     /* Get a list of names of meta data of a database.
770     `db' specifies a database object.
771     The return value is a new list object of meta data names of the database. Because the
772     object of the return value is opened with the function `cblistopen', it should be closed with
773     the function `cblistclose' if it is no longer in use. */
774     CBLIST *est_db_meta_names(ESTDB *db);
775    
776    
777     /* Get the value of a piece of meta data of a database.
778     `db' specifies a database object.
779     `name' specifies the name of a piece of meta data.
780     The return value is the value of the meta data or `NULL' if it does not exist. Because the
781     region of the return value is allocated with the `malloc' call, it should be released with
782     the `free' call if it is no longer in use. */
783     char *est_db_meta(ESTDB *db, const char *name);
784    
785    
786     /* Get the number of records in the cache memory of a database.
787     `db' specifies a database object.
788     The return value is the number of records in the cache memory of the database. */
789     int est_db_cache_num(ESTDB *db);
790    
791    
792     /* Set the callback function to inform of database events.
793     `db' specifies a database object.
794     `func' specifies the pointer to a function. The argument of the callback specifies a message
795     of each event. */
796     void est_db_set_informer(ESTDB *db, void (*func)(const char *));
797    
798    
799     /* Set the callback function to create a vector of keywords of a document.
800     `db' specifies a database object.
801     `func' specifies the pointer to a function. The arguments of the callback specify the
802     database object, the ID of a document, and an arbitrary pointer. The return value of the
803     callback is a new map object conforming to the return value of `est_db_etch_doc'.
804     `data' specifies the pointer to an object given as the third argument of the callback. */
805     void est_db_set_vectorizer(ESTDB *db, CBMAP *(*func)(void *, int, void *), void *data);
806    
807    
808     /* Fill the cache for keys for TF-IDF.
809     `db' specifies a database object. */
810     void est_db_fill_key_cache(ESTDB *db);
811    
812    
813     /* Make a directory.
814     `path' specifies the path of a new directory.
815     The return value is true if success, else it is false. */
816     int est_mkdir(const char *path);
817    
818    
819     /* Remove a directory and its contents recursively.
820     `path' specifies the path of a directory.
821     The return value is true if success, else it is false. */
822     int est_rmdir_rec(const char *path);
823    
824    
825     /* Get the canonicalized absolute pathname of a file.
826     `path' specifies the path of a file.
827     The return value is the canonicalized absolute pathname of a file. Because the region of the
828     return value is allocated with the `malloc' call, it should be released with the `free' call
829     if it is no longer in use. */
830     char *est_realpath(const char *path);
831    
832    
833     /* Get the time of day in milliseconds.
834     The return value is the time of day in milliseconds. */
835     double est_gettimeofday(void);
836    
837    
838     /* Suspend execution for microsecond intervals.
839     `usec' specifies microseconds to sleep for. */
840     void est_usleep(unsigned long usec);
841    
842    
843     /* Send a signal to a process.
844     `pid' specifies the PID of a target process.
845     `sig' specifies a signal code.
846     The return value is true if success, else it is false. */
847     int est_kill(int pid, int sig);
848    
849    
850     /* Get the media type of an extension.
851     `ext' specifies the extension of a file path.
852     The return value is the media type of the extension. */
853     const char *est_ext_type(const char *ext);
854    
855    
856    
857     #if defined(__cplusplus) /* export for C++ */
858     }
859     #endif
860    
861     #endif /* duplication check */
862    
863    
864     /* END OF FILE */

  ViewVC Help
Powered by ViewVC 1.1.26