1 |
/** |
2 |
* HyperEstraierWrapper.cpp - C++ wrapper for Hyper Estraier |
3 |
*/ |
4 |
#include <estraier.h> |
5 |
#include <estmtdb.h> |
6 |
#include <cabin.h> |
7 |
#include <cstdlib> |
8 |
#include <string> |
9 |
#include <vector> |
10 |
#include <map> |
11 |
#include <cassert> |
12 |
#include <stdexcept> |
13 |
#include <estnode.h> |
14 |
|
15 |
/* backward compatibility for 0.5.4 */ |
16 |
/* |
17 |
#ifndef ESTCONDAGITO |
18 |
#define ESTCONDAGITO ESTCONDAGIT |
19 |
#endif |
20 |
*/ |
21 |
|
22 |
namespace estraier { |
23 |
|
24 |
/**
 * Exception type used by this wrapper to report I/O-level failures,
 * e.g. operating on a closed database or failing to set up a node.
 * It carries only the message handed to std::runtime_error.
 */
class IOError : public std::runtime_error {
public:
    explicit IOError(const std::string& message)
        : std::runtime_error(message) {
    }
};
28 |
|
29 |
class Condition { |
30 |
public: |
31 |
enum { // enumeration for options |
32 |
SURE = ESTCONDSURE, // check every N-gram key |
33 |
USUAL = ESTCONDUSUAL, // check N-gram keys skipping by one |
34 |
FAST = ESTCONDFAST, // check N-gram keys skipping by two |
35 |
AGITO = ESTCONDAGITO, // check N-gram keys skipping by three |
36 |
NOIDF = ESTCONDNOIDF, // without TF-IDF tuning |
37 |
SIMPLE = ESTCONDSIMPLE, // with the simplefied phrase |
38 |
}; |
39 |
ESTCOND * cond; |
40 |
Condition() { |
41 |
/** |
42 |
* constructor |
43 |
*/ |
44 |
cond = est_cond_new(); |
45 |
} |
46 |
~Condition() { |
47 |
/** |
48 |
* destructor |
49 |
*/ |
50 |
est_cond_delete(cond); |
51 |
} |
52 |
void set_phrase(const char *phrase) { |
53 |
/** |
54 |
* set the search phrase |
55 |
*/ |
56 |
est_cond_set_phrase(cond, phrase); |
57 |
} |
58 |
void add_attr(const char *expr) { |
59 |
/** |
60 |
* set the attribute expression |
61 |
*/ |
62 |
est_cond_add_attr(cond, expr); |
63 |
} |
64 |
void set_order(const char *expr) { |
65 |
/** |
66 |
* set the order of a condition object |
67 |
*/ |
68 |
est_cond_set_order(cond, expr); |
69 |
} |
70 |
void set_max(int _max) { |
71 |
/** |
72 |
* set the maximum number of retrieval of a condition object |
73 |
*/ |
74 |
est_cond_set_max(cond, _max); |
75 |
} |
76 |
void set_options(int options) { |
77 |
/** |
78 |
* set options of retrieval of a condition object |
79 |
*/ |
80 |
est_cond_set_options(cond, options); |
81 |
} |
82 |
}; |
83 |
|
84 |
class Document { |
85 |
private: |
86 |
std::string text_buf; |
87 |
public: |
88 |
ESTDOC *doc; |
89 |
Document() { |
90 |
/** |
91 |
* constructor |
92 |
*/ |
93 |
doc = est_doc_new(); |
94 |
} |
95 |
Document(const char* draft) { |
96 |
/** |
97 |
* constructor |
98 |
*/ |
99 |
doc = est_doc_new_from_draft(draft); |
100 |
} |
101 |
Document(ESTDOC *_doc) { |
102 |
/** |
103 |
* constructor |
104 |
*/ |
105 |
doc = _doc; |
106 |
} |
107 |
~Document() { |
108 |
/** |
109 |
* destructor |
110 |
*/ |
111 |
est_doc_delete(doc); |
112 |
} |
113 |
void add_attr(const char * name, const char*value) { |
114 |
/** |
115 |
* add an attribute to a document object |
116 |
*/ |
117 |
est_doc_add_attr(doc, name, value); |
118 |
} |
119 |
void add_text(const char *text) { |
120 |
/** |
121 |
* add a sentence of text to a document object |
122 |
*/ |
123 |
est_doc_add_text(doc, text); |
124 |
} |
125 |
void add_hidden_text(const char * text) { |
126 |
/** |
127 |
* add a hidden sentence to a document object |
128 |
*/ |
129 |
est_doc_add_hidden_text(doc, text); |
130 |
} |
131 |
int id() { |
132 |
/** |
133 |
* get the ID number of a document object |
134 |
*/ |
135 |
return est_doc_id(doc); |
136 |
} |
137 |
std::vector<std::string> * attr_names() { |
138 |
/** |
139 |
* get a list of attribute names of a document object |
140 |
*/ |
141 |
std::vector<std::string> * vs = new std::vector<std::string>; |
142 |
CBLIST * attr_names = est_doc_attr_names(doc); |
143 |
for (int i=0; i < cblistnum(attr_names); i++) { |
144 |
vs->push_back(cblistval(attr_names, i, NULL)); |
145 |
} |
146 |
cblistclose(attr_names); |
147 |
return vs; |
148 |
} |
149 |
const char * attr(const char *name) { |
150 |
/** |
151 |
* get the value of an attribute of a document object |
152 |
*/ |
153 |
return est_doc_attr(doc, name); |
154 |
} |
155 |
const char * cat_texts() { |
156 |
/** |
157 |
* get a list of sentences of the text of a document object |
158 |
*/ |
159 |
return est_doc_cat_texts(doc); |
160 |
} |
161 |
std::vector<std::string>* texts() { |
162 |
/** |
163 |
* get a list of sentences of the text of a document object |
164 |
*/ |
165 |
std::vector<std::string> * vs = new std::vector<std::string>; |
166 |
const CBLIST *texts; |
167 |
texts = est_doc_texts(doc); |
168 |
for(int i = 0; i < cblistnum(texts); i++) { |
169 |
vs->push_back(cblistval(texts, i, NULL)); |
170 |
} |
171 |
return vs; |
172 |
} |
173 |
const char * dump_draft() { |
174 |
/** |
175 |
* dump draft data of a document object |
176 |
*/ |
177 |
return est_doc_dump_draft(doc); |
178 |
} |
179 |
const char * make_snippet(std::vector<std::string> _words, int wwidth, int hwidth, int awidth) { |
180 |
/** |
181 |
* make a snippet of the body text of a document object |
182 |
*/ |
183 |
CBLIST * words; |
184 |
std::vector<std::string>::iterator iter; |
185 |
words = cblistopen(); |
186 |
for (iter = _words.begin(); _words.end() != iter; iter++) { |
187 |
cblistpush(words, iter->c_str(), -1); |
188 |
} |
189 |
const char *result = est_doc_make_snippet(doc, words, wwidth, hwidth, awidth); |
190 |
cblistclose(words); |
191 |
return result; |
192 |
} |
193 |
const char * hidden_texts() { |
194 |
/** |
195 |
* get the hidden texts of a document object. |
196 |
*/ |
197 |
return est_doc_hidden_texts(doc); |
198 |
} |
199 |
}; |
200 |
|
201 |
class Database { |
202 |
private: |
203 |
ESTMTDB *db; |
204 |
int ecode; |
205 |
public: |
206 |
enum { // enumeration for error codes |
207 |
ERRNOERR = ESTENOERR, // no error |
208 |
ERRINVAL = ESTEINVAL, // invalid argument |
209 |
ERRACCES = ESTEACCES, // access forbidden |
210 |
ERRLOCK = ESTELOCK, // lock failure |
211 |
ERRDB = ESTEDB, // database problem |
212 |
ERRIO = ESTEIO, // I/O problem |
213 |
ERRNOITEM = ESTENOITEM, // no item |
214 |
ERRMISC = ESTEMISC // miscellaneous |
215 |
}; |
216 |
enum { // enumeration for open modes |
217 |
DBREADER = ESTDBREADER, // open as a reader |
218 |
DBWRITER = ESTDBWRITER, // open as a writer |
219 |
DBCREAT = ESTDBCREAT, // a writer creating |
220 |
DBTRUNC = ESTDBTRUNC, // a writer truncating |
221 |
DBNOLCK = ESTDBNOLCK, // open without locking |
222 |
DBLCKNB = ESTDBLCKNB, // lock without blocking |
223 |
DBPERFNG = ESTDBPERFNG // use perfect N-gram analyzer |
224 |
}; |
225 |
enum { // enumeration for options of document registration |
226 |
PDCLEAN = ESTPDCLEAN // clean up dispensable regions |
227 |
}; |
228 |
enum { // enumeration for options of document deletion |
229 |
ODCLEAN = ESTODCLEAN // clean up dispensable regions |
230 |
}; |
231 |
enum { // enumeration for options of optimization |
232 |
OPTNOPURGE = ESTOPTNOPURGE, // omit purging dispensable region of deleted |
233 |
OPTNODBOPT = ESTOPTNODBOPT // omit optimizization of the database files |
234 |
}; |
235 |
enum { // enumeration for options of document retrieval |
236 |
GDNOATTR = ESTGDNOATTR, // no attributes |
237 |
GDNOTEXT = ESTGDNOTEXT // no text |
238 |
}; |
239 |
Database() { |
240 |
/** |
241 |
* constructor(dummy) |
242 |
*/ |
243 |
db = NULL; |
244 |
ecode = ERRNOERR; |
245 |
} |
246 |
~Database() { |
247 |
if (db) close(); |
248 |
} |
249 |
bool open(const char * dbname, int mode) { |
250 |
/** |
251 |
* open the database |
252 |
*/ |
253 |
if (db) close(); |
254 |
int ec; |
255 |
db = est_mtdb_open(dbname, mode, &ec); |
256 |
if (!db) ecode = ec; |
257 |
return db; |
258 |
} |
259 |
bool close() { |
260 |
/** |
261 |
* close the database |
262 |
*/ |
263 |
if (!db) throw IOError("closed database"); |
264 |
int ec; |
265 |
bool result = est_mtdb_close(db, &ec); |
266 |
if (!result) ecode = ec; |
267 |
db = NULL; |
268 |
return result; |
269 |
} |
270 |
bool put_doc(Document *doc, int options) { |
271 |
/** |
272 |
* add a document to a database |
273 |
*/ |
274 |
if (!db) throw IOError("closed database"); |
275 |
bool result = est_mtdb_put_doc(db, doc->doc, options); |
276 |
if (!result) ecode = est_mtdb_error(db); |
277 |
return result; |
278 |
} |
279 |
std::vector<int> * search(Condition * cond, int options) { |
280 |
/** |
281 |
* search documents corresponding a condition for a database |
282 |
*/ |
283 |
if (!db) throw IOError("closed database"); |
284 |
int resnum; |
285 |
int * result = est_mtdb_search(db, cond->cond, &resnum, NULL); |
286 |
std::vector<int> *numbers = new std::vector<int>; |
287 |
for (int i=0; i<resnum; i++) { |
288 |
numbers->push_back(result[i]); |
289 |
} |
290 |
return numbers; |
291 |
} |
292 |
static const char * err_msg(int ecode) { |
293 |
/** |
294 |
* get the string of an error |
295 |
*/ |
296 |
return est_err_msg(ecode); |
297 |
} |
298 |
int error() { |
299 |
/** |
300 |
* get the last happended error code of a database |
301 |
*/ |
302 |
return ecode; |
303 |
} |
304 |
bool fatal() { |
305 |
/** |
306 |
* check whether a database has a fatal error |
307 |
*/ |
308 |
if (!db) throw IOError("closed database"); |
309 |
return est_mtdb_fatal(db); |
310 |
} |
311 |
bool flush(int _max) { |
312 |
/** |
313 |
* flush index words in the cache of a database |
314 |
*/ |
315 |
if (!db) throw IOError("closed database"); |
316 |
bool result = est_mtdb_flush(db, _max); |
317 |
if (!result) ecode = est_mtdb_error(db); |
318 |
return result; |
319 |
} |
320 |
bool sync() { |
321 |
/** |
322 |
* synchronize updating contents of a database |
323 |
*/ |
324 |
if (!db) throw IOError("closed database"); |
325 |
bool result = est_mtdb_sync(db); |
326 |
if (!result) ecode = est_mtdb_error(db); |
327 |
return result; |
328 |
} |
329 |
bool optimize(int options) { |
330 |
/** |
331 |
* optimize a database |
332 |
*/ |
333 |
if (!db) throw IOError("closed database"); |
334 |
bool result = est_mtdb_optimize(db, options); |
335 |
if (!result) ecode = est_mtdb_error(db); |
336 |
return result; |
337 |
} |
338 |
bool out_doc(int id, int options) { |
339 |
/** |
340 |
* remove a document from a database |
341 |
*/ |
342 |
if (!db) throw IOError("closed database"); |
343 |
bool result = est_mtdb_out_doc(db, id, options); |
344 |
if (!result) ecode = est_mtdb_error(db); |
345 |
return result; |
346 |
} |
347 |
bool edit_doc(Document *doc) { |
348 |
/** |
349 |
* edit an attribute of a document in a database |
350 |
*/ |
351 |
if (!db) throw IOError("closed database"); |
352 |
bool result = est_mtdb_edit_doc(db, doc->doc); |
353 |
if (!result) ecode = est_mtdb_error(db); |
354 |
return result; |
355 |
} |
356 |
Document * get_doc(int id, int options) { |
357 |
/** |
358 |
* retrieve a document in a database |
359 |
*/ |
360 |
if (!db) throw IOError("closed database"); |
361 |
ESTDOC *doc = est_mtdb_get_doc(db, id, options); |
362 |
if (!doc) { |
363 |
ecode = est_mtdb_error(db); |
364 |
throw est_err_msg(est_mtdb_error(db)); |
365 |
} else { |
366 |
return new Document(doc); |
367 |
} |
368 |
} |
369 |
int uri_to_id(const char *uri) { |
370 |
/** |
371 |
* get the ID of a document spacified by URI |
372 |
*/ |
373 |
if (!db) throw IOError("closed database"); |
374 |
int result = est_mtdb_uri_to_id(db, uri); |
375 |
if(result == -1) ecode = est_mtdb_error(db); |
376 |
return result; |
377 |
} |
378 |
std::map<std::string, std::string> * etch_doc(Document * doc, int max) { |
379 |
/** |
380 |
* extract keywords of a document object |
381 |
*/ |
382 |
if (!db) throw IOError("closed database"); |
383 |
std::map<std::string, std::string> * mss = new std::map<std::string, std::string>; |
384 |
CBMAP * keys = est_mtdb_etch_doc(db, doc->doc, max); |
385 |
cbmapiterinit(keys); |
386 |
int ksiz; |
387 |
while (const char *key = cbmapiternext(keys, &ksiz)) { |
388 |
mss->insert(std::make_pair(key, cbmapget(keys, key, ksiz, NULL))); |
389 |
} |
390 |
return mss; |
391 |
} |
392 |
const char * name() { |
393 |
/** |
394 |
* get the name of a database |
395 |
*/ |
396 |
if (!db) throw IOError("closed database"); |
397 |
return est_mtdb_name(db); |
398 |
} |
399 |
int doc_num() { |
400 |
/** |
401 |
* get the number of documents in a database |
402 |
*/ |
403 |
if (!db) throw IOError("closed database"); |
404 |
return est_mtdb_doc_num(db); |
405 |
} |
406 |
int word_num() { |
407 |
/** |
408 |
* get the number of unique words in a database |
409 |
*/ |
410 |
if (!db) throw IOError("closed database"); |
411 |
return est_mtdb_word_num(db); |
412 |
} |
413 |
double size() { |
414 |
/** |
415 |
* get the size of a database |
416 |
*/ |
417 |
if (!db) throw IOError("closed database"); |
418 |
return est_mtdb_size(db); |
419 |
} |
420 |
void set_cache_size(size_t size, int anum, int tnum, int rnum) { |
421 |
/** |
422 |
* set the maximum size of the cache memory of a database |
423 |
*/ |
424 |
if (!db) throw IOError("closed database"); |
425 |
est_mtdb_set_cache_size(db, size, anum, tnum, rnum); |
426 |
} |
427 |
void set_special_cache(const char *name, int num) { |
428 |
/** |
429 |
* Set the special cache for narrowing and sorting |
430 |
* with document attributes |
431 |
*/ |
432 |
est_mtdb_set_special_cache(db, name, num); |
433 |
} |
434 |
}; |
435 |
|
436 |
static std::vector<std::string> * break_text(const char *text, bool norm, bool tail) { |
437 |
std::vector<std::string> * vs = new std::vector<std::string>; |
438 |
CBLIST *list; |
439 |
list = cblistopen(); |
440 |
est_break_text(text, list, norm, tail); |
441 |
for (int i=0; i < cblistnum(list); i++) { |
442 |
vs->push_back(cblistval(list, i, NULL)); |
443 |
} |
444 |
cblistclose(list); |
445 |
return vs; |
446 |
} |
447 |
|
448 |
static std::vector<std::string> * break_text_perfng(const char *text, bool norm, bool tail) { |
449 |
std::vector<std::string> * vs = new std::vector<std::string>; |
450 |
CBLIST *list; |
451 |
list = cblistopen(); |
452 |
est_break_text_perfng(text, list, norm, tail); |
453 |
for (int i=0; i < cblistnum(list); i++) { |
454 |
vs->push_back(cblistval(list, i, NULL)); |
455 |
} |
456 |
cblistclose(list); |
457 |
return vs; |
458 |
} |
459 |
|
460 |
class ResultDocument { |
461 |
public: |
462 |
ESTRESDOC *rdoc; |
463 |
ResultDocument(ESTRESDOC *_rdoc) { |
464 |
rdoc = _rdoc; |
465 |
} |
466 |
const char *uri(void) { |
467 |
return est_resdoc_uri(rdoc); |
468 |
} |
469 |
std::vector<std::string> * attr_names() { |
470 |
std::vector<std::string> * vs = new std::vector<std::string>; |
471 |
CBLIST * attr_names = est_resdoc_attr_names(rdoc); |
472 |
for (int i=0; i < cblistnum(attr_names); i++) { |
473 |
vs->push_back(cblistval(attr_names, i, NULL)); |
474 |
} |
475 |
cblistclose(attr_names); |
476 |
return vs; |
477 |
} |
478 |
const char *attr(const char *name) { |
479 |
return est_resdoc_attr(rdoc, name); |
480 |
} |
481 |
const char *snippet(void) { |
482 |
return est_resdoc_snippet(rdoc); |
483 |
} |
484 |
}; |
485 |
|
486 |
class NodeRes { |
487 |
private: |
488 |
ESTNODERES *nres; |
489 |
public: |
490 |
NodeRes(ESTNODE *node, Condition *cond, int depth) { |
491 |
nres = est_node_search(node, cond->cond, depth); |
492 |
} |
493 |
~NodeRes() { |
494 |
est_noderes_delete(nres); |
495 |
} |
496 |
std::map<std::string, std::string> * hints(void) { |
497 |
std::map<std::string, std::string> * hints = new std::map<std::string, std::string>; |
498 |
CBMAP * keys = est_noderes_hints(nres); |
499 |
cbmapiterinit(keys); |
500 |
int ksiz; |
501 |
while (const char *key = cbmapiternext(keys, &ksiz)) { |
502 |
hints->insert(std::make_pair(key, cbmapget(keys, key, ksiz, NULL))); |
503 |
} |
504 |
return hints; |
505 |
} |
506 |
int doc_num(void) { |
507 |
return est_noderes_doc_num(nres); |
508 |
} |
509 |
ResultDocument * get_doc(int index) { |
510 |
ESTRESDOC *rdoc = est_noderes_get_doc(nres, index); |
511 |
if (rdoc) { |
512 |
return new ResultDocument(rdoc); |
513 |
} else { |
514 |
return NULL; |
515 |
} |
516 |
} |
517 |
}; |
518 |
|
519 |
class Node { |
520 |
private: |
521 |
ESTNODE *node; |
522 |
int netenv_ok; |
523 |
public: |
524 |
Node(const char *url) { |
525 |
netenv_ok = est_init_net_env(); |
526 |
if (! netenv_ok) throw IOError("can't init net env"); |
527 |
node = est_node_new(url); |
528 |
if (! node) throw IOError("can't create node"); |
529 |
} |
530 |
~Node() { |
531 |
est_node_delete(node); |
532 |
est_free_net_env(); |
533 |
} |
534 |
void set_proxy(const char *host, int port) { |
535 |
est_node_set_proxy(node, host, port); |
536 |
} |
537 |
void set_timeout(int sec) { |
538 |
est_node_set_timeout(node, sec); |
539 |
} |
540 |
void set_auth(const char *name, const char *passwd) { |
541 |
est_node_set_auth(node, name, passwd); |
542 |
} |
543 |
int status(void) { |
544 |
return est_node_status(node); |
545 |
} |
546 |
bool put_doc(Document *doc) { |
547 |
return est_node_put_doc(node, doc->doc); |
548 |
} |
549 |
bool out_doc(int id) { |
550 |
return est_node_out_doc(node, id); |
551 |
} |
552 |
bool out_doc_by_uri(const char *uri) { |
553 |
return est_node_out_doc_by_uri(node, uri); |
554 |
} |
555 |
#ifdef est_node_edit_doc |
556 |
bool edit_doc(Document *doc) { |
557 |
return est_node_edit_doc(node, doc->doc); |
558 |
} |
559 |
#endif |
560 |
Document * get_doc(int id) { |
561 |
ESTDOC *doc = est_node_get_doc(node, id); |
562 |
if (!doc) { |
563 |
return NULL; |
564 |
} else { |
565 |
return new Document(doc); |
566 |
} |
567 |
} |
568 |
Document * get_doc_by_uri(const char *uri) { |
569 |
ESTDOC *doc = est_node_get_doc_by_uri(node, uri); |
570 |
if (!doc) { |
571 |
return NULL; |
572 |
} else { |
573 |
return new Document(doc); |
574 |
} |
575 |
} |
576 |
char * get_doc_attr(int id, const char *name) { |
577 |
/* is this leeking memory? shouldn't I create |
578 |
* object and free memory region returned? |
579 |
*/ |
580 |
return est_node_get_doc_attr(node, id, name); |
581 |
} |
582 |
char * get_doc_attr_by_uri(const char *uri, const char *name) { |
583 |
return est_node_get_doc_attr_by_uri(node, uri, name); |
584 |
} |
585 |
int uri_to_id(const char *uri) { |
586 |
return est_node_uri_to_id(node, uri); |
587 |
} |
588 |
const char * name(void) { |
589 |
return est_node_name(node); |
590 |
} |
591 |
const char * label(void) { |
592 |
return est_node_label(node); |
593 |
} |
594 |
int doc_num(void) { |
595 |
return est_node_doc_num(node); |
596 |
} |
597 |
int word_num(void) { |
598 |
return est_node_word_num(node); |
599 |
} |
600 |
double size(void) { |
601 |
return est_node_size(node); |
602 |
} |
603 |
NodeRes * search(Condition *cond, int depth) { |
604 |
return new NodeRes(node, cond, depth); |
605 |
} |
606 |
int set_user(const char *name, int mode) { |
607 |
return est_node_set_user(node, name, mode); |
608 |
} |
609 |
int set_link(const char *url, const char *label, int credit) { |
610 |
return est_node_set_link(node, url, label, credit); |
611 |
} |
612 |
}; |
613 |
|
614 |
}; |