/[hyperestraier]/upstream/0.5.3/estcmd.c
This is a repository of my old source code, which is no longer updated. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /upstream/0.5.3/estcmd.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 10 - (show annotations)
Wed Aug 3 15:25:48 2005 UTC (18 years, 10 months ago) by dpavlin
File MIME type: text/plain
File size: 106455 byte(s)
import of upstream 0.5.3

1 /*************************************************************************************************
2 * The command line interface for the core API
3 * Copyright (C) 2004-2005 Mikio Hirabayashi
4 * This file is part of Hyper Estraier.
5 * Hyper Estraier is free software; you can redistribute it and/or modify it under the terms of
6 * the GNU Lesser General Public License as published by the Free Software Foundation; either
7 * version 2.1 of the License or any later version. Hyper Estraier is distributed in the hope
8 * that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
10 * License for more details.
11 * You should have received a copy of the GNU Lesser General Public License along with Hyper
12 * Estraier; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
13 * Boston, MA 02111-1307 USA.
14 *************************************************************************************************/
15
16
17 #include "estraier.h"
18 #include "myconf.h"
19
#define NUMBUFSIZ      32                /* size of a buffer for a number */
#define URIBUFSIZ      8192              /* size of a buffer for a URI */
#define MINIBNUM       31                /* bucket number of a small map */
#define SEARCHMAX      10                /* default maximum number of shown documents */
#define SNIPWWIDTH     480               /* whole width of the snippet */
#define SNIPHWIDTH     96                /* width of the beginning of the text */
#define SNIPAWIDTH     96                /* width around each highlighted word */
#define CACHEMAX       (512*1024*1024)   /* maximum cache size in bytes (512 * 1024 * 1024) */
#define DATTRLPATH     "_lpath"          /* name of the attribute of the local path */
#define DATTRLFILE     "_lfile"          /* name of the attribute of the local file name */
#define DATTRSCORE     "#score"          /* name of the pseudo-attribute of score */
#define DATTRKWORDS    "#kwords"         /* name of the pseudo-attribute of keywords */
#define KWDBNAME       "kwords"          /* name of the database for keywords */
#define KWDBBNUM       122869            /* bucket number of the keyword database */
#define KWDBDNUM       3                 /* division number of the keyword database */
#define KWORDNUM       32                /* default number of shown keywords */
#define RDOCSNUM       6                 /* number of sections of a random document */
#define RDOCCNUM       256               /* number of characters in a section */
38
/* Viewing modes stored in `g_viewmode'.  VM_ID is the initial value; the
   others are selected by options of the search command. */
enum {                                   /* enumeration for viewing modes */
  VM_ID,                                 /* ID only (default) */
  VM_URI,                                /* ID and URI (-vu) */
  VM_ATTR,                               /* all attributes (-va) */
  VM_FULL,                               /* all attributes and body text (-vf) */
  VM_SNIP,                               /* all attributes and snippet (-vs) */
  VM_HMRD,                               /* human readable (-vh) */
  VM_XML,                                /* XML (-vx) */
  VM_DUMP                                /* dump draft files (-dd) */
};
49
/* File formats stored in `g_filefmt'.  FF_AUTO is the initial value; the
   gather and draft commands override it from their options. */
enum {                                   /* enumeration for file formats */
  FF_AUTO,                               /* automatic detection (default) */
  FF_DRAFT,                              /* draft (-fe; also preset by the draft command) */
  FF_TEXT,                               /* plain text (-ft) */
  FF_HTML,                               /* HTML (-fh) */
  FF_MIME,                               /* MIME (-fm) */
  FF_NONE                                /* ignored (-fz) */
};
58
/* Kinds of generated test documents stored in `g_rdmode'; selected by options
   of the randput command. */
enum {                                   /* enumeration for test documents */
  RD_ENG,                                /* English (-ren) */
  RD_LAT,                                /* Latin (-rla) */
  RD_EURO,                               /* European mix (-reu) */
  RD_ORI,                                /* Oriental (-ror) */
  RD_JPN,                                /* Japanese (-rjp) */
  RD_CHAO,                               /* chaos (-rch) */
  RD_RAND                                /* selected at random (default) */
};
68
69
/* global variables: set by main() and the run*() argument parsers, read by the
   proc*() workers */
const char *g_progname;                  /* program name (argv[0]) */
/* NOTE(review): g_sigterm is written from sigtermhandler(); the C standard only
   guarantees that for `volatile sig_atomic_t' objects -- consider changing the type. */
int g_sigterm = FALSE;                   /* flag for termination signal */
int g_putopts = 0;                       /* options of registration */
int g_outopts = 0;                       /* options of deletion */
int g_optopts = 0;                       /* options of optimization */
const char *g_inputcode = "UTF-8";       /* input encoding (gather and draft reset it to NULL) */
int g_inputlang = ESTLANGEN;             /* preferred language */
const char *g_pathcode = NULL;           /* path encoding */
int g_pathfull = FALSE;                  /* whether to record full paths */
CBLIST *g_pathattrs = NULL;              /* names of elements in path extension */
int g_oextmodes = 0;                     /* extra open modes */
int g_viewmode = VM_ID;                  /* viewing mode */
int g_filefmt = FF_AUTO;                 /* file format */
CBMAP *g_xcmdmap = NULL;                 /* map of suffixes and filter commands */
int g_filtorig = FALSE;                  /* whether to use filter for original files */
int g_stdate = FALSE;                    /* whether to adopt date by stat */
int g_chkmdate = FALSE;                  /* whether to check modification date */
double g_cachesize = -1;                 /* size of the cache in bytes (-1 until -cs is given) */
int g_doforce = FALSE;                   /* whether to force purging or extracting */
int g_kwordnum = KWORDNUM;               /* number of keywords */
int g_rdmode = RD_RAND;                  /* mode of random documents */
92
93
/* function prototypes */
int main(int argc, char **argv);

/* diagnostics, signal handling and usage */
static void printferror(const char *format, ...);
static void printfinfo(const char *format, ...);
static void dbinform(const char *msg);
static void setsignals(void);
static void sigtermhandler(int num);
static void usage(void);

/* run*: parse the command line of one subcommand and call the matching proc* */
static int runput(int argc, char **argv);
static int runout(int argc, char **argv);
static int runget(int argc, char **argv);
static int runlist(int argc, char **argv);
static int runuriid(int argc, char **argv);
static int runmeta(int argc, char **argv);
static int runinform(int argc, char **argv);
static int runoptimize(int argc, char **argv);
static int runsearch(int argc, char **argv);
static int rungather(int argc, char **argv);
static int runpurge(int argc, char **argv);
static int runextkeys(int argc, char **argv);
static int rundraft(int argc, char **argv);
static int runbreak(int argc, char **argv);
static int runrandput(int argc, char **argv);
static int runwicked(int argc, char **argv);
static int runregression(int argc, char **argv);

/* proc*: perform one subcommand; the value is used as the exit status */
static int procput(const char *dbname, const char *filename);
static int procout(const char *dbname, int id, const char *expr);
static int procget(const char *dbname, int id, const char *expr, const char *attr);
static int proclist(const char *dbname);
static int procuriid(const char *dbname, const char *uri);
static int procmeta(const char *dbname, const char *mname, const char *mvalue);
static int procinform(const char *dbname);
static int procoptimize(const char *dbname);
static int procsearch(const char *dbname, const char *phrase,
                      const CBLIST *attrs, const char *ord, int max, int opts, int sim);
static int procgather(const char *dbname, const char *filename);
static int procpurge(const char *dbname, const char *prefix);
static int procextkeys(const char *dbname, const char *prefix, int ni);
static int procdraft(const char *filename);
static int procbreak(const char *filename, int wt);
static int procrandput(const char *dbname, int dnum);
static int procwicked(const char *dbname, int dnum);
static int procregression(const char *dbname);

/* miscellaneous helpers and document builders */
static void xmlprintf(const char *format, ...);
static int strtolang(const char *str);
static char *fgetl(FILE *ifp);
static int doputdoc(ESTDB *db, const char *path, const CBLIST *attrs);
static const char *pathtourl(const char *path);
static const char *urltofile(const char *uri);
static char *urltopath(const char *uri);
static CBMAP *vectorizer(void *db, int id, void *kwdb);
static ESTDOC *est_doc_new_with_xcmd(const char *buf, int size, const char *path,
                                     const char *xcmd, const char *tmpdir,
                                     const char *penc, int plang);
static ESTDOC *est_doc_new_from_draft_enc(const char *buf, int size, const char *enc);
static ESTDOC *est_doc_new_from_text(const char *buf, int size, const char *penc, int plang);
static ESTDOC *est_doc_new_from_html(const char *buf, int size, const char *penc, int plang);
static char *est_html_enc(const char *str);
static char *est_html_raw_text(const char *html);
static ESTDOC *est_doc_new_from_mime(const char *buf, int size, const char *penc, int plang);
static void est_doc_add_attr_mime(ESTDOC *doc, const char *name, const char *value);
static ESTDOC *est_doc_new_from_chaos(int cnum, int snum, int mode);
static char *est_random_str(int cnum, int mode);
157
158
159 /* main routine */
160 int main(int argc, char **argv){
161 const char *tmp;
162 int rv;
163 if((tmp = getenv("ESTDBGFD")) != NULL) dpdbgfd = atoi(tmp);
164 cbstdiobin();
165 g_progname = argv[0];
166 g_sigterm = FALSE;
167 if(argc < 2) usage();
168 rv = 0;
169 if(!strcmp(argv[1], "put")){
170 setsignals();
171 rv = runput(argc, argv);
172 } else if(!strcmp(argv[1], "out")){
173 setsignals();
174 rv = runout(argc, argv);
175 } else if(!strcmp(argv[1], "get")){
176 rv = runget(argc, argv);
177 } else if(!strcmp(argv[1], "list")){
178 rv = runlist(argc, argv);
179 } else if(!strcmp(argv[1], "uriid")){
180 rv = runuriid(argc, argv);
181 } else if(!strcmp(argv[1], "meta")){
182 setsignals();
183 rv = runmeta(argc, argv);
184 } else if(!strcmp(argv[1], "inform")){
185 rv = runinform(argc, argv);
186 } else if(!strcmp(argv[1], "optimize")){
187 setsignals();
188 rv = runoptimize(argc, argv);
189 } else if(!strcmp(argv[1], "search")){
190 rv = runsearch(argc, argv);
191 } else if(!strcmp(argv[1], "gather")){
192 setsignals();
193 rv = rungather(argc, argv);
194 } else if(!strcmp(argv[1], "purge")){
195 setsignals();
196 rv = runpurge(argc, argv);
197 } else if(!strcmp(argv[1], "extkeys")){
198 setsignals();
199 rv = runextkeys(argc, argv);
200 } else if(!strcmp(argv[1], "draft")){
201 rv = rundraft(argc, argv);
202 } else if(!strcmp(argv[1], "break")){
203 rv = runbreak(argc, argv);
204 } else if(!strcmp(argv[1], "randput")){
205 setsignals();
206 rv = runrandput(argc, argv);
207 } else if(!strcmp(argv[1], "wicked")){
208 setsignals();
209 rv = runwicked(argc, argv);
210 } else if(!strcmp(argv[1], "regression")){
211 setsignals();
212 rv = runregression(argc, argv);
213 } else if(!strcmp(argv[1], "version") || !strcmp(argv[1], "--version")){
214 printf("Hyper Estraier %s on %s\n", est_version, ESTSYSNAME);
215 printf("Copyright (C) 2004-2005 Mikio Hirabayashi.\n");
216 rv = 0;
217 } else {
218 usage();
219 }
220 return rv;
221 }
222
223
224 /* print formatted error string and flush the buffer */
225 static void printferror(const char *format, ...){
226 va_list ap;
227 va_start(ap, format);
228 fprintf(stderr, "%s: ERROR: ", g_progname);
229 vfprintf(stderr, format, ap);
230 fputc('\n', stderr);
231 fflush(stderr);
232 va_end(ap);
233 }
234
235
236 /* print formatted information string and flush the buffer */
237 static void printfinfo(const char *format, ...){
238 va_list ap;
239 va_start(ap, format);
240 printf("%s: INFO: ", g_progname);
241 vprintf(format, ap);
242 putchar('\n');
243 fflush(stdout);
244 va_end(ap);
245 }
246
247
/* callback function for database events: report `msg' as an INFO line */
static void dbinform(const char *msg)
{
  /* the message is passed as data, never as the format string */
  printfinfo("%s", msg);
}
252
253
254 /* set signal handlers */
255 static void setsignals(void){
256 signal(1, sigtermhandler);
257 signal(2, sigtermhandler);
258 signal(3, sigtermhandler);
259 signal(13, sigtermhandler);
260 signal(15, sigtermhandler);
261 }
262
263
264 /* handler of termination signal */
265 static void sigtermhandler(int num){
266 static int tries = 0;
267 if(tries++ <= 4){
268 signal(num, sigtermhandler);
269 } else {
270 signal(num, SIG_DFL);
271 }
272 g_sigterm = TRUE;
273 printfinfo("the termination signal %d catched", num);
274 }
275
276
277 /* print the usage and exit */
278 static void usage(void){
279 fprintf(stderr, "%s: command line utility for the core API of Hyper Estraier\n", g_progname);
280 fprintf(stderr, "\n");
281 fprintf(stderr, "usage:\n");
282 fprintf(stderr, " %s put [-cl] db [file]\n", g_progname);
283 fprintf(stderr, " %s out [-cl] db expr\n", g_progname);
284 fprintf(stderr, " %s get db expr\n", g_progname);
285 fprintf(stderr, " %s list db\n", g_progname);
286 fprintf(stderr, " %s uriid db uri\n", g_progname);
287 fprintf(stderr, " %s meta db [name [value]]\n", g_progname);
288 fprintf(stderr, " %s inform db\n", g_progname);
289 fprintf(stderr, " %s optimize [-onp] [-ond] db\n", g_progname);
290 fprintf(stderr, " %s search [-ic enc] [-vu|-va|-vf|-vs|-vh|-vx|-dd] [-gs|-gf|-ga]"
291 " [-ni] [-sf] [-hs] [-attr expr] [-ord expr] [-max num] [-sim id] db [phrase]\n",
292 g_progname);
293 fprintf(stderr, " %s gather [-cl] [-fe|-ft|-fh|-fm] [-fx sufs cmd] [-fz] [-fo]"
294 " [-ic enc] [-il lang] [-pc enc] [-pf] [-px name] [-apn] [-sd] [-cm] [-cs num]"
295 " db [file|dir]\n", g_progname);
296 fprintf(stderr, " %s purge [-cl] [-fc] db [prefix]\n", g_progname);
297 fprintf(stderr, " %s extkeys [-fc] [-ni] [-kn num] db [prefix]\n", g_progname);
298 fprintf(stderr, " %s draft [-ft|-fh|-fm] [-ic enc] [-il lang] [file]\n", g_progname);
299 fprintf(stderr, " %s break [-ic enc] [-il lang] [-apn] [-wt] [file]\n", g_progname);
300 fprintf(stderr, " %s randput [-ren|-rla|-reu|-ror|-rjp|-rch] [-cs num] db dnum\n",
301 g_progname);
302 fprintf(stderr, " %s wicked db dnum\n", g_progname);
303 fprintf(stderr, " %s regression db\n", g_progname);
304 fprintf(stderr, " %s version\n", g_progname);
305 fprintf(stderr, "\n");
306 exit(1);
307 }
308
309
310 /* parse arguments of the put command */
311 static int runput(int argc, char **argv){
312 char *dbname, *filename;
313 int i, rv;
314 dbname = NULL;
315 filename = NULL;
316 for(i = 2; i < argc; i++){
317 if(!dbname && argv[i][0] == '-'){
318 if(!strcmp(argv[i], "-cl")){
319 g_putopts |= ESTPDCLEAN;
320 } else {
321 usage();
322 }
323 } else if(!dbname){
324 dbname = argv[i];
325 } else if(!filename){
326 filename = argv[i];
327 } else {
328 usage();
329 }
330 }
331 if(!dbname) usage();
332 rv = procput(dbname, filename);
333 return rv;
334 }
335
336
337 /* parse arguments of the out command */
338 static int runout(int argc, char **argv){
339 char *dbname, *expr;
340 int i, id, rv;
341 dbname = NULL;
342 expr = NULL;
343 for(i = 2; i < argc; i++){
344 if(!dbname && argv[i][0] == '-'){
345 if(!strcmp(argv[i], "-cl")){
346 g_outopts |= ESTODCLEAN;
347 } else {
348 usage();
349 }
350 } else if(!dbname){
351 dbname = argv[i];
352 } else if(!expr){
353 expr = argv[i];
354 } else {
355 usage();
356 }
357 }
358 if(!dbname || !expr) usage();
359 if((id = atoi(expr)) > 0) expr = NULL;
360 rv = procout(dbname, id, expr);
361 return rv;
362 }
363
364
/* parse arguments of the get command.
   Syntax: get db expr [attr].  No options are accepted.  `expr' names the
   document either by ID or by URI: it is taken as an ID only when the whole
   string is a positive decimal integer that fits in an int; anything else is
   handed to procget() as a URI.  (The former atoi() check accepted any numeric
   prefix, so a URI such as "12abc" was looked up as document 12.)  The
   optional `attr' selects a single attribute instead of the whole document.
   Any syntax error ends in usage().
   The return value is the result of procget(). */
static int runget(int argc, char **argv){
  char *dbname, *expr, *attr, *end;
  long val;
  int i, id;
  dbname = NULL;
  expr = NULL;
  attr = NULL;
  for(i = 2; i < argc; i++){
    if(!dbname && argv[i][0] == '-'){
      usage();
    } else if(!dbname){
      dbname = argv[i];
    } else if(!expr){
      expr = argv[i];
    } else if(!attr){
      attr = argv[i];
    } else {
      usage();
    }
  }
  if(!dbname || !expr) usage();
  id = 0;
  val = strtol(expr, &end, 10);
  /* accept an ID only if every character was consumed and the value survives
     the narrowing to int */
  if(end != expr && *end == '\0' && val > 0 && (long)(int)val == val){
    id = (int)val;
    expr = NULL;
  }
  return procget(dbname, id, expr, attr);
}
390
391
392 /* parse arguments of the list command */
393 static int runlist(int argc, char **argv){
394 char *dbname;
395 int i, rv;
396 dbname = NULL;
397 for(i = 2; i < argc; i++){
398 if(!dbname && argv[i][0] == '-'){
399 usage();
400 } else if(!dbname){
401 dbname = argv[i];
402 } else {
403 usage();
404 }
405 }
406 if(!dbname) usage();
407 rv = proclist(dbname);
408 return rv;
409 }
410
411
412 /* parse arguments of the uriid command */
413 static int runuriid(int argc, char **argv){
414 char *dbname, *uri;
415 int i, rv;
416 dbname = NULL;
417 uri = NULL;
418 for(i = 2; i < argc; i++){
419 if(!dbname && argv[i][0] == '-'){
420 usage();
421 } else if(!dbname){
422 dbname = argv[i];
423 } else if(!uri){
424 uri = argv[i];
425 } else {
426 usage();
427 }
428 }
429 if(!dbname || !uri) usage();
430 rv = procuriid(dbname, uri);
431 return rv;
432 }
433
434
435 /* parse arguments of the meta command */
436 static int runmeta(int argc, char **argv){
437 char *dbname, *mname, *mvalue;
438 int i, del, rv;
439 dbname = NULL;
440 mname = NULL;
441 mvalue = NULL;
442 del = FALSE;
443 for(i = 2; i < argc; i++){
444 if(!dbname && argv[i][0] == '-'){
445 usage();
446 } else if(!dbname){
447 dbname = argv[i];
448 } else if(!mname){
449 mname = argv[i];
450 } else if(!mvalue){
451 mvalue = argv[i];
452 } else {
453 usage();
454 }
455 }
456 if(!dbname) usage();
457 rv = procmeta(dbname, mname, mvalue);
458 return rv;
459 }
460
461
462 /* parse arguments of the inform command */
463 static int runinform(int argc, char **argv){
464 char *dbname;
465 int i, rv;
466 dbname = NULL;
467 for(i = 2; i < argc; i++){
468 if(!dbname && argv[i][0] == '-'){
469 usage();
470 } else if(!dbname){
471 dbname = argv[i];
472 } else {
473 usage();
474 }
475 }
476 if(!dbname) usage();
477 rv = procinform(dbname);
478 return rv;
479 }
480
481
482 /* parse arguments of the optimize command */
483 static int runoptimize(int argc, char **argv){
484 char *dbname;
485 int i, rv;
486 dbname = NULL;
487 for(i = 2; i < argc; i++){
488 if(!dbname && argv[i][0] == '-'){
489 if(!strcmp(argv[i], "-onp")){
490 g_optopts |= ESTOPTNOPURGE;
491 } else if(!strcmp(argv[i], "-ond")){
492 g_optopts |= ESTOPTNODBOPT;
493 } else {
494 usage();
495 }
496 } else if(!dbname){
497 dbname = argv[i];
498 } else {
499 usage();
500 }
501 }
502 if(!dbname) usage();
503 rv = procoptimize(dbname);
504 return rv;
505 }
506
507
508 /* parse arguments of the search command */
509 static int runsearch(int argc, char **argv){
510 CBDATUM *pbuf;
511 CBLIST *attrs;
512 char *dbname, *ord, *phrase, *tmp;
513 int i, max, opts, sim, rv;
514 dbname = NULL;
515 ord = NULL;
516 max = SEARCHMAX;
517 opts = 0;
518 sim = -1;
519 pbuf = cbdatumopen("", 0);
520 cbglobalgc(pbuf, (void (*)(void *))cbdatumclose);
521 attrs = cblistopen();
522 cbglobalgc(attrs, (void (*)(void *))cblistclose);
523 for(i = 2; i < argc; i++){
524 if(!dbname && argv[i][0] == '-'){
525 if(!strcmp(argv[i], "-ic")){
526 if(++i >= argc) usage();
527 g_inputcode = argv[i];
528 } else if(!strcmp(argv[i], "-gs")){
529 opts |= ESTCONDSURE;
530 } else if(!strcmp(argv[i], "-gf")){
531 opts |= ESTCONDFAST;
532 } else if(!strcmp(argv[i], "-ga")){
533 opts |= ESTCONDAGIT;
534 } else if(!strcmp(argv[i], "-ni")){
535 opts |= ESTCONDNOIDF;
536 } else if(!strcmp(argv[i], "-sf")){
537 opts |= ESTCONDSIMPLE;
538 } else if(!strcmp(argv[i], "-hs")){
539 opts |= ESTCONDSCFB;
540 } else if(!strcmp(argv[i], "-vu")){
541 g_viewmode = VM_URI;
542 } else if(!strcmp(argv[i], "-va")){
543 g_viewmode = VM_ATTR;
544 } else if(!strcmp(argv[i], "-vf")){
545 g_viewmode = VM_FULL;
546 } else if(!strcmp(argv[i], "-vs")){
547 g_viewmode = VM_SNIP;
548 } else if(!strcmp(argv[i], "-vh")){
549 g_viewmode = VM_HMRD;
550 } else if(!strcmp(argv[i], "-vx")){
551 g_viewmode = VM_XML;
552 } else if(!strcmp(argv[i], "-dd")){
553 g_viewmode = VM_DUMP;
554 } else if(!strcmp(argv[i], "-attr")){
555 if(++i >= argc) usage();
556 cblistpush(attrs, argv[i], -1);
557 } else if(!strcmp(argv[i], "-ord")){
558 if(++i >= argc) usage();
559 ord = argv[i];
560 } else if(!strcmp(argv[i], "-max")){
561 if(++i >= argc) usage();
562 max = atoi(argv[i]);
563 } else if(!strcmp(argv[i], "-sim")){
564 if(++i >= argc) usage();
565 sim = atoi(argv[i]);
566 } else {
567 usage();
568 }
569 } else if(!dbname){
570 dbname = argv[i];
571 } else {
572 if(cbdatumsize(pbuf) > 0) cbdatumcat(pbuf, " ", 1);
573 cbdatumcat(pbuf, argv[i], -1);
574 }
575 }
576 if(!dbname) usage();
577 if(!(phrase = est_iconv(cbdatumptr(pbuf), -1, g_inputcode, "UTF-8", NULL, NULL))){
578 printferror("%s: unsupported encoding\n", g_inputcode);
579 return 1;
580 }
581 cbstrtrim(phrase);
582 for(i = 0; i < cblistnum(attrs); i++){
583 if((tmp = est_iconv(cblistval(attrs, i, NULL), -1, g_inputcode, "UTF-8", NULL, NULL)) != NULL){
584 cblistover(attrs, i, tmp, -1);
585 free(tmp);
586 }
587 }
588 rv = procsearch(dbname, phrase, attrs, ord, max, opts, sim);
589 free(phrase);
590 return rv;
591 }
592
593
594 /* parse arguments of the gather command */
595 static int rungather(int argc, char **argv){
596 CBLIST *list;
597 const char *elem;
598 char *dbname, *filename;
599 int i, j, rv;
600 g_pathattrs = cblistopen();
601 cbglobalgc(g_pathattrs, (void (*)(void *))cblistclose);
602 g_xcmdmap = cbmapopenex(MINIBNUM);
603 cbglobalgc(g_xcmdmap, (void (*)(void *))cbmapclose);
604 dbname = NULL;
605 filename = NULL;
606 g_inputcode = NULL;
607 for(i = 2; i < argc; i++){
608 if(!dbname && argv[i][0] == '-'){
609 if(!strcmp(argv[i], "-cl")){
610 g_putopts |= ESTPDCLEAN;
611 } else if(!strcmp(argv[i], "-fe")){
612 g_filefmt = FF_DRAFT;
613 } else if(!strcmp(argv[i], "-ft")){
614 g_filefmt = FF_TEXT;
615 } else if(!strcmp(argv[i], "-fh")){
616 g_filefmt = FF_HTML;
617 } else if(!strcmp(argv[i], "-fm")){
618 g_filefmt = FF_MIME;
619 } else if(!strcmp(argv[i], "-fx")){
620 if((i += 2) >= argc) usage();
621 list = cbsplit(argv[i-1], -1, ",");
622 for(j = 0; j < cblistnum(list); j++){
623 elem = cblistval(list, j, NULL);
624 if(elem[0] != '\0') cbmapput(g_xcmdmap, elem, -1, argv[i], -1, FALSE);
625 }
626 cblistclose(list);
627 } else if(!strcmp(argv[i], "-fz")){
628 g_filefmt = FF_NONE;
629 } else if(!strcmp(argv[i], "-fo")){
630 g_filtorig = TRUE;
631 } else if(!strcmp(argv[i], "-ic")){
632 if(++i >= argc) usage();
633 g_inputcode = argv[i];
634 } else if(!strcmp(argv[i], "-il")){
635 if(++i >= argc) usage();
636 g_inputlang = strtolang(argv[i]);
637 } else if(!strcmp(argv[i], "-pc")){
638 if(++i >= argc) usage();
639 g_pathcode = argv[i];
640 } else if(!strcmp(argv[i], "-pf")){
641 g_pathfull = TRUE;
642 } else if(!strcmp(argv[i], "-px")){
643 if(++i >= argc) usage();
644 cblistpush(g_pathattrs, argv[i], -1);
645 } else if(!strcmp(argv[i], "-apn")){
646 g_oextmodes |= ESTDBPERFNG;
647 } else if(!strcmp(argv[i], "-sd")){
648 g_stdate = TRUE;
649 } else if(!strcmp(argv[i], "-cm")){
650 g_chkmdate = TRUE;
651 } else if(!strcmp(argv[i], "-cs")){
652 if(++i >= argc) usage();
653 g_cachesize = strtod(argv[i], NULL) * 1024 * 1024;
654 } else {
655 usage();
656 }
657 } else if(!dbname){
658 dbname = argv[i];
659 } else if(!filename){
660 filename = argv[i];
661 } else {
662 usage();
663 }
664 }
665 if(!dbname || !filename) usage();
666 rv = procgather(dbname, filename);
667 return rv;
668 }
669
670
671 /* parse arguments of the purge command */
672 static int runpurge(int argc, char **argv){
673 char *dbname, *prefix;
674 int i, rv;
675 dbname = NULL;
676 prefix = NULL;
677 for(i = 2; i < argc; i++){
678 if(!dbname && argv[i][0] == '-'){
679 if(!strcmp(argv[i], "-cl")){
680 g_outopts |= ESTODCLEAN;
681 } else if(!strcmp(argv[i], "-fc")){
682 g_doforce = TRUE;
683 } else {
684 usage();
685 }
686 } else if(!dbname){
687 dbname = argv[i];
688 } else if(!prefix){
689 prefix = argv[i];
690 } else {
691 usage();
692 }
693 }
694 if(!dbname) usage();
695 rv = procpurge(dbname, prefix);
696 return rv;
697 }
698
699
700 /* parse arguments of the extkeys command */
701 static int runextkeys(int argc, char **argv){
702 char *dbname, *prefix;
703 int i, ni, rv;
704 dbname = NULL;
705 prefix = NULL;
706 ni = FALSE;
707 for(i = 2; i < argc; i++){
708 if(!dbname && argv[i][0] == '-'){
709 if(!strcmp(argv[i], "-fc")){
710 g_doforce = TRUE;
711 } else if(!strcmp(argv[i], "-ni")){
712 ni = TRUE;
713 } else if(!strcmp(argv[i], "-kn")){
714 if(++i >= argc) usage();
715 g_kwordnum = atoi(argv[i]);
716 } else {
717 usage();
718 }
719 } else if(!dbname){
720 dbname = argv[i];
721 } else if(!prefix){
722 prefix = argv[i];
723 } else {
724 usage();
725 }
726 }
727 if(!dbname || g_kwordnum < 1) usage();
728 rv = procextkeys(dbname, prefix, ni);
729 return rv;
730 }
731
732
733 /* parse arguments of the draft command */
734 static int rundraft(int argc, char **argv){
735 char *filename;
736 int i, rv;
737 filename = NULL;
738 g_filefmt = FF_DRAFT;
739 g_inputcode = NULL;
740 for(i = 2; i < argc; i++){
741 if(!filename && argv[i][0] == '-'){
742 if(!strcmp(argv[i], "-ft")){
743 g_filefmt = FF_TEXT;
744 } else if(!strcmp(argv[i], "-fh")){
745 g_filefmt = FF_HTML;
746 } else if(!strcmp(argv[i], "-fm")){
747 g_filefmt = FF_MIME;
748 } else if(!strcmp(argv[i], "-ic")){
749 if(++i >= argc) usage();
750 g_inputcode = argv[i];
751 } else if(!strcmp(argv[i], "-il")){
752 if(++i >= argc) usage();
753 g_inputlang = strtolang(argv[i]);
754 } else {
755 usage();
756 }
757 } else if(!filename){
758 filename = argv[i];
759 } else {
760 usage();
761 }
762 }
763 rv = procdraft(filename);
764 return rv;
765 }
766
767
768 /* parse arguments of the break command */
769 static int runbreak(int argc, char **argv){
770 char *filename;
771 int i, wt, rv;
772 filename = NULL;
773 wt = FALSE;
774 for(i = 2; i < argc; i++){
775 if(!filename && argv[i][0] == '-'){
776 if(!strcmp(argv[i], "-ic")){
777 if(++i >= argc) usage();
778 g_inputcode = argv[i];
779 } else if(!strcmp(argv[i], "-il")){
780 if(++i >= argc) usage();
781 g_inputlang = strtolang(argv[i]);
782 } else if(!strcmp(argv[i], "-apn")){
783 g_oextmodes |= ESTDBPERFNG;
784 } else if(!strcmp(argv[i], "-wt")){
785 wt = TRUE;
786 } else {
787 usage();
788 }
789 } else if(!filename){
790 filename = argv[i];
791 } else {
792 usage();
793 }
794 }
795 rv = procbreak(filename, wt);
796 return rv;
797 }
798
799
800 /* parse arguments of the randput command */
801 static int runrandput(int argc, char **argv){
802 char *dbname, *dnstr;
803 int i, dnum, rv;
804 dbname = NULL;
805 dnstr = NULL;
806 for(i = 2; i < argc; i++){
807 if(!dbname && argv[i][0] == '-'){
808 if(!strcmp(argv[i], "-ren")){
809 g_rdmode = RD_ENG;
810 } else if(!strcmp(argv[i], "-rla")){
811 g_rdmode = RD_LAT;
812 } else if(!strcmp(argv[i], "-reu")){
813 g_rdmode = RD_EURO;
814 } else if(!strcmp(argv[i], "-ror")){
815 g_rdmode = RD_ORI;
816 } else if(!strcmp(argv[i], "-rjp")){
817 g_rdmode = RD_JPN;
818 } else if(!strcmp(argv[i], "-rch")){
819 g_rdmode = RD_CHAO;
820 } else if(!strcmp(argv[i], "-cs")){
821 if(++i >= argc) usage();
822 g_cachesize = strtod(argv[i], NULL) * 1024 * 1024;
823 } else {
824 usage();
825 }
826 } else if(!dbname){
827 dbname = argv[i];
828 } else if(!dnstr){
829 dnstr = argv[i];
830 } else {
831 usage();
832 }
833 }
834 if(!dbname || !dnstr) usage();
835 if((dnum = atoi(dnstr)) < 1) usage();
836 rv = procrandput(dbname, dnum);
837 return rv;
838 }
839
840
/* parse arguments of the wicked command.
   Syntax: wicked db dnum.  No options are accepted.  `dnum' is read with
   atoi() and must be at least 1.  Any syntax error ends in usage().
   The return value is the result of procwicked(). */
static int runwicked(int argc, char **argv){
  char *dbname = NULL, *dnstr = NULL;
  int i, dnum;
  for(i = 2; i < argc; i++){
    if(!dbname){
      /* the database name must not look like an option */
      if(argv[i][0] == '-') usage();
      dbname = argv[i];
    } else if(!dnstr){
      dnstr = argv[i];
    } else {
      usage();
    }
  }
  if(!dbname || !dnstr) usage();
  dnum = atoi(dnstr);
  if(dnum < 1) usage();
  return procwicked(dbname, dnum);
}
863
864
865 /* parse arguments of the regression command */
866 static int runregression(int argc, char **argv){
867 char *dbname;
868 int i, rv;
869 dbname = NULL;
870 for(i = 2; i < argc; i++){
871 if(!dbname && argv[i][0] == '-'){
872 usage();
873 } else if(!dbname){
874 dbname = argv[i];
875 } else {
876 usage();
877 }
878 }
879 if(!dbname) usage();
880 rv = procregression(dbname);
881 return rv;
882 }
883
884
885 /* perform the put command */
886 static int procput(const char *dbname, const char *filename){
887 ESTDB *db;
888 ESTDOC *doc;
889 const char *uri;
890 char *draft;
891 int ecode;
892 if(!(draft = cbreadfile(filename, NULL))){
893 printferror("%s: could not open", filename ? filename : "(stdin)");
894 return 1;
895 }
896 if(!(db = est_db_open(dbname, ESTDBWRITER | ESTDBCREAT, &ecode))){
897 printferror("%s: %s", dbname, est_err_msg(ecode));
898 free(draft);
899 return 1;
900 }
901 est_db_set_informer(db, dbinform);
902 doc = est_doc_new_from_draft(draft);
903 if(!est_db_put_doc(db, doc, g_putopts)){
904 printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
905 est_doc_delete(doc);
906 est_db_close(db, &ecode);
907 free(draft);
908 return 1;
909 }
910 if(!(uri = est_doc_attr(doc, ESTDATTRURI))) uri = "";
911 printfinfo("%d (%s): registered", est_doc_id(doc), uri);
912 est_doc_delete(doc);
913 if(!est_db_close(db, &ecode)){
914 printferror("%s: %s", dbname, est_err_msg(ecode));
915 free(draft);
916 return 1;
917 }
918 free(draft);
919 return 0;
920 }
921
922
923 /* perform the out command */
924 static int procout(const char *dbname, int id, const char *expr){
925 ESTDB *db;
926 int ecode;
927 if(!(db = est_db_open(dbname, ESTDBWRITER, &ecode))){
928 printferror("%s: %s", dbname, est_err_msg(ecode));
929 return 1;
930 }
931 est_db_set_informer(db, dbinform);
932 if(expr && (id = est_db_uri_to_id(db, expr)) < 1){
933 printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
934 est_db_close(db, &ecode);
935 return 1;
936 }
937 if(!est_db_out_doc(db, id, g_outopts)){
938 printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
939 est_db_close(db, &ecode);
940 return 1;
941 }
942 printfinfo("%d: deleted", id);
943 if(!est_db_close(db, &ecode)){
944 printferror("%s: %s", dbname, est_err_msg(ecode));
945 return 1;
946 }
947 return 0;
948 }
949
950
951 /* perform the get command */
952 static int procget(const char *dbname, int id, const char *expr, const char *attr){
953 ESTDB *db;
954 ESTDOC *doc;
955 char *draft;
956 int ecode;
957 if(!(db = est_db_open(dbname, ESTDBREADER | ESTDBLCKNB, &ecode))){
958 printferror("%s: %s", dbname, est_err_msg(ecode));
959 return 1;
960 }
961 if(expr && (id = est_db_uri_to_id(db, expr)) < 1){
962 printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
963 est_db_close(db, &ecode);
964 return 1;
965 }
966 if(attr){
967 if(!(draft = est_db_get_doc_attr(db, id, attr))){
968 printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
969 est_db_close(db, &ecode);
970 return 1;
971 }
972 printf("%s\n", draft);
973 free(draft);
974 } else {
975 if(!(doc = est_db_get_doc(db, id, 0))){
976 printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
977 est_db_close(db, &ecode);
978 return 1;
979 }
980 draft = est_doc_dump_draft(doc);
981 printf("%s", draft);
982 free(draft);
983 est_doc_delete(doc);
984 }
985 if(!est_db_close(db, &ecode)){
986 printferror("%s: %s", dbname, est_err_msg(ecode));
987 return 1;
988 }
989 return 0;
990 }
991
992
993 /* perform the list command */
994 static int proclist(const char *dbname){
995 ESTDB *db;
996 ESTDOC *doc;
997 const char *vbuf;
998 int ecode, id;
999 if(!(db = est_db_open(dbname, ESTDBREADER | ESTDBLCKNB, &ecode))){
1000 printferror("%s: %s", dbname, est_err_msg(ecode));
1001 return 1;
1002 }
1003 if(!est_db_iter_init(db)){
1004 printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
1005 est_db_close(db, &ecode);
1006 return 1;
1007 }
1008 while((id = est_db_iter_next(db)) > 0){
1009 if((doc = est_db_get_doc(db, id, ESTGDNOTEXT)) != NULL){
1010 if(!(vbuf = est_doc_attr(doc, ESTDATTRURI))) vbuf = "";
1011 printf("%d\t%s\n", id, vbuf);
1012 est_doc_delete(doc);
1013 }
1014 }
1015 if(!est_db_close(db, &ecode)){
1016 printferror("%s: %s", dbname, est_err_msg(ecode));
1017 return 1;
1018 }
1019 return 0;
1020 }
1021
1022
1023 /* perform the uriid command */
1024 static int procuriid(const char *dbname, const char *uri){
1025 ESTDB *db;
1026 int ecode, id;
1027 if(!(db = est_db_open(dbname, ESTDBREADER | ESTDBLCKNB, &ecode))){
1028 printferror("%s: %s", dbname, est_err_msg(ecode));
1029 return 1;
1030 }
1031 if((id = est_db_uri_to_id(db, uri)) == -1){
1032 printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
1033 est_db_close(db, &ecode);
1034 return 1;
1035 }
1036 printf("%d\n", id);
1037 if(!est_db_close(db, &ecode)){
1038 printferror("%s: %s", dbname, est_err_msg(ecode));
1039 return 1;
1040 }
1041 return 0;
1042 }
1043
1044
1045 /* perform the meta command */
1046 static int procmeta(const char *dbname, const char *mname, const char *mvalue){
1047 ESTDB *db;
1048 CBLIST *names;
1049 char *vbuf;
1050 int i, ecode;
1051 if(!(db = est_db_open(dbname, mvalue ? (ESTDBWRITER | ESTDBCREAT) : (ESTDBREADER | ESTDBLCKNB),
1052 &ecode))){
1053 printferror("%s: %s", dbname, est_err_msg(ecode));
1054 return 1;
1055 }
1056 if(mname){
1057 if(mvalue){
1058 est_db_add_meta(db, mname, mvalue[0] != '\0' ? mvalue : NULL);
1059 } else {
1060 if((vbuf = est_db_meta(db, mname)) != NULL){
1061 printf("%s\n", vbuf);
1062 free(vbuf);
1063 }
1064 }
1065 } else {
1066 names = est_db_meta_names(db);
1067 for(i = 0; i < cblistnum(names); i++){
1068 printf("%s\n", cblistval(names, i, NULL));
1069 }
1070 cblistclose(names);
1071 }
1072 if(!est_db_close(db, &ecode)){
1073 printferror("%s: %s", dbname, est_err_msg(ecode));
1074 return 1;
1075 }
1076 return 0;
1077 }
1078
1079
1080 /* perform the inform command */
1081 static int procinform(const char *dbname){
1082 ESTDB *db;
1083 int ecode;
1084 if(!(db = est_db_open(dbname, ESTDBREADER | ESTDBLCKNB, &ecode))){
1085 printferror("%s: %s", dbname, est_err_msg(ecode));
1086 return 1;
1087 }
1088 printf("number of documents: %d\n", est_db_doc_num(db));
1089 printf("number of words: %d\n", est_db_word_num(db));
1090 printf("file size: %.0f\n", est_db_size(db));
1091 if(!est_db_close(db, &ecode)){
1092 printferror("%s: %s", dbname, est_err_msg(ecode));
1093 return 1;
1094 }
1095 return 0;
1096 }
1097
1098
1099 /* perform the optimize command */
1100 static int procoptimize(const char *dbname){
1101 ESTDB *db;
1102 char path[URIBUFSIZ];
1103 int ecode;
1104 time_t curtime;
1105 curtime = time(NULL);
1106 if(!(db = est_db_open(dbname, ESTDBWRITER, &ecode))){
1107 printferror("%s: %s", dbname, est_err_msg(ecode));
1108 return 1;
1109 }
1110 est_db_set_informer(db, dbinform);
1111 sprintf(path, "%s%c%s", dbname, ESTPATHCHR, KWDBNAME);
1112 unlink(path);
1113 if(!est_db_optimize(db, g_optopts)){
1114 printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
1115 est_db_close(db, &ecode);
1116 return 1;
1117 }
1118 if(!est_db_close(db, &ecode)){
1119 printferror("%s: %s", dbname, est_err_msg(ecode));
1120 return 1;
1121 }
1122 curtime = time(NULL) - curtime;
1123 printfinfo("finished successfully: elapsed time: %dh %dm %ds",
1124 (int)(curtime / 3600), (int)((curtime / 60) % 60), (int)(curtime % 60));
1125 return 0;
1126 }
1127
1128
1129 /* perform the search command */
1130 static int procsearch(const char *dbname, const char *phrase,
1131 const CBLIST *attrs, const char *ord, int max, int opts, int sim){
1132 ESTDB *db;
1133 ESTCOND *cond;
1134 ESTDOC *doc;
1135 CURIA *kwdb;
1136 CBDATUM *pbuf;
1137 CBMAP *svmap, *hints, *kwords;
1138 CBLIST *names, *words, *lines;
1139 const char *kbuf, *vbuf, *line;
1140 char *draft, path[URIBUFSIZ], numbuf[NUMBUFSIZ], *word, *pv;
1141 int i, j, ecode, ksiz, vsiz, *res, rnum, id, sc, fin, cnt;
1142 double curtime;
1143 if(!(db = est_db_open(dbname, ESTDBREADER | ESTDBLCKNB, &ecode))){
1144 printferror("%s: %s", dbname, est_err_msg(ecode));
1145 return 1;
1146 }
1147 sprintf(path, "%s%c%s", dbname, ESTPATHCHR, KWDBNAME);
1148 if((kwdb = cropen(path, CR_OREADER, -1, -1)) != NULL)
1149 est_db_set_vectorizer(db, vectorizer, kwdb);
1150 cond = est_cond_new();
1151 if(sim > 0){
1152 svmap = kwdb ? vectorizer(db, sim, kwdb) : NULL;
1153 if(!svmap && (doc = est_db_get_doc(db, sim, 0)) != NULL){
1154 svmap = est_db_etch_doc((opts & ESTCONDNOIDF) ? NULL : db, doc, KWORDNUM);
1155 est_doc_delete(doc);
1156 }
1157 if(svmap){
1158 pbuf = cbdatumopen(ESTOPSIMILAR, -1);
1159 cbmapiterinit(svmap);
1160 while((kbuf = cbmapiternext(svmap, &ksiz)) != NULL){
1161 vbuf = cbmapget(svmap, kbuf, ksiz, &vsiz);
1162 cbdatumcat(pbuf, " WITH ", -1);
1163 cbdatumcat(pbuf, vbuf, vsiz);
1164 cbdatumcat(pbuf, " ", 1);
1165 cbdatumcat(pbuf, kbuf, ksiz);
1166 }
1167 est_cond_set_phrase(cond, cbdatumptr(pbuf));
1168 cbdatumclose(pbuf);
1169 cbmapclose(svmap);
1170 }
1171 } else {
1172 while(*phrase > '\0' && *phrase <= ' '){
1173 phrase++;
1174 }
1175 if(phrase[0] != '\0' || cblistnum(attrs) < 1) est_cond_set_phrase(cond, phrase);
1176 }
1177 for(i = 0; i < cblistnum(attrs); i++){
1178 est_cond_add_attr(cond, cblistval(attrs, i, NULL));
1179 }
1180 if(ord) est_cond_set_order(cond, ord);
1181 if(max >= 0) est_cond_set_max(cond, max);
1182 est_cond_set_options(cond, opts);
1183 hints = cbmapopenex(MINIBNUM);
1184 curtime = est_gettimeofday();
1185 res = est_db_search(db, cond, &rnum, hints);
1186 curtime = est_gettimeofday() - curtime;
1187 if(g_viewmode == VM_XML){
1188 xmlprintf("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
1189 xmlprintf("<estresult version=\"%@\">\n", est_version);
1190 xmlprintf("<meta>\n");
1191 xmlprintf("<hit number=\"%@\"/>\n", cbmapget(hints, "", 0, NULL));
1192 cbmapiterinit(hints);
1193 while((kbuf = cbmapiternext(hints, NULL)) != NULL){
1194 if(kbuf[0] == '\0') continue;
1195 vbuf = cbmapget(hints, kbuf, -1, NULL);
1196 xmlprintf("<hit key=\"%@\" number=\"%@\"/>\n", kbuf, vbuf);
1197 }
1198 xmlprintf("<time time=\"%.3f\"/>\n", curtime / 1000.0);
1199 xmlprintf("<total documents=\"%d\" words=\"%d\"/>\n",
1200 est_db_doc_num(db), est_db_word_num(db));
1201 xmlprintf("</meta>\n");
1202 } else {
1203 printf("%s\n", est_border_str());
1204 printf("VERSION\t%s\n", _EST_PROTVER);
1205 printf("NODE\tlocal\n");
1206 printf("HIT\t%s\n", cbmapget(hints, "", 0, NULL));
1207 cbmapiterinit(hints);
1208 cnt = 1;
1209 while((kbuf = cbmapiternext(hints, NULL)) != NULL){
1210 if(kbuf[0] == '\0') continue;
1211 vbuf = cbmapget(hints, kbuf, -1, NULL);
1212 printf("HINT#%d\t%s\t%s\n", cnt, kbuf, vbuf);
1213 cnt++;
1214 }
1215 printf("TIME\t%.3f\n", curtime / 1000.0);
1216 printf("DOCNUM\t%d\n", est_db_doc_num(db));
1217 printf("WORDNUM\t%d\n", est_db_word_num(db));
1218 switch(g_viewmode){
1219 case VM_ID:
1220 printf("VIEW\tID\n");
1221 break;
1222 case VM_URI:
1223 printf("VIEW\tURI\n");
1224 break;
1225 case VM_ATTR:
1226 printf("VIEW\tATTRIBUTE\n");
1227 break;
1228 case VM_FULL:
1229 printf("VIEW\tFULL\n");
1230 break;
1231 case VM_SNIP:
1232 printf("VIEW\tSNIPPET\n");
1233 break;
1234 case VM_HMRD:
1235 printf("VIEW\tHUMAN\n");
1236 break;
1237 }
1238 printf("\n");
1239 if(g_viewmode == VM_ID || g_viewmode == VM_URI ||
1240 g_viewmode == VM_HMRD || g_viewmode == VM_DUMP) printf("%s\n", est_border_str());
1241 }
1242 for(i = 0; i < rnum ; i++){
1243 id = res[i];
1244 sc = est_cond_score(cond, i);
1245 switch(g_viewmode){
1246 case VM_URI:
1247 if((doc = est_db_get_doc(db, id, ESTGDNOTEXT)) != NULL){
1248 if(!(vbuf = est_doc_attr(doc, ESTDATTRURI))) vbuf = "";
1249 printf("%d\t%s\n", id, vbuf);
1250 est_doc_delete(doc);
1251 }
1252 break;
1253 case VM_ATTR:
1254 if((doc = est_db_get_doc(db, id, ESTGDNOTEXT)) != NULL){
1255 if(sc >= 0){
1256 sprintf(numbuf, "%d", sc);
1257 est_doc_add_attr(doc, DATTRSCORE, numbuf);
1258 }
1259 printf("%s\n", est_border_str());
1260 names = est_doc_attr_names(doc);
1261 for(j = 0; j < cblistnum(names); j++){
1262 kbuf = cblistval(names, j, NULL);
1263 vbuf = est_doc_attr(doc, kbuf);
1264 printf("%s=%s\n", kbuf, vbuf);
1265 }
1266 cblistclose(names);
1267 est_doc_delete(doc);
1268 }
1269 printf("\n");
1270 break;
1271 case VM_FULL:
1272 if((doc = est_db_get_doc(db, id, 0)) != NULL){
1273 if(sc >= 0){
1274 sprintf(numbuf, "%d", sc);
1275 est_doc_add_attr(doc, DATTRSCORE, numbuf);
1276 }
1277 printf("%s\n", est_border_str());
1278 draft = est_doc_dump_draft(doc);
1279 printf("%s", draft);
1280 free(draft);
1281 est_doc_delete(doc);
1282 }
1283 break;
1284 case VM_SNIP:
1285 if((doc = est_db_get_doc(db, id, 0)) != NULL){
1286 if(sc >= 0){
1287 sprintf(numbuf, "%d", sc);
1288 est_doc_add_attr(doc, DATTRSCORE, numbuf);
1289 }
1290 printf("%s\n", est_border_str());
1291 names = est_doc_attr_names(doc);
1292 for(j = 0; j < cblistnum(names); j++){
1293 kbuf = cblistval(names, j, NULL);
1294 vbuf = est_doc_attr(doc, kbuf);
1295 printf("%s=%s\n", kbuf, vbuf);
1296 }
1297 cblistclose(names);
1298 kwords = kwdb ? vectorizer(db, id, kwdb) : NULL;
1299 if(!kwords) kwords = est_db_etch_doc(db, doc, KWORDNUM);
1300 if(cbmaprnum(kwords) > 0){
1301 printf("%s=", DATTRKWORDS);
1302 cbmapiterinit(kwords);
1303 for(j = 0; (kbuf = cbmapiternext(kwords, NULL)) != NULL; j++){
1304 if(j > 0) printf(" ");
1305 printf("%s %s", kbuf, cbmapget(kwords, kbuf, -1, NULL));
1306 }
1307 printf("\n");
1308 }
1309 cbmapclose(kwords);
1310 printf("\n");
1311 words = cbmapkeys(hints);
1312 draft = est_doc_make_snippet(doc, words, SNIPWWIDTH, SNIPHWIDTH, SNIPAWIDTH);
1313 printf("%s", draft);
1314 free(draft);
1315 cblistclose(words);
1316 est_doc_delete(doc);
1317 }
1318 break;
1319 case VM_HMRD:
1320 if((doc = est_db_get_doc(db, id, 0)) != NULL){
1321 if(sc >= 0){
1322 sprintf(numbuf, "%d", sc);
1323 est_doc_add_attr(doc, DATTRSCORE, numbuf);
1324 }
1325 printf("\n");
1326 if((vbuf = est_doc_attr(doc, ESTDATTRURI)) != NULL) printf("URI: %s\n", vbuf);
1327 if((vbuf = est_doc_attr(doc, ESTDATTRTITLE)) != NULL) printf("Title: %s\n", vbuf);
1328 printf(" ");
1329 words = cbmapkeys(hints);
1330 draft = est_doc_make_snippet(doc, words, SNIPWWIDTH, SNIPHWIDTH, SNIPAWIDTH);
1331 lines = cbsplit(draft, -1, "\n");
1332 fin = TRUE;
1333 for(j = 0; j < cblistnum(lines); j++){
1334 line = cblistval(lines, j, NULL);
1335 if(line[0] != '\0'){
1336 word = cbmemdup(line, -1);
1337 if((pv = strchr(word, '\t')) != NULL) *pv = '\0';
1338 printf("%s", word);
1339 free(word);
1340 fin = TRUE;
1341 } else if(fin){
1342 printf(" ... ");
1343 fin = FALSE;
1344 }
1345 }
1346 cblistclose(lines);
1347 free(draft);
1348 cblistclose(words);
1349 printf("\n\n");
1350 est_doc_delete(doc);
1351 }
1352 break;
1353 case VM_XML:
1354 if((doc = est_db_get_doc(db, id, 0)) != NULL){
1355 if(sc >= 0){
1356 sprintf(numbuf, "%d", sc);
1357 est_doc_add_attr(doc, DATTRSCORE, numbuf);
1358 }
1359 if(!(vbuf = est_doc_attr(doc, ESTDATTRURI))) vbuf = "";
1360 xmlprintf("<document id=\"%d\" uri=\"%@\">\n", id, vbuf);
1361 names = est_doc_attr_names(doc);
1362 for(j = 0; j < cblistnum(names); j++){
1363 kbuf = cblistval(names, j, NULL);
1364 if(!strcmp(kbuf, ESTDATTRID) || !strcmp(kbuf, ESTDATTRURI)) continue;
1365 vbuf = est_doc_attr(doc, kbuf);
1366 xmlprintf("<attribute name=\"%@\" value=\"%@\"/>\n", kbuf, vbuf);
1367 }
1368 cblistclose(names);
1369 kwords = kwdb ? vectorizer(db, id, kwdb) : NULL;
1370 if(!kwords) kwords = est_db_etch_doc(db, doc, KWORDNUM);
1371 if(cbmaprnum(kwords) > 0){
1372 xmlprintf("<vector>");
1373 cbmapiterinit(kwords);
1374 for(j = 0; (kbuf = cbmapiternext(kwords, NULL)) != NULL; j++){
1375 xmlprintf("<element key=\"%@\" number=\"%@\"/>",
1376 kbuf, cbmapget(kwords, kbuf, -1, NULL));
1377 }
1378 xmlprintf("</vector>\n");
1379 }
1380 cbmapclose(kwords);
1381 words = cbmapkeys(hints);
1382 draft = est_doc_make_snippet(doc, words, SNIPWWIDTH, SNIPHWIDTH, SNIPAWIDTH);
1383 lines = cbsplit(draft, -1, "\n");
1384 fin = TRUE;
1385 xmlprintf("<snippet>");
1386 for(j = 0; j < cblistnum(lines); j++){
1387 line = cblistval(lines, j, NULL);
1388 if(line[0] != '\0'){
1389 word = cbmemdup(line, -1);
1390 if((pv = strchr(word, '\t')) != NULL){
1391 *pv = '\0';
1392 pv++;
1393 xmlprintf("<key normal=\"%@\">%@</key>", pv, word);
1394 } else {
1395 xmlprintf("%@", word);
1396 }
1397 free(word);
1398 fin = TRUE;
1399 } else if(fin){
1400 xmlprintf("<delimiter/>");
1401 fin = FALSE;
1402 }
1403 }
1404 xmlprintf("</snippet>\n");
1405 cblistclose(lines);
1406 free(draft);
1407 cblistclose(words);
1408 xmlprintf("</document>\n");
1409 est_doc_delete(doc);
1410 }
1411 break;
1412 case VM_DUMP:
1413 if((doc = est_db_get_doc(db, id, 0)) != NULL){
1414 if(sc >= 0){
1415 sprintf(numbuf, "%d", sc);
1416 est_doc_add_attr(doc, DATTRSCORE, numbuf);
1417 }
1418 if(!(vbuf = est_doc_attr(doc, ESTDATTRURI))) vbuf = "";
1419 sprintf(path, "%08d%cest", id, ESTEXTCHR);
1420 printf("%s\t%s\n", path, vbuf);
1421 draft = est_doc_dump_draft(doc);
1422 if(!(cbwritefile(path, draft, -1))) printferror("%s: could not open", path);
1423 free(draft);
1424 est_doc_delete(doc);
1425 }
1426 break;
1427 default:
1428 printf("%d\n", id);
1429 break;
1430 }
1431 }
1432 if(g_viewmode == VM_XML){
1433 xmlprintf("</estresult>\n");
1434 } else {
1435 printf("%s:END\n", est_border_str());
1436 }
1437 free(res);
1438 cbmapclose(hints);
1439 est_cond_delete(cond);
1440 if(kwdb) crclose(kwdb);
1441 if(!est_db_close(db, &ecode)){
1442 printferror("%s: %s", dbname, est_err_msg(ecode));
1443 return 1;
1444 }
1445 return 0;
1446 }
1447
1448
1449 /* perform the gather command */
1450 static int procgather(const char *dbname, const char *filename){
1451 ESTDB *db;
1452 CBLIST *list, *clist, *attrs;
1453 FILE *ifp;
1454 const char *tmp;
1455 char *line, *path;
1456 int i, err, ecode;
1457 time_t curtime;
1458 struct stat sbuf;
1459 curtime = time(NULL);
1460 err = FALSE;
1461 if(stat(filename, &sbuf) != -1 && S_ISDIR(sbuf.st_mode)){
1462 printfinfo("reading list from the directory: %s", filename);
1463 if((db = est_db_open(dbname, ESTDBWRITER | ESTDBCREAT | g_oextmodes, &ecode)) != NULL){
1464 est_db_set_informer(db, dbinform);
1465 if(g_cachesize > 0){
1466 if(g_cachesize > CACHEMAX) g_cachesize = CACHEMAX;
1467 est_db_set_cache_size(db, g_cachesize, -1, -1);
1468 }
1469 list = cblistopen();
1470 cblistunshift(list, filename, -1);
1471 while((line = cblistshift(list, NULL)) != NULL){
1472 if(stat(line, &sbuf) != -1 && S_ISDIR(sbuf.st_mode) && (clist = cbdirlist(line)) != NULL){
1473 cblistsort(clist);
1474 for(i = cblistnum(clist) - 1; i >= 0; i--){
1475 tmp = cblistval(clist, i, NULL);
1476 if(!strcmp(tmp, ESTCDIRSTR) || !strcmp(tmp, ESTPDIRSTR)) continue;
1477 path = cbsprintf("%s%c%s", line, ESTPATHCHR, tmp);
1478 cblistunshift(list, path, -1);
1479 free(path);
1480 }
1481 cblistclose(clist);
1482 } else {
1483 if(!doputdoc(db, line, NULL)){
1484 printferror("%s: %s", line, est_err_msg(est_db_error(db)));
1485 err = TRUE;
1486 }
1487 }
1488 free(line);
1489 if(err || g_sigterm) break;
1490 }
1491 cblistclose(list);
1492 if(!est_db_close(db, &ecode)){
1493 printferror("%s: %s", dbname, est_err_msg(ecode));
1494 err = TRUE;
1495 }
1496 } else {
1497 printferror("%s: %s", dbname, est_err_msg(ecode));
1498 err = TRUE;
1499 }
1500 } else {
1501 if(!strcmp(filename, "-")){
1502 ifp = stdin;
1503 printfinfo("reading list from the standard input", filename);
1504 } else if((ifp = fopen(filename, "rb")) != NULL){
1505 printfinfo("reading list from the file: %s", filename);
1506 } else {
1507 printferror("%s: could not open", filename);
1508 return 1;
1509 }
1510 if((db = est_db_open(dbname, ESTDBWRITER | ESTDBCREAT | g_oextmodes, &ecode)) != NULL){
1511 est_db_set_informer(db, dbinform);
1512 if(g_cachesize > 0){
1513 if(g_cachesize > CACHEMAX) g_cachesize = CACHEMAX;
1514 est_db_set_cache_size(db, g_cachesize, -1, -1);
1515 }
1516 while((line = fgetl(ifp)) != NULL){
1517 if(line[0] == '\0'){
1518 free(line);
1519 continue;
1520 }
1521 if(cblistnum(g_pathattrs) > 0){
1522 attrs = cbsplit(line, -1, "\t");
1523 path = cblistshift(attrs, NULL);
1524 if(!doputdoc(db, path, attrs)){
1525 printferror("%s: %s", path, est_err_msg(est_db_error(db)));
1526 err = TRUE;
1527 }
1528 free(path);
1529 cblistclose(attrs);
1530 } else {
1531 if(!doputdoc(db, line, NULL)){
1532 printferror("%s: %s", line, est_err_msg(est_db_error(db)));
1533 err = TRUE;
1534 }
1535 }
1536 free(line);
1537 if(err || g_sigterm) break;
1538 }
1539 if(!est_db_close(db, &ecode)){
1540 printferror("%s: %s", dbname, est_err_msg(ecode));
1541 err = TRUE;
1542 }
1543 } else {
1544 printferror("%s: %s", dbname, est_err_msg(ecode));
1545 err = TRUE;
1546 }
1547 if(ifp != stdin) fclose(ifp);
1548 }
1549 curtime = time(NULL) - curtime;
1550 if(!err) printfinfo("finished successfully: elapsed time: %dh %dm %ds",
1551 (int)(curtime / 3600), (int)((curtime / 60) % 60), (int)(curtime % 60));
1552 return err ? 1 : 0;
1553 }
1554
1555
1556 /* perform the purge command */
1557 static int procpurge(const char *dbname, const char *prefix){
1558 ESTDB *db;
1559 ESTCOND *cond;
1560 ESTDOC *doc;
1561 const char *luri;
1562 char *attr, *path;
1563 int i, ecode, err, *res, rnum;
1564 time_t curtime;
1565 struct stat sbuf;
1566 curtime = time(NULL);
1567 if(!(db = est_db_open(dbname, ESTDBWRITER, &ecode))){
1568 printferror("%s: %s", dbname, est_err_msg(ecode));
1569 return 1;
1570 }
1571 est_db_set_informer(db, dbinform);
1572 cond = est_cond_new();
1573 attr = cbsprintf("%s STRBW %s", DATTRLPATH, prefix ? prefix : "");
1574 est_cond_add_attr(cond, attr);
1575 res = est_db_search(db, cond, &rnum, NULL);
1576 err = FALSE;
1577 for(i = 0; i < rnum; i++){
1578 if(!(doc = est_db_get_doc(db, res[i], ESTGDNOTEXT))) continue;
1579 if((luri = est_doc_attr(doc, DATTRLPATH)) != NULL){
1580 if(g_doforce){
1581 if(est_db_out_doc(db, res[i], g_outopts)){
1582 printfinfo("%d (%s): deleted", res[i], luri);
1583 } else {
1584 printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
1585 err = TRUE;
1586 }
1587 } else if((path = urltopath(luri)) != NULL){
1588 if(stat(path, &sbuf) != -1){
1589 printfinfo("%s: passed", luri);
1590 } else {
1591 if(est_db_out_doc(db, res[i], g_outopts)){
1592 printfinfo("%d (%s): deleted", res[i], luri);
1593 } else {
1594 printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
1595 err = TRUE;
1596 }
1597 }
1598 } else {
1599 printfinfo("%s: ignored", luri);
1600 }
1601 } else {
1602 printfinfo("(%d): ignored", res[i]);
1603 }
1604 est_doc_delete(doc);
1605 if(err || g_sigterm) break;
1606 }
1607 free(res);
1608 est_cond_delete(cond);
1609 free(attr);
1610 if(!est_db_close(db, &ecode)){
1611 printferror("%s: %s", dbname, est_err_msg(ecode));
1612 return 1;
1613 }
1614 curtime = time(NULL) - curtime;
1615 if(!err) printfinfo("finished successfully: elapsed time: %dh %dm %ds",
1616 (int)(curtime / 3600), (int)((curtime / 60) % 60), (int)(curtime % 60));
1617 return err ? 1 : 0;
1618 }
1619
1620
1621 /* perform the extkeys command */
1622 static int procextkeys(const char *dbname, const char *prefix, int ni){
1623 ESTDB *db;
1624 ESTCOND *cond;
1625 ESTDOC *doc;
1626 CURIA *kwdb;
1627 CBMAP *kwords;
1628 const char *uri;
1629 char path[URIBUFSIZ], *attr, *mbuf;
1630 int i, ecode, err, *res, rnum, msiz;
1631 time_t curtime;
1632 curtime = time(NULL);
1633 if(!(db = est_db_open(dbname, ESTDBWRITER, &ecode))){
1634 printferror("%s: %s", dbname, est_err_msg(ecode));
1635 return 1;
1636 }
1637 est_db_set_informer(db, dbinform);
1638 if(!ni && (!prefix || prefix[0] == '\0')) est_db_fill_key_cache(db);
1639 sprintf(path, "%s%c%s", dbname, ESTPATHCHR, KWDBNAME);
1640 if(!(kwdb = cropen(path, CR_OWRITER | CR_OCREAT, KWDBBNUM, KWDBDNUM))){
1641 printferror("%s: the keyword database has some errors", dbname);
1642 est_db_close(db, &ecode);
1643 return 1;
1644 }
1645 crsetalign(kwdb, -4);
1646 cond = est_cond_new();
1647 attr = cbsprintf("%s STRBW %s", DATTRLPATH, prefix ? prefix : "");
1648 est_cond_add_attr(cond, attr);
1649 res = est_db_search(db, cond, &rnum, NULL);
1650 err = FALSE;
1651 for(i = 0; i < rnum; i++){
1652 if(!g_doforce && crvsiz(kwdb, (char *)&(res[i]), sizeof(int)) > 0){
1653 printfinfo("%d: passed", res[i]);
1654 continue;
1655 }
1656 if(!(doc = est_db_get_doc(db, res[i], 0))) continue;
1657 if(!(uri = est_doc_attr(doc, ESTDATTRURI))) uri = "";
1658 kwords = est_db_etch_doc(ni ? NULL : db, doc, g_kwordnum);
1659 mbuf = cbmapdump(kwords, &msiz);
1660 fflush(stdout);
1661 if(crput(kwdb, (char *)&(res[i]), sizeof(int), mbuf, msiz, CR_DOVER)){
1662 printfinfo("%d (%s): extracted", res[i], uri);
1663 } else {
1664 printferror("%s: the keyword database has some errors", dbname);
1665 err = TRUE;
1666 }
1667 free(mbuf);
1668 cbmapclose(kwords);
1669 est_doc_delete(doc);
1670 if(err || g_sigterm) break;
1671 }
1672 free(res);
1673 est_cond_delete(cond);
1674 free(attr);
1675 if(!crclose(kwdb)){
1676 printferror("%s: the keyword database has some errors", dbname);
1677 err = TRUE;
1678 }
1679 if(!est_db_close(db, &ecode)){
1680 printferror("%s: %s", dbname, est_err_msg(ecode));
1681 return 1;
1682 }
1683 curtime = time(NULL) - curtime;
1684 if(!err) printfinfo("finished successfully: elapsed time: %dh %dm %ds",
1685 (int)(curtime / 3600), (int)((curtime / 60) % 60), (int)(curtime % 60));
1686 return err ? 1 : 0;
1687 }
1688
1689
1690 /* perform the draft command */
1691 static int procdraft(const char *filename){
1692 ESTDOC *doc;
1693 char *buf, *draft;
1694 int size;
1695 if(!(buf = cbreadfile(filename, &size))){
1696 printferror("%s: could not open", filename ? filename : "(stdin)");
1697 return 1;
1698 }
1699 switch(g_filefmt){
1700 case FF_TEXT:
1701 doc = est_doc_new_from_text(buf, size, g_inputcode, g_inputlang);
1702 break;
1703 case FF_HTML:
1704 doc = est_doc_new_from_html(buf, size, g_inputcode, g_inputlang);
1705 break;
1706 case FF_MIME:
1707 doc = est_doc_new_from_mime(buf, size, g_inputcode, g_inputlang);
1708 break;
1709 default:
1710 doc = est_doc_new_from_draft_enc(buf, size, g_inputcode);
1711 break;
1712 }
1713 draft = est_doc_dump_draft(doc);
1714 printf("%s", draft);
1715 free(draft);
1716 est_doc_delete(doc);
1717 free(buf);
1718 return 0;
1719 }
1720
1721
1722 /* perform the break command */
1723 static int procbreak(const char *filename, int wt){
1724 CBLIST *words;
1725 char *str, *phrase;
1726 int i;
1727 if(filename && filename[0] == '@'){
1728 str = cbmemdup(filename + 1, -1);
1729 } else if(!(str = cbreadfile(filename, NULL))){
1730 printferror("%s: could not open", filename ? filename : "(stdin)");
1731 return 1;
1732 }
1733 if(!(phrase = est_iconv(str, -1, g_inputcode, "UTF-8", NULL, NULL))){
1734 printferror("%s: unsupported encoding\n", g_inputcode);
1735 free(str);
1736 return 1;
1737 }
1738 g_inputcode = NULL;
1739 words = cblistopen();
1740 if(g_oextmodes & ESTDBPERFNG){
1741 est_break_text_perfng(phrase, words, TRUE, wt);
1742 } else {
1743 est_break_text(phrase, words, TRUE, wt);
1744 }
1745 for(i = 0; i < cblistnum(words); i++){
1746 printf("%s\n", cblistval(words, i, NULL));
1747 }
1748 cblistclose(words);
1749 free(phrase);
1750 free(str);
1751 return 0;
1752 }
1753
1754
1755 /* perform the randput command */
1756 static int procrandput(const char *dbname, int dnum){
1757 ESTDB *db;
1758 ESTDOC *doc;
1759 const char *mode;
1760 char uri[URIBUFSIZ];
1761 int i, ecode, err;
1762 time_t curtime;
1763 curtime = time(NULL);
1764 if(!(db = est_db_open(dbname, ESTDBWRITER | ESTDBCREAT | ESTDBTRUNC, &ecode))){
1765 printferror("%s: %s", dbname, est_err_msg(ecode));
1766 return 1;
1767 }
1768 est_db_set_informer(db, dbinform);
1769 if(g_cachesize > 0){
1770 if(g_cachesize > CACHEMAX) g_cachesize = CACHEMAX;
1771 est_db_set_cache_size(db, g_cachesize, -1, -1);
1772 }
1773 err = FALSE;
1774 for(i = 0; i < dnum; i++){
1775 doc = est_doc_new_from_chaos(RDOCCNUM, RDOCSNUM, g_rdmode);
1776 sprintf(uri, "file:///tmp/randput-%08d-%05d.est", i + 1, getpid());
1777 est_doc_add_attr(doc, ESTDATTRURI, uri);
1778 if(est_db_put_doc(db, doc, 0)){
1779 if(!(mode = est_doc_attr(doc, "mode"))) mode = "unknown";
1780 printfinfo("%d (%s) (%s): registered", est_doc_id(doc), uri, mode);
1781 } else {
1782 printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
1783 err = TRUE;
1784 }
1785 est_doc_delete(doc);
1786 if(err || g_sigterm) break;
1787 }
1788 if(!est_db_close(db, &ecode)){
1789 printferror("%s: %s", dbname, est_err_msg(ecode));
1790 return 1;
1791 }
1792 curtime = time(NULL) - curtime;
1793 if(!err) printfinfo("finished successfully: elapsed time: %dh %dm %ds",
1794 (int)(curtime / 3600), (int)((curtime / 60) % 60), (int)(curtime % 60));
1795 return err ? 1 : 0;
1796 }
1797
1798
1799 /* perform the wicked command */
1800 static int procwicked(const char *dbname, int dnum){
1801 ESTDB *db;
1802 ESTDOC *doc;
1803 ESTCOND *cond;
1804 CBLIST *words;
1805 char uri[URIBUFSIZ], *oper, *value, *first, *second, *phrase;
1806 int i, j, ecode, err, *res, rnum;
1807 double rnd;
1808 time_t curtime;
1809 curtime = time(NULL);
1810 if(!(db = est_db_open(dbname, ESTDBWRITER | ESTDBCREAT | ESTDBTRUNC, &ecode))){
1811 printferror("%s: %s", dbname, est_err_msg(ecode));
1812 return 1;
1813 }
1814 est_db_set_informer(db, dbinform);
1815 est_db_set_cache_size(db, 1024 * 1024 * 128, 1024, 256);
1816 est_db_set_special_cache(db, ESTDATTRURI, 128);
1817 err = FALSE;
1818 for(i = 0; i < dnum; i++){
1819 rnd = est_random();
1820 if((int)(rnd * INT_MAX) % dnum < 5){
1821 rnd = est_random();
1822 if(rnd < 0.3){
1823 if(!est_db_close(db, &ecode)){
1824 printferror("%s: %s", dbname, est_err_msg(ecode));
1825 return 1;
1826 }
1827 if(!(db = est_db_open(dbname, ESTDBWRITER, &ecode))){
1828 printferror("%s: %s", dbname, est_err_msg(ecode));
1829 return 1;
1830 }
1831 est_db_set_informer(db, dbinform);
1832 est_db_set_cache_size(db, 1024 * 1024 * 128, 1024, 256);
1833 est_db_set_special_cache(db, ESTDATTRURI, i / 10 + 1);
1834 } else if(rnd < 0.5){
1835 if(!est_db_optimize(db, (int)(est_random() * INT_MAX) % 2 == 0) ? ESTOPTNOPURGE : 0)
1836 err = TRUE;
1837 } else if(rnd < 0.8){
1838 if(!est_db_flush(db, 1024)) err = TRUE;
1839 } else {
1840 if(!est_db_sync(db)) err = TRUE;
1841 }
1842 } else if(rnd < 0.05){
1843 if(est_db_out_doc(db, (int)(est_random() * INT_MAX) % (i + 1) + 1,
1844 ((int)(est_random() * INT_MAX) % 2 == 0) ? ESTODCLEAN : 0)){
1845 printfinfo("[%d:%d]: out", i + 1, est_db_doc_num(db));
1846 } else if(est_db_error(db) != ESTENOITEM){
1847 err = TRUE;
1848 }
1849 } else if(rnd < 0.1){
1850 if((value = est_db_get_doc_attr(db, (int)(est_random() * INT_MAX) % (i + 1) + 1,
1851 ESTDATTRURI)) != NULL){
1852 printfinfo("[%d:%d]: attr: %s", i + 1, est_db_doc_num(db), value);
1853 free(value);
1854 }
1855 } else if(rnd < 0.25){
1856 rnd = est_random();
1857 if(rnd < 0.5){
1858 oper = " OR ";
1859 } else if(rnd < 0.7){
1860 oper = " AND ";
1861 } else if(rnd < 0.8){
1862 oper = " NOTAND ";
1863 } else if(rnd < 0.9){
1864 oper = " ";
1865 } else {
1866 oper = "";
1867 }
1868 first = est_random_str(5, (int)(est_random() * INT_MAX) % RD_RAND);
1869 second = est_random_str(2, (int)(est_random() * INT_MAX) % RD_RAND);
1870 phrase = cbsprintf("%s%s%s", first, oper, second);
1871 cond = est_cond_new();
1872 est_cond_set_phrase(cond, phrase);
1873 if(est_random() < 0.25) est_cond_add_attr(cond, "@uri STREW 0.est");
1874 if(est_random() < 0.25) est_cond_set_order(cond, "@uri STRD");
1875 if(est_random() < 0.05) est_cond_set_options(cond, ESTCONDSURE | ESTCONDSCFB);
1876 if(est_random() < 0.05) est_cond_set_options(cond, ESTCONDAGIT | ESTCONDNOIDF);
1877 res = est_db_search(db, cond, &rnum, NULL);
1878 printfinfo("[%d:%d]: search: %d hits", i + 1, est_db_doc_num(db), rnum);
1879 if(est_random() < 0.05){
1880 for(j = 0; j < rnum && j < 100; j++){
1881 if((doc = est_db_get_doc(db, res[j], 0)) != NULL){
1882 if(i % 10 == 0){
1883 free(est_doc_cat_texts(doc));
1884 free(est_doc_dump_draft(doc));
1885 words = cblistopen();
1886 cblistpush(words, "vw", -1);
1887 cblistpush(words, "xy", -1);
1888 cblistpush(words, "z", -1);
1889 free(est_doc_make_snippet(doc, words, 100, 10, 10));
1890 cblistclose(words);
1891 }
1892 est_doc_delete(doc);
1893 } else if(est_db_error(db) != ESTENOITEM){
1894 err = TRUE;
1895 }
1896 }
1897 }
1898 free(res);
1899 est_cond_delete(cond);
1900 free(phrase);
1901 free(first);
1902 free(second);
1903 } else {
1904 doc = est_doc_new_from_chaos(100, 3, est_random() < 0.5 ? RD_EURO : RD_RAND);
1905 if(est_random() < 0.2){
1906 sprintf(uri, "file:///tmp/wicked-%08d-%05d.est",
1907 (int)(est_random() * INT_MAX) % (i + 1) + 1, getpid());
1908 } else {
1909 sprintf(uri, "file:///tmp/wicked-%08d-%05d.est", i + 1, getpid());
1910 }
1911 est_doc_add_attr(doc, ESTDATTRURI, uri);
1912 if(!est_db_put_doc(db, doc, est_random() < 0.5 ? ESTPDCLEAN : 0)) err = TRUE;
1913 est_doc_delete(doc);
1914 }
1915 if(err || g_sigterm) break;
1916 }
1917 if(err) printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
1918 if(!est_db_close(db, &ecode)){
1919 printferror("%s: %s", dbname, est_err_msg(ecode));
1920 return 1;
1921 }
1922 curtime = time(NULL) - curtime;
1923 if(!err) printfinfo("finished successfully: elapsed time: %dh %dm %ds",
1924 (int)(curtime / 3600), (int)((curtime / 60) % 60), (int)(curtime % 60));
1925 return err ? 1 : 0;
1926 }
1927
1928
1929 /* perform the regression command */
1930 static int procregression(const char *dbname){
1931 ESTDB *db;
1932 ESTDOC *doc;
1933 ESTCOND *cond;
1934 int i, ecode, err, *res, rnum;
1935 time_t curtime;
1936 curtime = time(NULL);
1937 printfinfo("# opening the database");
1938 if(!(db = est_db_open(dbname, ESTDBWRITER | ESTDBCREAT | ESTDBTRUNC, &ecode))){
1939 printferror("%s: %s", dbname, est_err_msg(ecode));
1940 return 1;
1941 }
1942 est_db_set_informer(db, dbinform);
1943 err = FALSE;
1944 if(!err){
1945 printfinfo("# checking registration of small documents");
1946 doc = est_doc_new();
1947 est_doc_add_attr(doc, ESTDATTRURI, "file:///small/one");
1948 est_doc_add_text(doc, "One!");
1949 est_doc_add_hidden_text(doc, "(Check it out, come on!)");
1950 if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE;
1951 est_doc_delete(doc);
1952 doc = est_doc_new();
1953 est_doc_add_attr(doc, ESTDATTRURI, "file:///small/two");
1954 est_doc_add_text(doc, "Two!!");
1955 est_doc_add_hidden_text(doc, "(Check it out, come on!)");
1956 if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE;
1957 est_doc_delete(doc);
1958 doc = est_doc_new();
1959 est_doc_add_attr(doc, ESTDATTRURI, "file:///small/three");
1960 est_doc_add_text(doc, "Three!!!");
1961 est_doc_add_hidden_text(doc, "(Check it out, come on!)");
1962 if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE;
1963 est_doc_delete(doc);
1964 doc = est_doc_new();
1965 est_doc_add_attr(doc, ESTDATTRURI, "file:///empty");
1966 if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE;
1967 est_doc_delete(doc);
1968 }
1969 if(!err){
1970 printfinfo("# checking registration of an english document");
1971 doc = est_doc_new();
1972 est_doc_add_attr(doc, ESTDATTRURI, "file:///english");
1973 est_doc_add_attr(doc, ESTDATTRTITLE, "Hyper Estraier");
1974 est_doc_add_text(doc, "% This is a displayed sentence. ;-)");
1975 est_doc_add_text(doc, "Hyper Estraier is a full-text search system for communities.");
1976 est_doc_add_text(doc, "A little suffering is good for the soul.");
1977 est_doc_add_text(doc, "They have been at a great feast of languages, and stolen the scraps.");
1978 est_doc_add_hidden_text(doc, "(Give it up, Yo! Give it up, Yo!)");
1979 est_doc_add_hidden_text(doc, "% This is a hidden sentence. :-<");
1980 est_doc_add_hidden_text(doc, "(Check it out, come on!)");
1981 est_doc_add_hidden_text(doc, "");
1982 if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE;
1983 est_doc_delete(doc);
1984 }
1985 if(!err){
1986 printfinfo("# checking registration of a japanese document");
1987 doc = est_doc_new();
1988 est_doc_add_attr(doc, ESTDATTRURI, "file:///japanese");
1989 est_doc_add_attr(doc, ESTDATTRTITLE, "\xe5\xb9\xb3\xe6\x9e\x97\xe5\xb9\xb9\xe9\x9b\x84");
1990 est_doc_add_text(doc, "\xe6\x9c\xac\xe6\x97\xa5\xe3\x81\xaf\xe6\x99\xb4\xe5\xa4\xa9\xe3"
1991 "\x81\xaa\xe3\x82\x8a\xe3\x80\x82");
1992 est_doc_add_text(doc, "\xe6\x9c\x95\xe3\x81\xaf\xe5\x9b\xbd\xe5\xae\xb6\xe7\xac\xac\xe4"
1993 "\xb8\x80\xe3\x81\xae\xe4\xb8\x8b\xe5\x83\x95\xe3\x81\xa7\xe3\x81"
1994 "\x82\xe3\x82\x8b\xe3\x80\x82");
1995 est_doc_add_hidden_text(doc, "(Check it out, come on!)");
1996 if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE;
1997 est_doc_delete(doc);
1998 }
1999 if(!err){
2000 printfinfo("# checking duplication of documents");
2001 doc = est_doc_new();
2002 est_doc_add_attr(doc, ESTDATTRURI, "file:///duplication");
2003 est_doc_add_text(doc, "Gamble, you gatta chance to make a Rumble!");
2004 est_doc_add_hidden_text(doc, "(Check it out, come on!)");
2005 if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE;
2006 est_doc_delete(doc);
2007 doc = est_doc_new();
2008 est_doc_add_attr(doc, ESTDATTRURI, "file:///duplication");
2009 est_doc_add_text(doc, "bring back hey, one more time!");
2010 est_doc_add_hidden_text(doc, "(Check it out, come on!)");
2011 if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE;
2012 est_doc_delete(doc);
2013 if(est_db_doc_num(db) != 7){
2014 printferror("%s: the number of documents is invalid", dbname);
2015 err = TRUE;
2016 }
2017 }
2018 if(!err){
2019 printfinfo("# checking search for unfixed documents");
2020 cond = est_cond_new();
2021 est_cond_set_phrase(cond, "check");
2022 res = est_db_search(db, cond, &rnum, NULL);
2023 if(rnum != 6){
2024 printferror("%s: the number of result is invalid", dbname);
2025 err = TRUE;
2026 }
2027 free(res);
2028 est_cond_delete(cond);
2029 }
2030 if(!err){
2031 printfinfo("# checking partial flushing of the index");
2032 if(!est_db_flush(db, 32)) err = TRUE;
2033 }
2034 if(!err){
2035 printfinfo("# checking deletion with cleaning of a document");
2036 if(!est_db_out_doc(db, 1, ESTODCLEAN)) err = TRUE;
2037 }
2038 if(!err){
2039 printfinfo("# checking synchronization");
2040 if(!est_db_sync(db)) err = TRUE;
2041 }
2042 if(!err){
2043 printfinfo("# checking deletion without cleaning of a document");
2044 if(!est_db_out_doc(db, 2, 0)) err = TRUE;
2045 }
2046 if(!err){
2047 printfinfo("# checking word search");
2048 cond = est_cond_new();
2049 est_cond_set_phrase(cond, "check it AND on");
2050 res = est_db_search(db, cond, &rnum, NULL);
2051 if(rnum != 5){
2052 printferror("%s: the number of result is invalid", dbname);
2053 err = TRUE;
2054 }
2055 free(res);
2056 est_cond_set_phrase(cond, "RUMBLE OR \xe3\x80\x82");
2057 res = est_db_search(db, cond, &rnum, NULL);
2058 if(rnum != 1){
2059 printferror("%s: the number of result is invalid", dbname);
2060 err = TRUE;
2061 }
2062 free(res);
2063 est_cond_delete(cond);
2064 }
2065 if(!err){
2066 printfinfo("# checking attribute search");
2067 cond = est_cond_new();
2068 est_cond_add_attr(cond, "@uri !ISTRINC SMaLl");
2069 res = est_db_search(db, cond, &rnum, NULL);
2070 if(rnum != est_db_doc_num(db) - 1){
2071 printferror("%s: the number of result is invalid", dbname);
2072 err = TRUE;
2073 }
2074 free(res);
2075 est_cond_delete(cond);
2076 cond = est_cond_new();
2077 est_cond_add_attr(cond, "@uri STRBW file://");
2078 est_cond_add_attr(cond, "@title STRINC \xe5\xb9\xb3");
2079 res = est_db_search(db, cond, &rnum, NULL);
2080 if(rnum != 1){
2081 printferror("%s: the number of result is invalid", dbname);
2082 err = TRUE;
2083 }
2084 free(res);
2085 est_cond_delete(cond);
2086 }
2087 if(!err){
2088 printfinfo("# checking combined search");
2089 cond = est_cond_new();
2090 est_cond_set_phrase(cond, "\xe5\x9b\xbd\xe5\xae\xb6\xe7\xac\xac\xe4\xb8\x80");
2091 est_cond_add_attr(cond, "@uri");
2092 est_cond_set_order(cond, "@title");
2093 res = est_db_search(db, cond, &rnum, NULL);
2094 if(rnum != 1){
2095 printferror("%s: the number of result is invalid", dbname);
2096 err = TRUE;
2097 }
2098 free(res);
2099 est_cond_delete(cond);
2100 cond = est_cond_new();
2101 est_cond_set_phrase(cond, "one | \xe3\x80\x82 | check & check it ! hogehoge");
2102 est_cond_add_attr(cond, "@uri STRBW file://");
2103 est_cond_set_order(cond, "@title STRD");
2104 est_cond_set_options(cond, ESTCONDSURE | ESTCONDNOIDF | ESTCONDSIMPLE);
2105 res = est_db_search(db, cond, &rnum, NULL);
2106 if(rnum != 4){
2107 printferror("%s: the number of result is invalid", dbname);
2108 err = TRUE;
2109 }
2110 free(res);
2111 est_cond_delete(cond);
2112 }
2113 if(!err){
2114 printfinfo("# checking optimization");
2115 if(!est_db_optimize(db, 0)) err = TRUE;
2116 cond = est_cond_new();
2117 est_cond_set_phrase(cond, "check");
2118 res = est_db_search(db, cond, &rnum, NULL);
2119 if(rnum != 4){
2120 printferror("%s: the number of result is invalid", dbname);
2121 err = TRUE;
2122 }
2123 free(res);
2124 est_cond_delete(cond);
2125 }
2126 if(!err){
2127 printfinfo("# checking traversal access");
2128 cond = est_cond_new();
2129 est_cond_set_phrase(cond, "[UVSET]");
2130 res = est_db_search(db, cond, &rnum, NULL);
2131 for(i = 0; i < rnum; i++){
2132 if(!(doc = est_db_get_doc(db, res[i], 0))){
2133 printferror("%s: a document cannot be retrieved", dbname);
2134 err = TRUE;
2135 break;
2136 }
2137 est_doc_delete(doc);
2138 }
2139 free(res);
2140 est_cond_delete(cond);
2141 }
2142 if(err) printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
2143 printfinfo("# closing the database");
2144 if(!est_db_close(db, &ecode)){
2145 printferror("%s: %s", dbname, est_err_msg(ecode));
2146 return 1;
2147 }
2148 curtime = time(NULL) - curtime;
2149 if(!err) printfinfo("# finished successfully: elapsed time: %dh %dm %ds",
2150 (int)(curtime / 3600), (int)((curtime / 60) % 60), (int)(curtime % 60));
2151 return err ? 1 : 0;
2152 }
2153
2154
/* Print a formatted string to the standard output, with XML and URL escaping extensions.
   `format' is a printf-like format string.  The conversions `s', `d', `o', `u', `x', `X',
   `c', `e', `E', `f', `g', `G' and `%' are passed to `printf' together with any of the
   characters "0123456789 .+-" written between `%' and the conversion character.  Two extra
   conversions take a `char *' argument:
     `@' prints the string with `&', `<', `>' and `"' replaced by entity references and with
         the bytes 0x01-0x08 and 0x0e-0x1f dropped,
     `?' prints the string URL-encoded; only ASCII alphanumerics and "_-." stay as they are.
   A NULL string argument is printed as "(null)".  An unknown conversion prints nothing and
   consumes no argument.  A specification cut short by the end of `format' is ignored. */
static void xmlprintf(const char *format, ...){
  va_list ap;
  const char *str;
  char spec[32];
  unsigned char c;
  int slen;
  va_start(ap, format);
  for(; *format != '\0'; format++){
    if(*format != '%'){
      putchar(*format);
      continue;
    }
    /* rebuild the specification; two bytes are reserved for the conversion character and
       the terminator so that a long run of flags cannot overrun `spec' */
    spec[0] = '%';
    slen = 1;
    format++;
    while(*format != '\0' && strchr("0123456789 .+-", *format) &&
          slen < (int)sizeof(spec) - 2){
      spec[slen++] = *format;
      format++;
    }
    /* the format ended inside a specification: stop here, because the loop increment
       would otherwise step past the terminator */
    if(*format == '\0') break;
    spec[slen++] = *format;
    spec[slen] = '\0';
    switch(*format){
    case 's':
      str = va_arg(ap, char *);
      if(!str) str = "(null)";
      printf(spec, str);
      break;
    case 'd':
      printf(spec, va_arg(ap, int));
      break;
    case 'o': case 'u': case 'x': case 'X': case 'c':
      printf(spec, va_arg(ap, unsigned int));
      break;
    case 'e': case 'E': case 'f': case 'g': case 'G':
      printf(spec, va_arg(ap, double));
      break;
    case '@':
      str = va_arg(ap, char *);
      if(!str) str = "(null)";
      for(; *str != '\0'; str++){
        c = *(const unsigned char *)str;
        switch(c){
        case '&': printf("&amp;"); break;
        case '<': printf("&lt;"); break;
        case '>': printf("&gt;"); break;
        case '"': printf("&quot;"); break;
        default:
          /* tab, line feed, vertical tab, form feed and carriage return are kept */
          if(!(c <= 0x08 || (c >= 0x0e && c <= 0x1f))) putchar(c);
          break;
        }
      }
      break;
    case '?':
      str = va_arg(ap, char *);
      if(!str) str = "(null)";
      for(; *str != '\0'; str++){
        c = *(const unsigned char *)str;
        if((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') ||
           (c >= '0' && c <= '9') || strchr("_-.", c)){
          putchar(c);
        } else {
          printf("%%%02X", c);
        }
      }
      break;
    case '%':
      putchar('%');
      break;
    default:
      break;
    }
  }
  va_end(ap);
}
2229
2230
2231 /* get the language value */
2232 static int strtolang(const char *str){
2233 if(!cbstricmp(str, "en")) return ESTLANGEN;
2234 if(!cbstricmp(str, "ja")) return ESTLANGJA;
2235 if(!cbstricmp(str, "zh")) return ESTLANGZH;
2236 if(!cbstricmp(str, "ko")) return ESTLANGKO;
2237 return ESTLANGMISC;
2238 }
2239
2240
/* Read one line from a stream.
   `ifp' is the input stream.
   Returns a string obtained from `cbrealloc' that holds the next line without its line feed
   and with every carriage return removed, or NULL if nothing could be read because the
   stream was at its end.  Reading also stops at a NUL byte.  The caller owns the returned
   region. */
static char *fgetl(FILE *ifp){
  char *buf;
  int c, len, blen;
  buf = NULL;
  len = 0;
  blen = 0;
  while((c = fgetc(ifp)) != EOF){
    /* allocate on the first character and double only when the buffer is full, instead of
       reallocating on every character read */
    if(!buf || len >= blen){
      blen = buf ? blen * 2 : 1024;
      buf = cbrealloc(buf, blen + 1);
    }
    if(c == '\n' || c == '\0') break;
    if(c != '\r') buf[len++] = c;
  }
  if(!buf) return NULL;
  buf[len] = '\0';
  return buf;
}
2259
2260
2261 /* register a document */
2262 static int doputdoc(ESTDB *db, const char *path, const CBLIST *attrs){
2263 ESTDOC *doc, *edoc;
2264 const char *uri, *vbuf, *xcmd;
2265 char *dbuf, *tbuf;
2266 int i, err, fmt, id, dsiz;
2267 time_t emdate, fmdate;
2268 struct stat sbuf;
2269 xcmd = NULL;
2270 if(cbmaprnum(g_xcmdmap) > 0){
2271 cbmapiterinit(g_xcmdmap);
2272 while((vbuf = cbmapiternext(g_xcmdmap, NULL)) != NULL){
2273 if(cbstrbwimatch(path, vbuf)){
2274 xcmd = cbmapget(g_xcmdmap, vbuf, -1, NULL);
2275 break;
2276 }
2277 }
2278 }
2279 fmt = g_filefmt;
2280 if(g_filefmt == FF_NONE && !xcmd) return TRUE;
2281 if(g_filefmt == FF_AUTO){
2282 if(cbstrbwimatch(path, ESTEXTSTR "est")){
2283 fmt = FF_DRAFT;
2284 } else if(cbstrbwimatch(path, ESTEXTSTR "txt") || cbstrbwimatch(path, ESTEXTSTR "text") ||
2285 cbstrbwimatch(path, ESTEXTSTR "asc")){
2286 fmt = FF_TEXT;
2287 } else if(cbstrbwimatch(path, ESTEXTSTR "html") || cbstrbwimatch(path, ESTEXTSTR "htm") ||
2288 cbstrbwimatch(path, ESTEXTSTR "xhtml") || cbstrbwimatch(path, ESTEXTSTR "xht")){
2289 fmt = FF_HTML;
2290 } else if(cbstrbwimatch(path, ESTEXTSTR "eml") || cbstrbwimatch(path, ESTEXTSTR "mime") ||
2291 cbstrbwimatch(path, ESTEXTSTR "mht") || cbstrbwimatch(path, ESTEXTSTR "mhtml")){
2292 fmt = FF_MIME;
2293 } else if(!xcmd){
2294 return TRUE;
2295 }
2296 }
2297 if(stat(path, &sbuf) == -1 || !S_ISREG(sbuf.st_mode) || !(uri = pathtourl(path))){
2298 printferror("%s: could not open", path);
2299 return TRUE;
2300 }
2301 emdate = -1;
2302 if(g_chkmdate && (id = est_db_uri_to_id(db, uri)) > 0 &&
2303 (edoc = est_db_get_doc(db, id, ESTGDNOTEXT)) != NULL){
2304 if((vbuf = est_doc_attr(edoc, ESTDATTRMDATE)) != NULL) emdate = cbstrmktime(vbuf);
2305 est_doc_delete(edoc);
2306 }
2307 if(g_stdate && emdate >= 0 && emdate >= sbuf.st_mtime){
2308 printfinfo("%s: passed", path);
2309 return TRUE;
2310 }
2311 if(g_filtorig){
2312 dbuf = cbmemdup("", 0);
2313 dsiz = 0;
2314 } else {
2315 if(!(dbuf = cbreadfile(path, &dsiz))){
2316 printferror("%s: could not open", path);
2317 return TRUE;
2318 }
2319 }
2320 if(xcmd){
2321 doc = est_doc_new_with_xcmd(dbuf, dsiz, path, xcmd, est_db_name(db),
2322 g_inputcode, g_inputlang);
2323 } else {
2324 switch(fmt){
2325 case FF_TEXT:
2326 doc = est_doc_new_from_text(dbuf, dsiz, g_inputcode, g_inputlang);
2327 break;
2328 case FF_HTML:
2329 doc = est_doc_new_from_html(dbuf, dsiz, g_inputcode, g_inputlang);
2330 break;
2331 case FF_MIME:
2332 doc = est_doc_new_from_mime(dbuf, dsiz, g_inputcode, g_inputlang);
2333 break;
2334 default:
2335 doc = est_doc_new_from_draft_enc(dbuf, dsiz, g_inputcode);
2336 break;
2337 }
2338 }
2339 if(attrs){
2340 for(i = 0; i < cblistnum(g_pathattrs) && i < cblistnum(attrs); i++){
2341 est_doc_add_attr(doc, cblistval(g_pathattrs, i, NULL), cblistval(attrs, i, NULL));
2342 }
2343 }
2344 if(!est_doc_attr(doc, ESTDATTRURI)) est_doc_add_attr(doc, ESTDATTRURI, uri);
2345 est_doc_add_attr(doc, DATTRLPATH, uri);
2346 est_doc_add_attr(doc, DATTRLFILE, urltofile(uri));
2347 uri = est_doc_attr(doc, ESTDATTRURI);
2348 if(g_stdate){
2349 tbuf = cbdatestrwww(sbuf.st_ctime, 0);
2350 est_doc_add_attr(doc, ESTDATTRCDATE, tbuf);
2351 free(tbuf);
2352 tbuf = cbdatestrwww(sbuf.st_mtime, 0);
2353 est_doc_add_attr(doc, ESTDATTRMDATE, tbuf);
2354 free(tbuf);
2355 }
2356 if(g_chkmdate && emdate == -1 && (id = est_db_uri_to_id(db, uri)) > 0 &&
2357 (edoc = est_db_get_doc(db, id, ESTGDNOTEXT)) != NULL){
2358 if((vbuf = est_doc_attr(edoc, ESTDATTRMDATE)) != NULL) emdate = cbstrmktime(vbuf);
2359 est_doc_delete(edoc);
2360 }
2361 fmdate = -1;
2362 if(g_chkmdate && (vbuf = est_doc_attr(doc, ESTDATTRMDATE)) != NULL) fmdate = cbstrmktime(vbuf);
2363 err = FALSE;
2364 if(emdate >= 0 && emdate >= fmdate){
2365 printfinfo("%s: passed", path);
2366 } else if(est_db_put_doc(db, doc, g_putopts)){
2367 printfinfo("%d (%s): registered", est_doc_id(doc), uri);
2368 } else {
2369 printferror("%s: %s", est_db_name(db), est_err_msg(est_db_error(db)));
2370 err = TRUE;
2371 }
2372 est_doc_delete(doc);
2373 free(dbuf);
2374 return err ? FALSE : TRUE;
2375 }
2376
2377
2378 /* get the URL of a path */
2379 static const char *pathtourl(const char *path){
2380 static char pbuf[URIBUFSIZ];
2381 const char *elem;
2382 char *wp, *ebuf;
2383 CBLIST *list;
2384 int i, esiz;
2385 if(strlen(path) >= URIBUFSIZ / 4) return NULL;
2386 if(g_pathcode){
2387 wp = est_realpath(path);
2388 if(!(ebuf = est_iconv(wp, -1, g_pathcode, "UTF-8", &esiz, NULL))){
2389 esiz = strlen(wp);
2390 ebuf = cbmemdup(wp, esiz);
2391 }
2392 list = cbsplit(ebuf, esiz, ESTPATHSTR);
2393 free(ebuf);
2394 free(wp);
2395 for(i = 0; i < cblistnum(list); i++){
2396 elem = cblistval(list, i, &esiz);
2397 if((ebuf = est_iconv(elem, esiz, "UTF-8", g_pathcode, &esiz, NULL)) != NULL){
2398 cblistover(list, i, ebuf, esiz);
2399 free(ebuf);
2400 }
2401 }
2402 } else {
2403 wp = est_realpath(path);
2404 list = cbsplit(wp, -1, ESTPATHSTR);
2405 free(wp);
2406 }
2407 wp = pbuf;
2408 wp += sprintf(wp, "file://");
2409 for(i = 0; i < cblistnum(list); i++){
2410 elem = cblistval(list, i, NULL);
2411 if(elem[0] == '\0') continue;
2412 if(i < 1 && ((elem[0] >= 'A' && elem[0] <= 'Z') || (elem[0] >= 'a' && elem[0] <= 'z')) &&
2413 elem[1] == ':'){
2414 wp += sprintf(wp, "%c|", elem[0]);
2415 continue;
2416 }
2417 ebuf = cburlencode(elem, -1);
2418 wp += sprintf(wp, "/%s", ebuf);
2419 free(ebuf);
2420 }
2421 *wp = '\0';
2422 cblistclose(list);
2423 return pbuf;
2424 }
2425
2426
2427 /* get the file name of a URL */
2428 static const char *urltofile(const char *uri){
2429 static char pbuf[URIBUFSIZ];
2430 const char *rp;
2431 char *dbuf, *ebuf;
2432 int dsiz;
2433 if(g_pathfull){
2434 if((rp = strstr(uri, "//")) != NULL){
2435 rp += 2;
2436 if(((rp[0] >= 'A' && rp[0] <= 'Z') || (rp[0] >= 'a' && rp[0] <= 'z')) &&
2437 rp[1] == '|' && rp[2] == '/') rp += 2;
2438 } else {
2439 rp = uri;
2440 }
2441 } else if((rp = strrchr(uri, '/')) != NULL){
2442 rp++;
2443 } else {
2444 rp = uri;
2445 }
2446 dbuf = cburldecode(rp, &dsiz);
2447 if((ebuf = est_iconv(dbuf, dsiz, g_pathcode ? g_pathcode : "ISO-8859-1", "UTF-8", NULL, NULL))
2448 != NULL){
2449 sprintf(pbuf, "%s", ebuf);
2450 free(ebuf);
2451 } else {
2452 sprintf(pbuf, "%s", rp);
2453 }
2454 free(dbuf);
2455 return pbuf;
2456 }
2457
2458
2459 /* geth the local path of a URL */
2460 static char *urltopath(const char *uri){
2461 static char pbuf[URIBUFSIZ];
2462 const char *elem;
2463 char *wp, *dbuf;
2464 CBLIST *list;
2465 int i;
2466 if(!cbstrfwimatch(uri, "file://")) return NULL;
2467 if(!(uri = strchr(uri + 7, '/'))) return NULL;
2468 list = cbsplit(uri, -1, "/");
2469 wp = pbuf;
2470 for(i = 0; i < cblistnum(list); i++){
2471 elem = cblistval(list, i, NULL);
2472 if(elem[0] == '\0') continue;
2473 if(i < 1 && ((elem[0] >= 'A' && elem[0] <= 'Z') || (elem[0] >= 'a' && elem[0] <= 'z')) &&
2474 elem[1] == '|'){
2475 wp += sprintf(wp, "%c:", elem[0]);
2476 continue;
2477 }
2478 dbuf = cburldecode(elem, NULL);
2479 wp += sprintf(wp, "%c%s", ESTPATHCHR, dbuf);
2480 free(dbuf);
2481 }
2482 *wp = '\0';
2483 cblistclose(list);
2484 return pbuf;
2485 }
2486
2487
2488 /* create a vector of keywords */
2489 static CBMAP *vectorizer(void *db, int id, void *kwdb){
2490 CBMAP *kwords;
2491 char *mbuf;
2492 int msiz;
2493 if(!(mbuf = crget((CURIA *)kwdb, (char *)&id, sizeof(int), 0, -1, &msiz))) return NULL;
2494 kwords = cbmapload(mbuf, msiz);
2495 free(mbuf);
2496 return kwords;
2497 }
2498
2499
2500 /* create a document object with an outer command */
2501 static ESTDOC *est_doc_new_with_xcmd(const char *buf, int size, const char *path,
2502 const char *xcmd, const char *tmpdir,
2503 const char *penc, int plang){
2504 ESTDOC *doc;
2505 const char *pv, *ext;
2506 char iname[URIBUFSIZ], oname[URIBUFSIZ], ebuf[URIBUFSIZ], cmd[URIBUFSIZ];
2507 char *rbuf, numbuf[NUMBUFSIZ];
2508 int fmt, rsiz;
2509 assert(buf && size >= 0 && path && xcmd && tmpdir);
2510 sprintf(ebuf, "ESTORIGFILE=%s", path);
2511 ext = NULL;
2512 if((pv = strrchr(path, ESTPATHCHR)) != NULL) path = pv;
2513 if((pv = strrchr(path, ESTEXTCHR)) != NULL) ext = pv;
2514 if(!ext) ext = "";
2515 sprintf(iname, "%s%cxcmd-in-%08d%s", tmpdir, ESTPATHCHR, getpid(), ext);
2516 sprintf(oname, "%s%cxcmd-out-%08d%cest", tmpdir, ESTPATHCHR, getpid(), ESTEXTCHR);
2517 fmt = FF_DRAFT;
2518 if(cbstrfwmatch(xcmd, "T@")){
2519 fmt = FF_TEXT;
2520 xcmd += 2;
2521 } else if(cbstrfwmatch(xcmd, "H@")){
2522 fmt = FF_HTML;
2523 xcmd += 2;
2524 } else if(cbstrfwmatch(xcmd, "M@")){
2525 fmt = FF_MIME;
2526 xcmd += 2;
2527 }
2528 sprintf(cmd, "%s %s %s", xcmd, iname, oname);
2529 if(!g_filtorig) cbwritefile(iname, buf, size);
2530 putenv(ebuf);
2531 system(cmd);
2532 if((rbuf = cbreadfile(oname, &rsiz)) != NULL){
2533 switch(fmt){
2534 case FF_TEXT:
2535 doc = est_doc_new_from_text(rbuf, rsiz, penc, plang);
2536 break;
2537 case FF_HTML:
2538 doc = est_doc_new_from_html(rbuf, rsiz, penc, plang);
2539 break;
2540 case FF_MIME:
2541 doc = est_doc_new_from_mime(rbuf, rsiz, penc, plang);
2542 break;
2543 default:
2544 doc = est_doc_new_from_draft_enc(rbuf, rsiz, penc);
2545 break;
2546 }
2547 free(rbuf);
2548 } else {
2549 doc = est_doc_new();
2550 }
2551 if(fmt != FF_DRAFT){
2552 sprintf(numbuf, "%d", size);
2553 est_doc_add_attr(doc, ESTDATTRSIZE, numbuf);
2554 est_doc_add_attr(doc, ESTDATTRTYPE, est_ext_type(ext));
2555 }
2556 unlink(oname);
2557 unlink(iname);
2558 return doc;
2559 }
2560
2561
2562 /* create a document object from draft data in another encoding */
2563 static ESTDOC *est_doc_new_from_draft_enc(const char *buf, int size, const char *enc){
2564 ESTDOC *doc;
2565 char *rbuf;
2566 assert(buf);
2567 if(enc && (rbuf = est_iconv(buf, size, enc, "UTF-8", NULL, NULL)) != NULL){
2568 doc = est_doc_new_from_draft(rbuf);
2569 free(rbuf);
2570 } else {
2571 doc = est_doc_new_from_draft(buf);
2572 }
2573 return doc;
2574 }
2575
2576
2577 /* create a document object from plain text */
2578 static ESTDOC *est_doc_new_from_text(const char *buf, int size, const char *penc, int plang){
2579 ESTDOC *doc;
2580 CBLIST *lines;
2581 CBDATUM *datum;
2582 const char *enc, *text, *line;
2583 char *nbuf, numbuf[NUMBUFSIZ];
2584 int i;
2585 assert(buf);
2586 doc = est_doc_new();
2587 enc = penc ? penc : est_enc_name(buf, size, plang);
2588 if(!strcmp(enc, "UTF-8")){
2589 nbuf = NULL;
2590 text = buf;
2591 } else {
2592 text = buf;
2593 nbuf = est_iconv(buf, size, enc, "UTF-8", NULL, NULL);
2594 if(nbuf) text = nbuf;
2595 }
2596 lines = cbsplit(text, -1, "\n");
2597 datum = cbdatumopen("", 0);
2598 for(i = 0; i < CB_LISTNUM(lines); i++){
2599 line = CB_LISTVAL(lines, i, NULL);
2600 while(*line == ' ' || *line == '\t' || *line == '\r'){
2601 line++;
2602 }
2603 if(line[0] == '\0'){
2604 est_doc_add_text(doc, CB_DATUMPTR(datum));
2605 cbdatumsetsize(datum, 0);
2606 } else {
2607 cbdatumcat(datum, " ", 1);
2608 cbdatumcat(datum, line, -1);
2609 }
2610 }
2611 est_doc_add_text(doc, CB_DATUMPTR(datum));
2612 cbdatumclose(datum);
2613 cblistclose(lines);
2614 est_doc_add_attr(doc, ESTDATTRTYPE, "text/plain");
2615 sprintf(numbuf, "%d", size);
2616 est_doc_add_attr(doc, ESTDATTRSIZE, numbuf);
2617 if(nbuf) free(nbuf);
2618 return doc;
2619 }
2620
2621
2622 /* create a document object from HTML */
2623 static ESTDOC *est_doc_new_from_html(const char *buf, int size, const char *penc, int plang){
2624 ESTDOC *doc;
2625 CBLIST *elems;
2626 CBMAP *attrs;
2627 CBDATUM *datum;
2628 const char *enc, *html, *elem, *next, *name, *content;
2629 char *nbuf, *nenc, *rbuf, *lbuf, numbuf[NUMBUFSIZ];
2630 int i, esiz;
2631 assert(buf);
2632 doc = est_doc_new();
2633 enc = est_enc_name(buf, size, plang);
2634 html = NULL;
2635 nbuf = NULL;
2636 if(!strcmp(enc, "UTF-16") || !strcmp(enc, "UTF-16BE") || !strcmp(enc, "UTF-16LE")){
2637 nbuf = est_iconv(buf, size, enc, "UTF-8", NULL, NULL);
2638 } else if(!strcmp(enc, "US-ASCII")){
2639 nbuf = NULL;
2640 } else {
2641 if((nenc = penc ? cbmemdup(penc, -1) : est_html_enc(buf)) != NULL){
2642 if(cbstricmp(nenc, "UTF-8")){
2643 nbuf = est_iconv(buf, size, nenc, "UTF-8", NULL, NULL);
2644 if(!nbuf) nbuf = est_iconv(buf, size, enc, "UTF-8", NULL, NULL);
2645 }
2646 free(nenc);
2647 } else {
2648 nbuf = est_iconv(buf, size, enc, "UTF-8", NULL, NULL);
2649 }
2650 }
2651 if(nbuf) html = nbuf;
2652 if(!html) html = buf;
2653 datum = cbdatumopen("", 0);
2654 elems = cbxmlbreak(html, TRUE);
2655 for(i = 0; i < CB_LISTNUM(elems); i++){
2656 elem = CB_LISTVAL2(elems, i, &esiz);
2657 if(!(next = cblistval(elems, i + 1, NULL))) next = "";
2658 if(elem[0] == '<'){
2659 if(cbstrfwimatch(elem, "<meta")){
2660 attrs = cbxmlattrs(elem);
2661 name = cbmapget(attrs, "name", -1, NULL);
2662 if(!name) name = cbmapget(attrs, "Name", -1, NULL);
2663 if(!name) name = cbmapget(attrs, "NAME", -1, NULL);
2664 if(!name) name = cbmapget(attrs, "http-equiv", -1, NULL);
2665 if(!name) name = cbmapget(attrs, "Http-equiv", -1, NULL);
2666 if(!name) name = cbmapget(attrs, "Http-Equiv", -1, NULL);
2667 if(!name) name = cbmapget(attrs, "HTTP-EQUIV", -1, NULL);
2668 content = cbmapget(attrs, "content", -1, NULL);
2669 if(!content) content = cbmapget(attrs, "Content", -1, NULL);
2670 if(!content) content = cbmapget(attrs, "CONTENT", -1, NULL);
2671 if(name && content){
2672 lbuf = cbmemdup(name, -1);
2673 cbstrtolower(lbuf);
2674 cbstrsqzspc(lbuf);
2675 if(!strcmp(lbuf, "author")){
2676 if(strchr(content, '&')){
2677 rbuf = est_html_raw_text(content);
2678 est_doc_add_attr(doc, ESTDATTRAUTHOR, rbuf);
2679 free(rbuf);
2680 } else {
2681 est_doc_add_attr(doc, ESTDATTRAUTHOR, content);
2682 }
2683 }
2684 if(name[0] != '@'){
2685 if(strchr(content, '&')){
2686 rbuf = est_html_raw_text(content);
2687 est_doc_add_attr(doc, lbuf, rbuf);
2688 free(rbuf);
2689 } else {
2690 est_doc_add_attr(doc, lbuf, content);
2691 }
2692 }
2693 free(lbuf);
2694 }
2695 cbmapclose(attrs);
2696 } else if(cbstrfwimatch(elem, "<title") && next[0] != '\0' && next[0] != '<'){
2697 if(strchr(next, '&')){
2698 rbuf = est_html_raw_text(next);
2699 est_doc_add_attr(doc, ESTDATTRTITLE, rbuf);
2700 est_doc_add_hidden_text(doc, rbuf);
2701 free(rbuf);
2702 } else {
2703 est_doc_add_attr(doc, ESTDATTRTITLE, next);
2704 est_doc_add_hidden_text(doc, next);
2705 }
2706 i++;
2707 } else if(cbstrfwimatch(elem, "<style") || cbstrfwimatch(elem, "<script")){
2708 i++;
2709 } else if(cbstrfwimatch(elem, "<h1") || cbstrfwimatch(elem, "<h2") ||
2710 cbstrfwimatch(elem, "<h3") || cbstrfwimatch(elem, "<h4") ||
2711 cbstrfwimatch(elem, "<h5") || cbstrfwimatch(elem, "<h6") ||
2712 cbstrfwimatch(elem, "<p>") || cbstrfwimatch(elem, "<p ") ||
2713 cbstrfwimatch(elem, "<div") || cbstrfwimatch(elem, "<hr") ||
2714 cbstrfwimatch(elem, "<ul") || cbstrfwimatch(elem, "<ol") ||
2715 cbstrfwimatch(elem, "<dl") || cbstrfwimatch(elem, "<li") ||
2716 cbstrfwimatch(elem, "<dt") || cbstrfwimatch(elem, "<dd") ||
2717 cbstrfwimatch(elem, "<th") || cbstrfwimatch(elem, "<td") ||
2718 cbstrfwimatch(elem, "<pre")){
2719 if(strchr(CB_DATUMPTR(datum), '&')){
2720 rbuf = est_html_raw_text(CB_DATUMPTR(datum));
2721 est_doc_add_text(doc, rbuf);
2722 free(rbuf);
2723 } else {
2724 est_doc_add_text(doc, CB_DATUMPTR(datum));
2725 }
2726 cbdatumsetsize(datum, 0);
2727 }
2728 } else {
2729 cbdatumcat(datum, " ", -1);
2730 cbdatumcat(datum, elem, esiz);
2731 }
2732 }
2733 cblistclose(elems);
2734 if(strchr(CB_DATUMPTR(datum), '&')){
2735 rbuf = est_html_raw_text(CB_DATUMPTR(datum));
2736 est_doc_add_text(doc, rbuf);
2737 free(rbuf);
2738 } else {
2739 est_doc_add_text(doc, CB_DATUMPTR(datum));
2740 }
2741 cbdatumclose(datum);
2742 if(nbuf) free(nbuf);
2743 est_doc_add_attr(doc, ESTDATTRTYPE, "text/html");
2744 sprintf(numbuf, "%d", size);
2745 est_doc_add_attr(doc, ESTDATTRSIZE, numbuf);
2746 return doc;
2747 }
2748
2749
2750 /* get the encoding of an HTML string */
2751 static char *est_html_enc(const char *str){
2752 CBLIST *elems;
2753 CBMAP *attrs;
2754 const char *elem, *equiv, *content;
2755 char *enc, *pv;
2756 int i;
2757 assert(str);
2758 elems = cbxmlbreak(str, TRUE);
2759 for(i = 0; i < CB_LISTNUM(elems); i++){
2760 elem = CB_LISTVAL(elems, i, NULL);
2761 if(elem[0] != '<' || !cbstrfwimatch(elem, "<meta")) continue;
2762 enc = NULL;
2763 attrs = cbxmlattrs(elem);
2764 equiv = cbmapget(attrs, "http-equiv", -1, NULL);
2765 if(!equiv) equiv = cbmapget(attrs, "HTTP-EQUIV", -1, NULL);
2766 if(!equiv) equiv = cbmapget(attrs, "Http-Equiv", -1, NULL);
2767 if(!equiv) equiv = cbmapget(attrs, "Http-equiv", -1, NULL);
2768 if(equiv && !cbstricmp(equiv, "Content-Type")){
2769 content = cbmapget(attrs, "content", -1, NULL);
2770 if(!content) content = cbmapget(attrs, "Content", -1, NULL);
2771 if(!content) content = cbmapget(attrs, "CONTENT", -1, NULL);
2772 if(content && ((pv = strstr(content, "charset")) != NULL ||
2773 (pv = strstr(content, "Charset")) != NULL ||
2774 (pv = strstr(content, "CHARSET")) != NULL)){
2775 enc = cbmemdup(pv + 8, -1);
2776 if((pv = strchr(enc, ';')) != NULL || (pv = strchr(enc, '\r')) != NULL ||
2777 (pv = strchr(enc, '\n')) != NULL || (pv = strchr(enc, ' ')) != NULL) *pv = '\0';
2778 }
2779 }
2780 cbmapclose(attrs);
2781 if(enc){
2782 cblistclose(elems);
2783 return enc;
2784 }
2785 }
2786 cblistclose(elems);
2787 return NULL;
2788 }
2789
2790
2791 /* unescape entity references of HTML */
2792 static char *est_html_raw_text(const char *html){
2793 static const char *pairs[] = {
2794 /* basic symbols */
2795 "&amp;", "&", "&lt;", "<", "&gt;", ">", "&quot;", "\"", "&apos;", "'",
2796 /* ISO-8859-1 */
2797 "&nbsp;", "\xc2\xa0", "&iexcl;", "\xc2\xa1", "&cent;", "\xc2\xa2",
2798 "&pound;", "\xc2\xa3", "&curren;", "\xc2\xa4", "&yen;", "\xc2\xa5",
2799 "&brvbar;", "\xc2\xa6", "&sect;", "\xc2\xa7", "&uml;", "\xc2\xa8",
2800 "&copy;", "\xc2\xa9", "&ordf;", "\xc2\xaa", "&laquo;", "\xc2\xab",
2801 "&not;", "\xc2\xac", "&shy;", "\xc2\xad", "&reg;", "\xc2\xae",
2802 "&macr;", "\xc2\xaf", "&deg;", "\xc2\xb0", "&plusmn;", "\xc2\xb1",
2803 "&sup2;", "\xc2\xb2", "&sup3;", "\xc2\xb3", "&acute;", "\xc2\xb4",
2804 "&micro;", "\xc2\xb5", "&para;", "\xc2\xb6", "&middot;", "\xc2\xb7",
2805 "&cedil;", "\xc2\xb8", "&sup1;", "\xc2\xb9", "&ordm;", "\xc2\xba",
2806 "&raquo;", "\xc2\xbb", "&frac14;", "\xc2\xbc", "&frac12;", "\xc2\xbd",
2807 "&frac34;", "\xc2\xbe", "&iquest;", "\xc2\xbf", "&Agrave;", "\xc3\x80",
2808 "&Aacute;", "\xc3\x81", "&Acirc;", "\xc3\x82", "&Atilde;", "\xc3\x83",
2809 "&Auml;", "\xc3\x84", "&Aring;", "\xc3\x85", "&AElig;", "\xc3\x86",
2810 "&Ccedil;", "\xc3\x87", "&Egrave;", "\xc3\x88", "&Eacute;", "\xc3\x89",
2811 "&Ecirc;", "\xc3\x8a", "&Euml;", "\xc3\x8b", "&Igrave;", "\xc3\x8c",
2812 "&Iacute;", "\xc3\x8d", "&Icirc;", "\xc3\x8e", "&Iuml;", "\xc3\x8f",
2813 "&ETH;", "\xc3\x90", "&Ntilde;", "\xc3\x91", "&Ograve;", "\xc3\x92",
2814 "&Oacute;", "\xc3\x93", "&Ocirc;", "\xc3\x94", "&Otilde;", "\xc3\x95",
2815 "&Ouml;", "\xc3\x96", "&times;", "\xc3\x97", "&Oslash;", "\xc3\x98",
2816 "&Ugrave;", "\xc3\x99", "&Uacute;", "\xc3\x9a", "&Ucirc;", "\xc3\x9b",
2817 "&Uuml;", "\xc3\x9c", "&Yacute;", "\xc3\x9d", "&THORN;", "\xc3\x9e",
2818 "&szlig;", "\xc3\x9f", "&agrave;", "\xc3\xa0", "&aacute;", "\xc3\xa1",
2819 "&acirc;", "\xc3\xa2", "&atilde;", "\xc3\xa3", "&auml;", "\xc3\xa4",
2820 "&aring;", "\xc3\xa5", "&aelig;", "\xc3\xa6", "&ccedil;", "\xc3\xa7",
2821 "&egrave;", "\xc3\xa8", "&eacute;", "\xc3\xa9", "&ecirc;", "\xc3\xaa",
2822 "&euml;", "\xc3\xab", "&igrave;", "\xc3\xac", "&iacute;", "\xc3\xad",
2823 "&icirc;", "\xc3\xae", "&iuml;", "\xc3\xaf", "&eth;", "\xc3\xb0",
2824 "&ntilde;", "\xc3\xb1", "&ograve;", "\xc3\xb2", "&oacute;", "\xc3\xb3",
2825 "&ocirc;", "\xc3\xb4", "&otilde;", "\xc3\xb5", "&ouml;", "\xc3\xb6",
2826 "&divide;", "\xc3\xb7", "&oslash;", "\xc3\xb8", "&ugrave;", "\xc3\xb9",
2827 "&uacute;", "\xc3\xba", "&ucirc;", "\xc3\xbb", "&uuml;", "\xc3\xbc",
2828 "&yacute;", "\xc3\xbd", "&thorn;", "\xc3\xbe", "&yuml;", "\xc3\xbf",
2829 /* ISO-10646 */
2830 "&fnof;", "\xc6\x92", "&Alpha;", "\xce\x91", "&Beta;", "\xce\x92",
2831 "&Gamma;", "\xce\x93", "&Delta;", "\xce\x94", "&Epsilon;", "\xce\x95",
2832 "&Zeta;", "\xce\x96", "&Eta;", "\xce\x97", "&Theta;", "\xce\x98",
2833 "&Iota;", "\xce\x99", "&Kappa;", "\xce\x9a", "&Lambda;", "\xce\x9b",
2834 "&Mu;", "\xce\x9c", "&Nu;", "\xce\x9d", "&Xi;", "\xce\x9e",
2835 "&Omicron;", "\xce\x9f", "&Pi;", "\xce\xa0", "&Rho;", "\xce\xa1",
2836 "&Sigma;", "\xce\xa3", "&Tau;", "\xce\xa4", "&Upsilon;", "\xce\xa5",
2837 "&Phi;", "\xce\xa6", "&Chi;", "\xce\xa7", "&Psi;", "\xce\xa8",
2838 "&Omega;", "\xce\xa9", "&alpha;", "\xce\xb1", "&beta;", "\xce\xb2",
2839 "&gamma;", "\xce\xb3", "&delta;", "\xce\xb4", "&epsilon;", "\xce\xb5",
2840 "&zeta;", "\xce\xb6", "&eta;", "\xce\xb7", "&theta;", "\xce\xb8",
2841 "&iota;", "\xce\xb9", "&kappa;", "\xce\xba", "&lambda;", "\xce\xbb",
2842 "&mu;", "\xce\xbc", "&nu;", "\xce\xbd", "&xi;", "\xce\xbe",
2843 "&omicron;", "\xce\xbf", "&pi;", "\xcf\x80", "&rho;", "\xcf\x81",
2844 "&sigmaf;", "\xcf\x82", "&sigma;", "\xcf\x83", "&tau;", "\xcf\x84",
2845 "&upsilon;", "\xcf\x85", "&phi;", "\xcf\x86", "&chi;", "\xcf\x87",
2846 "&psi;", "\xcf\x88", "&omega;", "\xcf\x89", "&thetasym;", "\xcf\x91",
2847 "&upsih;", "\xcf\x92", "&piv;", "\xcf\x96", "&bull;", "\xe2\x80\xa2",
2848 "&hellip;", "\xe2\x80\xa6", "&prime;", "\xe2\x80\xb2", "&Prime;", "\xe2\x80\xb3",
2849 "&oline;", "\xe2\x80\xbe", "&frasl;", "\xe2\x81\x84", "&weierp;", "\xe2\x84\x98",
2850 "&image;", "\xe2\x84\x91", "&real;", "\xe2\x84\x9c", "&trade;", "\xe2\x84\xa2",
2851 "&alefsym;", "\xe2\x84\xb5", "&larr;", "\xe2\x86\x90", "&uarr;", "\xe2\x86\x91",
2852 "&rarr;", "\xe2\x86\x92", "&darr;", "\xe2\x86\x93", "&harr;", "\xe2\x86\x94",
2853 "&crarr;", "\xe2\x86\xb5", "&lArr;", "\xe2\x87\x90", "&uArr;", "\xe2\x87\x91",
2854 "&rArr;", "\xe2\x87\x92", "&dArr;", "\xe2\x87\x93", "&hArr;", "\xe2\x87\x94",
2855 "&forall;", "\xe2\x88\x80", "&part;", "\xe2\x88\x82", "&exist;", "\xe2\x88\x83",
2856 "&empty;", "\xe2\x88\x85", "&nabla;", "\xe2\x88\x87", "&isin;", "\xe2\x88\x88",
2857 "&notin;", "\xe2\x88\x89", "&ni;", "\xe2\x88\x8b", "&prod;", "\xe2\x88\x8f",
2858 "&sum;", "\xe2\x88\x91", "&minus;", "\xe2\x88\x92", "&lowast;", "\xe2\x88\x97",
2859 "&radic;", "\xe2\x88\x9a", "&prop;", "\xe2\x88\x9d", "&infin;", "\xe2\x88\x9e",
2860 "&ang;", "\xe2\x88\xa0", "&and;", "\xe2\x88\xa7", "&or;", "\xe2\x88\xa8",
2861 "&cap;", "\xe2\x88\xa9", "&cup;", "\xe2\x88\xaa", "&int;", "\xe2\x88\xab",
2862 "&there4;", "\xe2\x88\xb4", "&sim;", "\xe2\x88\xbc", "&cong;", "\xe2\x89\x85",
2863 "&asymp;", "\xe2\x89\x88", "&ne;", "\xe2\x89\xa0", "&equiv;", "\xe2\x89\xa1",
2864 "&le;", "\xe2\x89\xa4", "&ge;", "\xe2\x89\xa5", "&sub;", "\xe2\x8a\x82",
2865 "&sup;", "\xe2\x8a\x83", "&nsub;", "\xe2\x8a\x84", "&sube;", "\xe2\x8a\x86",
2866 "&supe;", "\xe2\x8a\x87", "&oplus;", "\xe2\x8a\x95", "&otimes;", "\xe2\x8a\x97",
2867 "&perp;", "\xe2\x8a\xa5", "&sdot;", "\xe2\x8b\x85", "&lceil;", "\xe2\x8c\x88",
2868 "&rceil;", "\xe2\x8c\x89", "&lfloor;", "\xe2\x8c\x8a", "&rfloor;", "\xe2\x8c\x8b",
2869 "&lang;", "\xe2\x8c\xa9", "&rang;", "\xe2\x8c\xaa", "&loz;", "\xe2\x97\x8a",
2870 "&spades;", "\xe2\x99\xa0", "&clubs;", "\xe2\x99\xa3", "&hearts;", "\xe2\x99\xa5",
2871 "&diams;", "\xe2\x99\xa6", "&OElig;", "\xc5\x92", "&oelig;", "\xc5\x93",
2872 "&Scaron;", "\xc5\xa0", "&scaron;", "\xc5\xa1", "&Yuml;", "\xc5\xb8",
2873 "&circ;", "\xcb\x86", "&tilde;", "\xcb\x9c", "&ensp;", "\xe2\x80\x82",
2874 "&emsp;", "\xe2\x80\x83", "&thinsp;", "\xe2\x80\x89", "&zwnj;", "\xe2\x80\x8c",
2875 "&zwj;", "\xe2\x80\x8d", "&lrm;", "\xe2\x80\x8e", "&rlm;", "\xe2\x80\x8f",
2876 "&ndash;", "\xe2\x80\x93", "&mdash;", "\xe2\x80\x94", "&lsquo;", "\xe2\x80\x98",
2877 "&rsquo;", "\xe2\x80\x99", "&sbquo;", "\xe2\x80\x9a", "&ldquo;", "\xe2\x80\x9c",
2878 "&rdquo;", "\xe2\x80\x9d", "&bdquo;", "\xe2\x80\x9e", "&dagger;", "\xe2\x80\xa0",
2879 "&Dagger;", "\xe2\x80\xa1", "&permil;", "\xe2\x80\xb0", "&lsaquo;", "\xe2\x80\xb9",
2880 "&rsaquo;", "\xe2\x80\xba", "&euro;", "\xe2\x82\xac",
2881 NULL
2882 };
2883 char *raw, *wp, buf[2], *tmp;
2884 int i, j, hit, num, tsiz;
2885 assert(html);
2886 CB_MALLOC(raw, strlen(html) * 3 + 1);
2887 wp = raw;
2888 while(*html != '\0'){
2889 if(*html == '&'){
2890 if(*(html + 1) == '#'){
2891 if(*(html + 2) == 'x' || *(html + 2) == 'X'){
2892 num = strtol(html + 3, NULL, 16);
2893 } else {
2894 num = atoi(html + 2);
2895 }
2896 buf[0] = num / 256;
2897 buf[1] = num % 256;
2898 if((tmp = est_uconv_out(buf, 2, &tsiz)) != NULL){
2899 for(j = 0; j < tsiz; j++){
2900 *wp = ((unsigned char *)tmp)[j];
2901 wp++;
2902 }
2903 free(tmp);
2904 }
2905 while(*html != ';' && *html != ' ' && *html != '\n' && *html != '\0'){
2906 html++;
2907 }
2908 if(*html == ';') html++;
2909 } else {
2910 hit = FALSE;
2911 for(i = 0; pairs[i] != NULL; i += 2){
2912 if(cbstrfwmatch(html, pairs[i])){
2913 wp += sprintf(wp, "%s", pairs[i+1]);
2914 html += strlen(pairs[i]);
2915 hit = TRUE;
2916 break;
2917 }
2918 }
2919 if(!hit){
2920 *wp = *html;
2921 wp++;
2922 html++;
2923 }
2924 }
2925 } else {
2926 *wp = *html;
2927 wp++;
2928 html++;
2929 }
2930 }
2931 *wp = '\0';
2932 return raw;
2933 }
2934
2935
2936 /* create a document object from MIME */
2937 static ESTDOC *est_doc_new_from_mime(const char *buf, int size, const char *penc, int plang){
2938 ESTDOC *doc, *tdoc;
2939 CBMAP *attrs;
2940 const CBLIST *texts;
2941 CBLIST *parts, *lines;
2942 CBDATUM *datum;
2943 const char *key, *val, *bound, *part, *text, *line;
2944 char *body, *swap, numbuf[NUMBUFSIZ];
2945 int i, j, bsiz, psiz, ssiz, mht;
2946 assert(buf);
2947 doc = est_doc_new();
2948 attrs = cbmapopenex(MINIBNUM);
2949 body = cbmimebreak(buf, size, attrs, &bsiz);
2950 if((val = cbmapget(attrs, "subject", -1, NULL)) != NULL){
2951 est_doc_add_attr_mime(doc, ESTDATTRTITLE, val);
2952 if((val = est_doc_attr(doc, ESTDATTRTITLE)) != NULL) est_doc_add_hidden_text(doc, val);
2953 }
2954 if((val = cbmapget(attrs, "from", -1, NULL)) != NULL)
2955 est_doc_add_attr_mime(doc, ESTDATTRAUTHOR, val);
2956 if((val = cbmapget(attrs, "date", -1, NULL)) != NULL){
2957 est_doc_add_attr_mime(doc, ESTDATTRCDATE, val);
2958 est_doc_add_attr_mime(doc, ESTDATTRMDATE, val);
2959 }
2960 est_doc_add_attr(doc, ESTDATTRTYPE, "message/rfc822");
2961 sprintf(numbuf, "%d", size);
2962 est_doc_add_attr(doc, ESTDATTRSIZE, numbuf);
2963 cbmapiterinit(attrs);
2964 while((key = cbmapiternext(attrs, NULL)) != NULL){
2965 if((key[0] >= 'A' && key[0] <= 'Z') || key[0] == '@') continue;
2966 val = cbmapget(attrs, key, -1, NULL);
2967 est_doc_add_attr_mime(doc, key, val);
2968 }
2969 if((key = cbmapget(attrs, "TYPE", -1, NULL)) != NULL && cbstrfwimatch(key, "multipart/")){
2970 mht = cbstrfwimatch(key, "multipart/related");
2971 if((bound = cbmapget(attrs, "BOUNDARY", -1, NULL)) != NULL){
2972 parts = cbmimeparts(body, bsiz, bound);
2973 for(i = 0; i < CB_LISTNUM(parts) && i < 8; i++){
2974 part = CB_LISTVAL2(parts, i, &psiz);
2975 tdoc = est_doc_new_from_mime(part, psiz, penc, plang);
2976 if(mht){
2977 if((text = est_doc_attr(tdoc, ESTDATTRTITLE)) != NULL)
2978 est_doc_add_attr(doc, ESTDATTRTITLE, text);
2979 if((text = est_doc_attr(tdoc, ESTDATTRAUTHOR)) != NULL)
2980 est_doc_add_attr(doc, ESTDATTRAUTHOR, text);
2981 }
2982 texts = est_doc_texts(tdoc);
2983 for(j = 0; j < CB_LISTNUM(texts); j++){
2984 text = CB_LISTVAL(texts, j, NULL);
2985 est_doc_add_text(doc, text);
2986 }
2987 est_doc_delete(tdoc);
2988 }
2989 cblistclose(parts);
2990 }
2991 } else {
2992 if((key = cbmapget(attrs, "content-transfer-encoding", -1, NULL)) != NULL &&
2993 cbstrfwimatch(key, "base64")){
2994 swap = cbbasedecode(body, &ssiz);
2995 free(body);
2996 body = swap;
2997 bsiz = ssiz;
2998 } else if((key = cbmapget(attrs, "content-transfer-encoding", -1, NULL)) != NULL &&
2999 cbstrfwimatch(key, "quoted-printable")){
3000 swap = cbquotedecode(body, &ssiz);
3001 free(body);
3002 body = swap;
3003 bsiz = ssiz;
3004 }
3005 if(!(key = cbmapget(attrs, "TYPE", -1, NULL)) || cbstrfwimatch(key, "text/plain")){
3006 if(penc && (swap = est_iconv(body, bsiz, penc, "UTF-8", &ssiz, NULL)) != NULL){
3007 free(body);
3008 body = swap;
3009 bsiz = ssiz;
3010 } else if((key = cbmapget(attrs, "CHARSET", -1, NULL)) != NULL &&
3011 (swap = est_iconv(body, bsiz, key, "UTF-8", &ssiz, NULL)) != NULL){
3012 free(body);
3013 body = swap;
3014 bsiz = ssiz;
3015 }
3016 lines = cbsplit(body, bsiz, "\n");
3017 datum = cbdatumopen("", 0);
3018 for(i = 0; i < CB_LISTNUM(lines); i++){
3019 line = CB_LISTVAL(lines, i, NULL);
3020 while(*line == ' ' || *line == '>' || *line == '|' || *line == '\t' || *line == '\r'){
3021 line++;
3022 }
3023 if(line[0] == '\0'){
3024 est_doc_add_text(doc, CB_DATUMPTR(datum));
3025 cbdatumsetsize(datum, 0);
3026 } else {
3027 cbdatumcat(datum, " ", 1);
3028 cbdatumcat(datum, line, -1);
3029 }
3030 }
3031 est_doc_add_text(doc, CB_DATUMPTR(datum));
3032 cbdatumclose(datum);
3033 cblistclose(lines);
3034 } else if(cbstrfwimatch(key, "text/html")){
3035 tdoc = est_doc_new_from_html(body, bsiz, penc, plang);
3036 if((text = est_doc_attr(tdoc, ESTDATTRTITLE)) != NULL){
3037 if(!est_doc_attr(doc, ESTDATTRTITLE)) est_doc_add_attr(doc, ESTDATTRTITLE, text);
3038 est_doc_add_text(doc, text);
3039 }
3040 if((text = est_doc_attr(tdoc, ESTDATTRAUTHOR)) != NULL){
3041 if(!est_doc_attr(doc, ESTDATTRAUTHOR)) est_doc_add_attr(doc, ESTDATTRAUTHOR, text);
3042 est_doc_add_text(doc, text);
3043 }
3044 texts = est_doc_texts(tdoc);
3045 for(i = 0; i < CB_LISTNUM(texts); i++){
3046 text = CB_LISTVAL(texts, i, NULL);
3047 est_doc_add_text(doc, text);
3048 }
3049 est_doc_delete(tdoc);
3050 } else if(cbstrfwimatch(key, "message/rfc822")){
3051 tdoc = est_doc_new_from_mime(body, bsiz, penc, plang);
3052 if((text = est_doc_attr(tdoc, ESTDATTRTITLE)) != NULL){
3053 if(!est_doc_attr(doc, ESTDATTRTITLE)) est_doc_add_attr(doc, ESTDATTRTITLE, text);
3054 est_doc_add_text(doc, text);
3055 }
3056 if((text = est_doc_attr(tdoc, ESTDATTRAUTHOR)) != NULL){
3057 if(!est_doc_attr(doc, ESTDATTRAUTHOR)) est_doc_add_attr(doc, ESTDATTRAUTHOR, text);
3058 est_doc_add_text(doc, text);
3059 }
3060 texts = est_doc_texts(tdoc);
3061 for(i = 0; i < CB_LISTNUM(texts); i++){
3062 text = CB_LISTVAL(texts, i, NULL);
3063 est_doc_add_text(doc, text);
3064 }
3065 est_doc_delete(tdoc);
3066 } else if(cbstrfwimatch(key, "text/")){
3067 tdoc = est_doc_new_from_text(body, bsiz, penc, plang);
3068 texts = est_doc_texts(tdoc);
3069 for(i = 0; i < CB_LISTNUM(texts); i++){
3070 text = CB_LISTVAL(texts, i, NULL);
3071 est_doc_add_text(doc, text);
3072 }
3073 est_doc_delete(tdoc);
3074 }
3075 }
3076 free(body);
3077 cbmapclose(attrs);
3078 return doc;
3079 }
3080
3081
3082 /* set mime value as an attribute of a document */
3083 static void est_doc_add_attr_mime(ESTDOC *doc, const char *name, const char *value){
3084 char enc[64], *ebuf, *rbuf;
3085 assert(doc && name && value);
3086 ebuf = cbmimedecode(value, enc);
3087 if((rbuf = est_iconv(ebuf, -1, enc, "UTF-8", NULL, NULL)) != NULL){
3088 est_doc_add_attr(doc, name, rbuf);
3089 free(rbuf);
3090 }
3091 free(ebuf);
3092 }
3093
3094
3095 /* generate a document with random text */
3096 static ESTDOC *est_doc_new_from_chaos(int cnum, int snum, int mode){
3097 ESTDOC *doc;
3098 char *str;
3099 int i;
3100 doc = est_doc_new();
3101 snum *= pow(est_random_nd() + 0.5, 3.0);
3102 if(mode == RD_RAND){
3103 mode = est_random() * 100;
3104 if(mode < 20){
3105 mode = RD_ENG;
3106 est_doc_add_attr(doc, "mode", "english");
3107 } else if(mode < 40){
3108 mode = RD_LAT;
3109 est_doc_add_attr(doc, "mode", "latin");
3110 } else if(mode < 60){
3111 mode = RD_EURO;
3112 est_doc_add_attr(doc, "mode", "euromix");
3113 } else if(mode < 65){
3114 mode = RD_ORI;
3115 est_doc_add_attr(doc, "mode", "oriental");
3116 } else if(mode < 95){
3117 mode = RD_JPN;
3118 est_doc_add_attr(doc, "mode", "japanese");
3119 } else {
3120 mode = RD_CHAO;
3121 est_doc_add_attr(doc, "mode", "chaos");
3122 }
3123 }
3124 switch(mode){
3125 case RD_ENG: est_doc_add_attr(doc, "mode", "english"); break;
3126 case RD_LAT: est_doc_add_attr(doc, "mode", "latin"); break;
3127 case RD_ORI: est_doc_add_attr(doc, "mode", "oriental"); break;
3128 case RD_JPN: est_doc_add_attr(doc, "mode", "japanese"); break;
3129 case RD_EURO: est_doc_add_attr(doc, "mode", "euromix"); break;
3130 case RD_CHAO: est_doc_add_attr(doc, "mode", "chaos"); break;
3131 }
3132 for(i = 0; i <= snum; i++){
3133 str = est_random_str(cnum, mode);
3134 if(est_random() < 0.05){
3135 est_doc_add_hidden_text(doc, str);
3136 } else {
3137 est_doc_add_text(doc, str);
3138 }
3139 free(str);
3140 }
3141 return doc;
3142 }
3143
3144
3145 /* generate random string */
3146 static char *est_random_str(int cnum, int mode){
3147 const char echrs[] = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
3148 CBDATUM *buf;
3149 char wc[2], *str;
3150 int i, c, wlen, dec, mm, big, n;
3151 buf = cbdatumopen("", 0);
3152 cnum *= pow(est_random_nd() + 0.5, 3.0);
3153 wlen = est_random_nd() * 8 + 4;
3154 dec = (int)(est_random() * INT_MAX) % 10;
3155 big = (((int)(est_random() * INT_MAX) % 0x29)) * 0x100;
3156 for(i = 0; i < cnum; i++){
3157 switch(mode){
3158 case RD_ENG: case RD_LAT: case RD_EURO:
3159 mm = (int)(est_random() * INT_MAX) % 100;
3160 if((mode == RD_LAT || mode == RD_EURO) && mm < 5){
3161 c = 0x00a1 + (int)(pow(est_random_nd(), 2.0) * (0x00ff - 0x00a0));
3162 } else if(mode == RD_EURO && (mm < 30 || dec > 8)){
3163 if(dec % 2 == 0){
3164 c = 0x0391 + (int)(pow(est_random_nd(), 2.0) * (0x03d6 - 0x0391));
3165 } else {
3166 c = 0x0400 + (int)(pow(est_random_nd(), 2.0) * (0x045f - 0x0400));
3167 }
3168 } else if(mm < 95){
3169 if((n = est_random_nd() * (sizeof(echrs) - 1)) == (sizeof(echrs) - 1)) n = 0;
3170 c = echrs[n];
3171 } else {
3172 c = (int)(est_random() * ('@' - ' ')) + ' ';
3173 }
3174 if(--wlen < 1){
3175 c = ' ';
3176 wlen = pow(est_random_nd(), 3.0) * 8 + 4;
3177 dec = (int)(est_random() * INT_MAX) % 10;
3178 }
3179 break;
3180 case RD_ORI:
3181 c = big + est_random_nd() * 0x100;
3182 if(--wlen < 1){
3183 wlen = pow(est_random_nd(), 3.0) * 12 + 6;
3184 big = (((int)(est_random() * INT_MAX) % 0x29)) * 0x100;
3185 }
3186 break;
3187 case RD_JPN:
3188 if(dec < 4){
3189 c = 0x3041 + pow(est_random_nd(), 3.0) * (0x3094 - 0x3041);
3190 } else if(dec < 7){
3191 c = 0x30a1 + pow(est_random_nd(), 3.0) * (0x30fe - 0x30a1);
3192 } else if(dec < 9){
3193 c = 0x4e00 + pow(est_random_nd(), 3.0) * (0x9faf - 0x4e00);
3194 } else {
3195 if(est_random() < 0.7){
3196 c = 0x00a1 + (int)(pow(est_random_nd(), 2.0) * (0x00ff - 0x00a0));
3197 } else {
3198 c = 0x3041 + est_random() * (0xffef - 0x3041);
3199 }
3200 }
3201 if(--wlen < 1){
3202 wlen = pow(est_random_nd(), 3.0) * 12 + 6;
3203 dec = (int)(est_random() * INT_MAX) % 10;
3204 }
3205 break;
3206 default:
3207 if(est_random() < 0.2){
3208 c = 0x00a1 + (int)est_random() * (0x00ff - 0x00a0);
3209 } else {
3210 c = (int)(est_random() * 0x10000);
3211 }
3212 break;
3213 }
3214 if(c <= 0 || c >= 0x10000) c = 0x0020;
3215 wc[0] = c / 0x100;
3216 wc[1] = c % 0x100;
3217 cbdatumcat(buf, wc, 2);
3218 }
3219 str = est_iconv(CB_DATUMPTR(buf), CB_DATUMSIZE(buf), "UTF-16BE", "UTF-8", NULL, NULL);
3220 cbdatumclose(buf);
3221 return str;
3222 }
3223
3224
3225
3226 /* END OF FILE */

  ViewVC Help
Powered by ViewVC 1.1.26