/[hyperestraier]/trunk/estcmd.c
This is a repository of my old source code, which is no longer updated. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/estcmd.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 2 - (hide annotations)
Fri Jul 29 21:56:53 2005 UTC (18 years, 10 months ago) by dpavlin
Original Path: upstream/0.5.1/estcmd.c
File MIME type: text/plain
File size: 105537 byte(s)
import of HyperEstraier 0.5.1

1 dpavlin 2 /*************************************************************************************************
2     * The command line interface for the core API
3     * Copyright (C) 2004-2005 Mikio Hirabayashi
4     * This file is part of Hyper Estraier.
5     * Hyper Estraier is free software; you can redistribute it and/or modify it under the terms of
6     * the GNU Lesser General Public License as published by the Free Software Foundation; either
7     * version 2.1 of the License or any later version. Hyper Estraier is distributed in the hope
8     * that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
9     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
10     * License for more details.
11     * You should have received a copy of the GNU Lesser General Public License along with Hyper
12     * Estraier; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
13     * Boston, MA 02111-1307 USA.
14     *************************************************************************************************/
15    
16    
17     #include "estraier.h"
18     #include "myconf.h"
19    
20     #define NUMBUFSIZ 32 /* size of a buffer for a number */
21     #define URIBUFSIZ 8192 /* size of a buffer for a URI */
22     #define MINIBNUM 31 /* bucket number of a small map */
23     #define SEARCHMAX 10 /* maximum number of shown documents */
24     #define SNIPWWIDTH 480 /* whole width of the snippet */
25     #define SNIPHWIDTH 96 /* width of beginning of the text */
26     #define SNIPAWIDTH 96 /* width around each highlighted word */
27     #define CACHEMAX (512*1024*1024) /* maximum cache size, 512 mega bytes written out in bytes */
28     #define DATTRLPATH "_lpath" /* name of the attribute of the local path */
29     #define DATTRLFILE "_lfile" /* name of the attribute of the local file name */
30     #define DATTRSCORE "#score" /* name of the pseudo-attribute of score */
31     #define DATTRKWORDS "#kwords" /* name of the pseudo-attribute of keywords */
32     #define KWDBNAME "kwords" /* name of the database for keywords */
33     #define KWDBBNUM 122869 /* bucket number of the keyword database */
34     #define KWDBDNUM 3 /* division number of the keyword database */
35     #define KWORDNUM 32 /* default number of shown keywords (initial value of g_kwordnum) */
36     #define RDOCSNUM 6 /* number of sections of a random document */
37     #define RDOCCNUM 256 /* number of characters in a section */
38    
39     enum { /* enumeration for viewing modes (stored in g_viewmode) */
40     VM_ID, /* ID only (initial value of g_viewmode) */
41     VM_URI, /* ID and URI (search option -vu) */
42     VM_ATTR, /* all attributes (search option -va) */
43     VM_FULL, /* all attributes and body text (search option -vf) */
44     VM_SNIP, /* all attributes and snippet (search option -vs) */
45     VM_HMRD, /* human readable (search option -vh) */
46     VM_XML, /* XML (search option -vx) */
47     VM_DUMP /* dump draft files (search option -dd) */
48     };
49    
50     enum { /* enumeration for file formats (stored in g_filefmt) */
51     FF_AUTO, /* automatic detection (initial value of g_filefmt) */
52     FF_DRAFT, /* draft (gather option -fe; preset by the draft command) */
53     FF_TEXT, /* plain text (option -ft) */
54     FF_HTML, /* HTML (option -fh) */
55     FF_MIME, /* MIME (option -fm) */
56     FF_NONE /* ignored (gather option -fz) */
57     };
58    
59     enum { /* enumeration for test documents (stored in g_rdmode) */
60     RD_ENG, /* English (randput option -ren) */
61     RD_LAT, /* Latin (randput option -rla) */
62     RD_EURO, /* European mix (randput option -reu) */
63     RD_ORI, /* Oriental (randput option -ror) */
64     RD_JPN, /* Japanese (randput option -rjp) */
65     RD_CHAO, /* chaos (randput option -rch) */
66     RD_RAND /* selected at random (initial value of g_rdmode) */
67     };
68    
69    
70     /* global variables: mostly settings filled in by the run* option parsers */
71     const char *g_progname; /* program name */
72     int g_sigterm = FALSE; /* flag for termination signal */
73     int g_putopts = 0; /* options of registration */
74     int g_outopts = 0; /* options of deletion */
75     int g_optopts = 0; /* options of optimization */
76     const char *g_inputcode = "UTF-8"; /* input encoding */
77     int g_inputlang = ESTLANGEN; /* preferred language */
78     const char *g_pathcode = NULL; /* path encoding */
79     int g_pathfull = FALSE; /* whether to record full paths */
80     int g_oextmodes = 0; /* extra open modes */
81     int g_viewmode = VM_ID; /* viewing mode */
82     int g_filefmt = FF_AUTO; /* file format */
83     CBMAP *g_xcmdmap = NULL; /* map of suffixes and filter commands */
84     int g_filtorig = FALSE; /* whether to use filter for original files */
85     int g_stdate = FALSE; /* whether to adopt date by stat */
86     int g_chkmdate = FALSE; /* whether to check modification date */
87     double g_cachesize = -1; /* size of the cache */
88     int g_doforce = FALSE; /* whether to force purging or extracting */
89     int g_kwordnum = KWORDNUM; /* number of keywords */
90     int g_condopts = 0; /* options of the search condition */
91     int g_rdmode = RD_RAND; /* mode of random documents */
92    
93    
94     /* function prototypes */
/* entry point, logging helpers, signal handling and usage */
95     int main(int argc, char **argv);
96     static void printferror(const char *format, ...);
97     static void printfinfo(const char *format, ...);
98     static void dbinform(const char *msg);
99     static void setsignals(void);
100     static void sigtermhandler(int num);
101     static void usage(void);
/* run*: parse the command line of one sub-command, then call the matching proc* */
102     static int runput(int argc, char **argv);
103     static int runout(int argc, char **argv);
104     static int runget(int argc, char **argv);
105     static int runlist(int argc, char **argv);
106     static int runuriid(int argc, char **argv);
107     static int runmeta(int argc, char **argv);
108     static int runinform(int argc, char **argv);
109     static int runoptimize(int argc, char **argv);
110     static int runsearch(int argc, char **argv);
111     static int rungather(int argc, char **argv);
112     static int runpurge(int argc, char **argv);
113     static int runextkeys(int argc, char **argv);
114     static int rundraft(int argc, char **argv);
115     static int runbreak(int argc, char **argv);
116     static int runrandput(int argc, char **argv);
117     static int runwicked(int argc, char **argv);
118     static int runregression(int argc, char **argv);
/* proc*: perform one sub-command with already parsed arguments */
119     static int procput(const char *dbname, const char *filename);
120     static int procout(const char *dbname, int id, const char *expr);
121     static int procget(const char *dbname, int id, const char *expr, const char *attr);
122     static int proclist(const char *dbname);
123     static int procuriid(const char *dbname, const char *uri);
124     static int procmeta(const char *dbname, const char *mname, const char *mvalue);
125     static int procinform(const char *dbname);
126     static int procoptimize(const char *dbname);
127     static int procsearch(const char *dbname, const char *phrase,
128     const CBLIST *attrs, const char *ord, int max, int sim);
129     static int procgather(const char *dbname, const char *filename);
130     static int procpurge(const char *dbname, const char *prefix);
131     static int procextkeys(const char *dbname, const char *prefix, int ni);
132     static int procdraft(const char *filename);
133     static int procbreak(const char *filename, int wt);
134     static int procrandput(const char *dbname, int dnum);
135     static int procwicked(const char *dbname, int dnum);
136     static int procregression(const char *dbname);
/* helpers whose definitions lie further down in the file */
137     static void xmlprintf(const char *format, ...);
138     static int strtolang(const char *str);
139     static char *fgetl(FILE *ifp);
140     static int doputdoc(ESTDB *db, const char *path);
141     static const char *pathtourl(const char *path);
142     static const char *urltofile(const char *uri);
143     static char *urltopath(const char *uri);
144     static CBMAP *vectorizer(void *db, int id, void *kwdb);
145     static ESTDOC *est_doc_new_with_xcmd(const char *buf, int size, const char *path,
146     const char *xcmd, const char *tmpdir,
147     const char *penc, int plang);
148     static ESTDOC *est_doc_new_from_draft_enc(const char *buf, int size, const char *enc);
149     static ESTDOC *est_doc_new_from_text(const char *buf, int size, const char *penc, int plang);
150     static ESTDOC *est_doc_new_from_html(const char *buf, int size, const char *penc, int plang);
151     static char *est_html_enc(const char *str);
152     static char *est_html_raw_text(const char *html);
153     static ESTDOC *est_doc_new_from_mime(const char *buf, int size, const char *penc, int plang);
154     static void est_doc_add_attr_mime(ESTDOC *doc, const char *name, const char *value);
155     static ESTDOC *est_doc_new_from_chaos(int cnum, int snum, int mode);
156     static char *est_random_str(int cnum, int mode);
157    
158    
159     /* main routine */
160     int main(int argc, char **argv){
161     const char *tmp;
162     int rv;
163     if((tmp = getenv("ESTDBGFD")) != NULL) dpdbgfd = atoi(tmp);
164     cbstdiobin();
165     g_progname = argv[0];
166     g_sigterm = FALSE;
167     if(argc < 2) usage();
168     rv = 0;
169     if(!strcmp(argv[1], "put")){
170     setsignals();
171     rv = runput(argc, argv);
172     } else if(!strcmp(argv[1], "out")){
173     setsignals();
174     rv = runout(argc, argv);
175     } else if(!strcmp(argv[1], "get")){
176     rv = runget(argc, argv);
177     } else if(!strcmp(argv[1], "list")){
178     rv = runlist(argc, argv);
179     } else if(!strcmp(argv[1], "uriid")){
180     rv = runuriid(argc, argv);
181     } else if(!strcmp(argv[1], "meta")){
182     setsignals();
183     rv = runmeta(argc, argv);
184     } else if(!strcmp(argv[1], "inform")){
185     rv = runinform(argc, argv);
186     } else if(!strcmp(argv[1], "optimize")){
187     setsignals();
188     rv = runoptimize(argc, argv);
189     } else if(!strcmp(argv[1], "search")){
190     rv = runsearch(argc, argv);
191     } else if(!strcmp(argv[1], "gather")){
192     setsignals();
193     rv = rungather(argc, argv);
194     } else if(!strcmp(argv[1], "purge")){
195     setsignals();
196     rv = runpurge(argc, argv);
197     } else if(!strcmp(argv[1], "extkeys")){
198     setsignals();
199     rv = runextkeys(argc, argv);
200     } else if(!strcmp(argv[1], "draft")){
201     rv = rundraft(argc, argv);
202     } else if(!strcmp(argv[1], "break")){
203     rv = runbreak(argc, argv);
204     } else if(!strcmp(argv[1], "randput")){
205     setsignals();
206     rv = runrandput(argc, argv);
207     } else if(!strcmp(argv[1], "wicked")){
208     setsignals();
209     rv = runwicked(argc, argv);
210     } else if(!strcmp(argv[1], "regression")){
211     setsignals();
212     rv = runregression(argc, argv);
213     } else if(!strcmp(argv[1], "version") || !strcmp(argv[1], "--version")){
214     printf("Hyper Estraier %s on %s\n", est_version, ESTSYSNAME);
215     printf("Copyright (C) 2004-2005 Mikio Hirabayashi.\n");
216     rv = 0;
217     } else {
218     usage();
219     }
220     return rv;
221     }
222    
223    
224     /* print formatted error string and flush the buffer */
225     static void printferror(const char *format, ...){
226     va_list ap;
227     va_start(ap, format);
228     fprintf(stderr, "%s: ERROR: ", g_progname);
229     vfprintf(stderr, format, ap);
230     fputc('\n', stderr);
231     fflush(stderr);
232     va_end(ap);
233     }
234    
235    
236     /* print formatted information string and flush the buffer */
237     static void printfinfo(const char *format, ...){
238     va_list ap;
239     va_start(ap, format);
240     printf("%s: INFO: ", g_progname);
241     vprintf(format, ap);
242     putchar('\n');
243     fflush(stdout);
244     va_end(ap);
245     }
246    
247    
/* informer callback handed to est_db_set_informer: relay each database
   message to the INFO log; "%s" keeps the message from being interpreted as
   a format string */
static void dbinform(const char *msg){
  printfinfo("%s", msg);
  return;
}
252    
253    
254     /* set signal handlers */
255     static void setsignals(void){
256     signal(1, sigtermhandler);
257     signal(2, sigtermhandler);
258     signal(3, sigtermhandler);
259     signal(13, sigtermhandler);
260     signal(15, sigtermhandler);
261     }
262    
263    
264     /* handler of termination signal */
265     static void sigtermhandler(int num){
266     static int tries = 0;
267     if(tries++ <= 4){
268     signal(num, sigtermhandler);
269     } else {
270     signal(num, SIG_DFL);
271     }
272     g_sigterm = TRUE;
273     printfinfo("the termination signal %d catched", num);
274     }
275    
276    
277     /* print the usage and exit */
278     static void usage(void){
279     fprintf(stderr, "%s: command line utility for the core API of Hyper Estraier\n", g_progname);
280     fprintf(stderr, "\n");
281     fprintf(stderr, "usage:\n");
282     fprintf(stderr, " %s put [-cl] db [file]\n", g_progname);
283     fprintf(stderr, " %s out [-cl] db expr\n", g_progname);
284     fprintf(stderr, " %s get db expr\n", g_progname);
285     fprintf(stderr, " %s list db\n", g_progname);
286     fprintf(stderr, " %s uriid db uri\n", g_progname);
287     fprintf(stderr, " %s meta db [name [value]]\n", g_progname);
288     fprintf(stderr, " %s inform db\n", g_progname);
289     fprintf(stderr, " %s optimize [-onp] [-ond] db\n", g_progname);
290     fprintf(stderr, " %s search [-ic enc] [-vu|-va|-vf|-vs|-vh|-vx|-dd] [-gs|-gf|-ga]"
291     " [-ni] [-sf] [-hs] [-attr expr] [-ord expr] [-max num] [-sim id] db [phrase]\n",
292     g_progname);
293     fprintf(stderr, " %s gather [-cl] [-fe|-ft|-fh|-fm] [-fx sufs cmd] [-fz] [-fo]"
294     " [-ic enc] [-il lang] [-pc enc] [-pf] [-apn] [-sd] [-cm] [-cs num] db [file|dir]\n",
295     g_progname);
296     fprintf(stderr, " %s purge [-cl] [-fc] db [prefix]\n", g_progname);
297     fprintf(stderr, " %s extkeys [-fc] [-ni] [-kn num] db [prefix]\n", g_progname);
298     fprintf(stderr, " %s draft [-ft|-fh|-fm] [-ic enc] [-il lang] [file]\n", g_progname);
299     fprintf(stderr, " %s break [-ic enc] [-il lang] [-apn] [-wt] [file]\n", g_progname);
300     fprintf(stderr, " %s randput [-ren|-rla|-reu|-ror|-rjp|-rch] [-cs num] db dnum\n",
301     g_progname);
302     fprintf(stderr, " %s wicked db dnum\n", g_progname);
303     fprintf(stderr, " %s regression db\n", g_progname);
304     fprintf(stderr, " %s version\n", g_progname);
305     fprintf(stderr, "\n");
306     exit(1);
307     }
308    
309    
310     /* parse arguments of the put command */
311     static int runput(int argc, char **argv){
312     char *dbname, *filename;
313     int i, rv;
314     dbname = NULL;
315     filename = NULL;
316     for(i = 2; i < argc; i++){
317     if(!dbname && argv[i][0] == '-'){
318     if(!strcmp(argv[i], "-cl")){
319     g_putopts |= ESTPDCLEAN;
320     } else {
321     usage();
322     }
323     } else if(!dbname){
324     dbname = argv[i];
325     } else if(!filename){
326     filename = argv[i];
327     } else {
328     usage();
329     }
330     }
331     if(!dbname) usage();
332     rv = procput(dbname, filename);
333     return rv;
334     }
335    
336    
337     /* parse arguments of the out command */
338     static int runout(int argc, char **argv){
339     char *dbname, *expr;
340     int i, id, rv;
341     dbname = NULL;
342     expr = NULL;
343     for(i = 2; i < argc; i++){
344     if(!dbname && argv[i][0] == '-'){
345     if(!strcmp(argv[i], "-cl")){
346     g_outopts |= ESTODCLEAN;
347     } else {
348     usage();
349     }
350     } else if(!dbname){
351     dbname = argv[i];
352     } else if(!expr){
353     expr = argv[i];
354     } else {
355     usage();
356     }
357     }
358     if(!dbname || !expr) usage();
359     if((id = atoi(expr)) > 0) expr = NULL;
360     rv = procout(dbname, id, expr);
361     return rv;
362     }
363    
364    
/* parse arguments of the get command: `get db expr [attr]'.
   No options are accepted.  `expr' is either a document ID or a URI.  It is
   taken as an ID only when the whole string is a positive decimal number that
   fits in an int; the former atoi-based test also accepted any URI that
   merely began with digits (e.g. "12abc" was silently treated as ID 12).
   The optional `attr' is passed through to procget (NULL when absent).
   Returns the result of procget. */
static int runget(int argc, char **argv){
  char *dbname, *expr, *attr, *end;
  long num;
  int i, id, rv;
  dbname = NULL;
  expr = NULL;
  attr = NULL;
  for(i = 2; i < argc; i++){
    if(!dbname && argv[i][0] == '-'){
      usage();
    } else if(!dbname){
      dbname = argv[i];
    } else if(!expr){
      expr = argv[i];
    } else if(!attr){
      attr = argv[i];
    } else {
      usage();
    }
  }
  if(!dbname || !expr) usage();
  id = 0;
  num = strtol(expr, &end, 10);
  /* `end != expr' rejects an empty number, `*end' rejects trailing garbage,
     and the round trip through int rejects values that do not fit */
  if(end != expr && *end == '\0' && num > 0 && (long)(int)num == num){
    id = (int)num;
    expr = NULL;
  }
  rv = procget(dbname, id, expr, attr);
  return rv;
}
390    
391    
/* parse arguments of the list command: `list db'.
   Exactly one operand is expected and no options are accepted.
   Returns the result of proclist. */
static int runlist(int argc, char **argv){
  char *dbname = NULL;
  int idx;
  for(idx = 2; idx < argc; idx++){
    /* a second operand, or anything starting with '-' before the name */
    if(dbname || argv[idx][0] == '-'){
      usage();
    } else {
      dbname = argv[idx];
    }
  }
  if(!dbname) usage();
  return proclist(dbname);
}
410    
411    
/* parse arguments of the uriid command: `uriid db uri'.
   Both operands are required and no options are accepted.
   Returns the result of procuriid. */
static int runuriid(int argc, char **argv){
  char *dbname = NULL;
  char *uri = NULL;
  int idx;
  for(idx = 2; idx < argc; idx++){
    char *arg = argv[idx];
    if(dbname){
      if(uri){
        usage();
      } else {
        uri = arg;
      }
    } else if(arg[0] == '-'){
      usage();
    } else {
      dbname = arg;
    }
  }
  if(!dbname || !uri) usage();
  return procuriid(dbname, uri);
}
433    
434    
/* parse arguments of the meta command: `meta db [name [value]]'.
   No options are accepted.  `name' and `value' are optional and are passed
   to procmeta as NULL when absent.  Returns the result of procmeta.
   (The local `del', which was set but never read, has been removed.) */
static int runmeta(int argc, char **argv){
  char *dbname, *mname, *mvalue;
  int i, rv;
  dbname = NULL;
  mname = NULL;
  mvalue = NULL;
  for(i = 2; i < argc; i++){
    if(!dbname && argv[i][0] == '-'){
      usage();
    } else if(!dbname){
      dbname = argv[i];
    } else if(!mname){
      mname = argv[i];
    } else if(!mvalue){
      mvalue = argv[i];
    } else {
      usage();
    }
  }
  if(!dbname) usage();
  rv = procmeta(dbname, mname, mvalue);
  return rv;
}
460    
461    
/* parse arguments of the inform command: `inform db'.
   Exactly one operand is expected and no options are accepted.
   Returns the result of procinform. */
static int runinform(int argc, char **argv){
  char *dbname = NULL;
  int idx;
  for(idx = 2; idx < argc; idx++){
    /* a second operand, or anything starting with '-' before the name */
    if(dbname || argv[idx][0] == '-'){
      usage();
    } else {
      dbname = argv[idx];
    }
  }
  if(!dbname) usage();
  return procinform(dbname);
}
480    
481    
482     /* parse arguments of the optimize command */
483     static int runoptimize(int argc, char **argv){
484     char *dbname;
485     int i, rv;
486     dbname = NULL;
487     for(i = 2; i < argc; i++){
488     if(!dbname && argv[i][0] == '-'){
489     if(!strcmp(argv[i], "-onp")){
490     g_optopts |= ESTOPTNOPURGE;
491     } else if(!strcmp(argv[i], "-ond")){
492     g_optopts |= ESTOPTNODBOPT;
493     } else {
494     usage();
495     }
496     } else if(!dbname){
497     dbname = argv[i];
498     } else {
499     usage();
500     }
501     }
502     if(!dbname) usage();
503     rv = procoptimize(dbname);
504     return rv;
505     }
506    
507    
508     /* parse arguments of the search command */
509     static int runsearch(int argc, char **argv){
510     CBDATUM *pbuf;
511     CBLIST *attrs;
512     char *dbname, *ord, *phrase, *tmp;
513     int i, max, sim, rv;
514     dbname = NULL;
515     ord = NULL;
516     max = SEARCHMAX;
517     sim = -1;
518     pbuf = cbdatumopen("", 0);
519     cbglobalgc(pbuf, (void (*)(void *))cbdatumclose);
520     attrs = cblistopen();
521     cbglobalgc(attrs, (void (*)(void *))cblistclose);
522     for(i = 2; i < argc; i++){
523     if(!dbname && argv[i][0] == '-'){
524     if(!strcmp(argv[i], "-ic")){
525     if(++i >= argc) usage();
526     g_inputcode = argv[i];
527     } else if(!strcmp(argv[i], "-gs")){
528     g_condopts |= ESTCONDSURE;
529     } else if(!strcmp(argv[i], "-gf")){
530     g_condopts |= ESTCONDFAST;
531     } else if(!strcmp(argv[i], "-ga")){
532     g_condopts |= ESTCONDAGIT;
533     } else if(!strcmp(argv[i], "-ni")){
534     g_condopts |= ESTCONDNOIDF;
535     } else if(!strcmp(argv[i], "-sf")){
536     g_condopts |= ESTCONDSIMPLE;
537     } else if(!strcmp(argv[i], "-hs")){
538     g_condopts |= ESTCONDSCFB;
539     } else if(!strcmp(argv[i], "-vu")){
540     g_viewmode = VM_URI;
541     } else if(!strcmp(argv[i], "-va")){
542     g_viewmode = VM_ATTR;
543     } else if(!strcmp(argv[i], "-vf")){
544     g_viewmode = VM_FULL;
545     } else if(!strcmp(argv[i], "-vs")){
546     g_viewmode = VM_SNIP;
547     } else if(!strcmp(argv[i], "-vh")){
548     g_viewmode = VM_HMRD;
549     } else if(!strcmp(argv[i], "-vx")){
550     g_viewmode = VM_XML;
551     } else if(!strcmp(argv[i], "-dd")){
552     g_viewmode = VM_DUMP;
553     } else if(!strcmp(argv[i], "-attr")){
554     if(++i >= argc) usage();
555     cblistpush(attrs, argv[i], -1);
556     } else if(!strcmp(argv[i], "-ord")){
557     if(++i >= argc) usage();
558     ord = argv[i];
559     } else if(!strcmp(argv[i], "-max")){
560     if(++i >= argc) usage();
561     max = atoi(argv[i]);
562     } else if(!strcmp(argv[i], "-sim")){
563     if(++i >= argc) usage();
564     sim = atoi(argv[i]);
565     } else {
566     usage();
567     }
568     } else if(!dbname){
569     dbname = argv[i];
570     } else {
571     if(cbdatumsize(pbuf) > 0) cbdatumcat(pbuf, " ", 1);
572     cbdatumcat(pbuf, argv[i], -1);
573     }
574     }
575     if(!dbname) usage();
576     if(!(phrase = est_iconv(cbdatumptr(pbuf), -1, g_inputcode, "UTF-8", NULL, NULL))){
577     printferror("%s: unsupported encoding\n", g_inputcode);
578     return 1;
579     }
580     cbstrtrim(phrase);
581     for(i = 0; i < cblistnum(attrs); i++){
582     if((tmp = est_iconv(cblistval(attrs, i, NULL), -1, g_inputcode, "UTF-8", NULL, NULL)) != NULL){
583     cblistover(attrs, i, tmp, -1);
584     free(tmp);
585     }
586     }
587     rv = procsearch(dbname, phrase, attrs, ord, max, sim);
588     free(phrase);
589     return rv;
590     }
591    
592    
593     /* parse arguments of the gather command */
594     static int rungather(int argc, char **argv){
595     CBLIST *list;
596     const char *elem;
597     char *dbname, *filename;
598     int i, j, rv;
599     g_xcmdmap = cbmapopenex(MINIBNUM);
600     cbglobalgc(g_xcmdmap, (void (*)(void *))cbmapclose);
601     dbname = NULL;
602     filename = NULL;
603     g_inputcode = NULL;
604     for(i = 2; i < argc; i++){
605     if(!dbname && argv[i][0] == '-'){
606     if(!strcmp(argv[i], "-cl")){
607     g_putopts |= ESTPDCLEAN;
608     } else if(!strcmp(argv[i], "-fe")){
609     g_filefmt = FF_DRAFT;
610     } else if(!strcmp(argv[i], "-ft")){
611     g_filefmt = FF_TEXT;
612     } else if(!strcmp(argv[i], "-fh")){
613     g_filefmt = FF_HTML;
614     } else if(!strcmp(argv[i], "-fm")){
615     g_filefmt = FF_MIME;
616     } else if(!strcmp(argv[i], "-fx")){
617     if((i += 2) >= argc) usage();
618     list = cbsplit(argv[i-1], -1, ",");
619     for(j = 0; j < cblistnum(list); j++){
620     elem = cblistval(list, j, NULL);
621     if(elem[0] != '\0') cbmapput(g_xcmdmap, elem, -1, argv[i], -1, FALSE);
622     }
623     cblistclose(list);
624     } else if(!strcmp(argv[i], "-fz")){
625     g_filefmt = FF_NONE;
626     } else if(!strcmp(argv[i], "-fo")){
627     g_filtorig = TRUE;
628     } else if(!strcmp(argv[i], "-ic")){
629     if(++i >= argc) usage();
630     g_inputcode = argv[i];
631     } else if(!strcmp(argv[i], "-il")){
632     if(++i >= argc) usage();
633     g_inputlang = strtolang(argv[i]);
634     } else if(!strcmp(argv[i], "-pc")){
635     if(++i >= argc) usage();
636     g_pathcode = argv[i];
637     } else if(!strcmp(argv[i], "-pf")){
638     g_pathfull = TRUE;
639     } else if(!strcmp(argv[i], "-apn")){
640     g_oextmodes |= ESTDBPERFNG;
641     } else if(!strcmp(argv[i], "-sd")){
642     g_stdate = TRUE;
643     } else if(!strcmp(argv[i], "-cm")){
644     g_chkmdate = TRUE;
645     } else if(!strcmp(argv[i], "-cs")){
646     if(++i >= argc) usage();
647     g_cachesize = strtod(argv[i], NULL) * 1024 * 1024;
648     } else {
649     usage();
650     }
651     } else if(!dbname){
652     dbname = argv[i];
653     } else if(!filename){
654     filename = argv[i];
655     } else {
656     usage();
657     }
658     }
659     if(!dbname || !filename) usage();
660     rv = procgather(dbname, filename);
661     return rv;
662     }
663    
664    
665     /* parse arguments of the purge command */
666     static int runpurge(int argc, char **argv){
667     char *dbname, *prefix;
668     int i, rv;
669     dbname = NULL;
670     prefix = NULL;
671     for(i = 2; i < argc; i++){
672     if(!dbname && argv[i][0] == '-'){
673     if(!strcmp(argv[i], "-cl")){
674     g_outopts |= ESTODCLEAN;
675     } else if(!strcmp(argv[i], "-fc")){
676     g_doforce = TRUE;
677     } else {
678     usage();
679     }
680     } else if(!dbname){
681     dbname = argv[i];
682     } else if(!prefix){
683     prefix = argv[i];
684     } else {
685     usage();
686     }
687     }
688     if(!dbname) usage();
689     rv = procpurge(dbname, prefix);
690     return rv;
691     }
692    
693    
694     /* parse arguments of the extkeys command */
695     static int runextkeys(int argc, char **argv){
696     char *dbname, *prefix;
697     int i, ni, rv;
698     dbname = NULL;
699     prefix = NULL;
700     ni = FALSE;
701     for(i = 2; i < argc; i++){
702     if(!dbname && argv[i][0] == '-'){
703     if(!strcmp(argv[i], "-fc")){
704     g_doforce = TRUE;
705     } else if(!strcmp(argv[i], "-ni")){
706     ni = TRUE;
707     } else if(!strcmp(argv[i], "-kn")){
708     if(++i >= argc) usage();
709     g_kwordnum = atoi(argv[i]);
710     } else {
711     usage();
712     }
713     } else if(!dbname){
714     dbname = argv[i];
715     } else if(!prefix){
716     prefix = argv[i];
717     } else {
718     usage();
719     }
720     }
721     if(!dbname || g_kwordnum < 1) usage();
722     rv = procextkeys(dbname, prefix, ni);
723     return rv;
724     }
725    
726    
727     /* parse arguments of the draft command */
728     static int rundraft(int argc, char **argv){
729     char *filename;
730     int i, rv;
731     filename = NULL;
732     g_filefmt = FF_DRAFT;
733     g_inputcode = NULL;
734     for(i = 2; i < argc; i++){
735     if(!filename && argv[i][0] == '-'){
736     if(!strcmp(argv[i], "-ft")){
737     g_filefmt = FF_TEXT;
738     } else if(!strcmp(argv[i], "-fh")){
739     g_filefmt = FF_HTML;
740     } else if(!strcmp(argv[i], "-fm")){
741     g_filefmt = FF_MIME;
742     } else if(!strcmp(argv[i], "-ic")){
743     if(++i >= argc) usage();
744     g_inputcode = argv[i];
745     } else if(!strcmp(argv[i], "-il")){
746     if(++i >= argc) usage();
747     g_inputlang = strtolang(argv[i]);
748     } else {
749     usage();
750     }
751     } else if(!filename){
752     filename = argv[i];
753     } else {
754     usage();
755     }
756     }
757     rv = procdraft(filename);
758     return rv;
759     }
760    
761    
762     /* parse arguments of the break command */
763     static int runbreak(int argc, char **argv){
764     char *filename;
765     int i, wt, rv;
766     filename = NULL;
767     wt = FALSE;
768     for(i = 2; i < argc; i++){
769     if(!filename && argv[i][0] == '-'){
770     if(!strcmp(argv[i], "-ic")){
771     if(++i >= argc) usage();
772     g_inputcode = argv[i];
773     } else if(!strcmp(argv[i], "-il")){
774     if(++i >= argc) usage();
775     g_inputlang = strtolang(argv[i]);
776     } else if(!strcmp(argv[i], "-apn")){
777     g_oextmodes |= ESTDBPERFNG;
778     } else if(!strcmp(argv[i], "-wt")){
779     wt = TRUE;
780     } else {
781     usage();
782     }
783     } else if(!filename){
784     filename = argv[i];
785     } else {
786     usage();
787     }
788     }
789     rv = procbreak(filename, wt);
790     return rv;
791     }
792    
793    
794     /* parse arguments of the randput command */
795     static int runrandput(int argc, char **argv){
796     char *dbname, *dnstr;
797     int i, dnum, rv;
798     dbname = NULL;
799     dnstr = NULL;
800     for(i = 2; i < argc; i++){
801     if(!dbname && argv[i][0] == '-'){
802     if(!strcmp(argv[i], "-ren")){
803     g_rdmode = RD_ENG;
804     } else if(!strcmp(argv[i], "-rla")){
805     g_rdmode = RD_LAT;
806     } else if(!strcmp(argv[i], "-reu")){
807     g_rdmode = RD_EURO;
808     } else if(!strcmp(argv[i], "-ror")){
809     g_rdmode = RD_ORI;
810     } else if(!strcmp(argv[i], "-rjp")){
811     g_rdmode = RD_JPN;
812     } else if(!strcmp(argv[i], "-rch")){
813     g_rdmode = RD_CHAO;
814     } else if(!strcmp(argv[i], "-cs")){
815     if(++i >= argc) usage();
816     g_cachesize = strtod(argv[i], NULL) * 1024 * 1024;
817     } else {
818     usage();
819     }
820     } else if(!dbname){
821     dbname = argv[i];
822     } else if(!dnstr){
823     dnstr = argv[i];
824     } else {
825     usage();
826     }
827     }
828     if(!dbname || !dnstr) usage();
829     if((dnum = atoi(dnstr)) < 1) usage();
830     rv = procrandput(dbname, dnum);
831     return rv;
832     }
833    
834    
/* parse arguments of the wicked command: `wicked db dnum'.
   No options are accepted; `dnum' must convert (via atoi) to at least 1.
   Returns the result of procwicked. */
static int runwicked(int argc, char **argv){
  char *dbname = NULL;
  char *dnstr = NULL;
  int idx, dnum;
  for(idx = 2; idx < argc; idx++){
    char *arg = argv[idx];
    if(dbname){
      if(dnstr){
        usage();
      } else {
        dnstr = arg;
      }
    } else if(arg[0] == '-'){
      usage();
    } else {
      dbname = arg;
    }
  }
  if(!dbname || !dnstr) usage();
  dnum = atoi(dnstr);
  if(dnum < 1) usage();
  return procwicked(dbname, dnum);
}
857    
858    
/* parse arguments of the regression command: `regression db'.
   Exactly one operand is expected and no options are accepted.
   Returns the result of procregression. */
static int runregression(int argc, char **argv){
  char *dbname = NULL;
  int idx;
  for(idx = 2; idx < argc; idx++){
    /* a second operand, or anything starting with '-' before the name */
    if(dbname || argv[idx][0] == '-'){
      usage();
    } else {
      dbname = argv[idx];
    }
  }
  if(!dbname) usage();
  return procregression(dbname);
}
877    
878    
879     /* perform the put command */
880     static int procput(const char *dbname, const char *filename){
881     ESTDB *db;
882     ESTDOC *doc;
883     const char *uri;
884     char *draft;
885     int ecode;
886     if(!(draft = cbreadfile(filename, NULL))){
887     printferror("%s: could not open", filename ? filename : "(stdin)");
888     return 1;
889     }
890     if(!(db = est_db_open(dbname, ESTDBWRITER | ESTDBCREAT, &ecode))){
891     printferror("%s: %s", dbname, est_err_msg(ecode));
892     free(draft);
893     return 1;
894     }
895     est_db_set_informer(db, dbinform);
896     doc = est_doc_new_from_draft(draft);
897     if(!est_db_put_doc(db, doc, g_putopts)){
898     printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
899     est_doc_delete(doc);
900     est_db_close(db, &ecode);
901     free(draft);
902     return 1;
903     }
904     if(!(uri = est_doc_attr(doc, ESTDATTRURI))) uri = "";
905     printfinfo("%d (%s): registered", est_doc_id(doc), uri);
906     est_doc_delete(doc);
907     if(!est_db_close(db, &ecode)){
908     printferror("%s: %s", dbname, est_err_msg(ecode));
909     free(draft);
910     return 1;
911     }
912     free(draft);
913     return 0;
914     }
915    
916    
917     /* perform the out command */
918     static int procout(const char *dbname, int id, const char *expr){
919     ESTDB *db;
920     int ecode;
921     if(!(db = est_db_open(dbname, ESTDBWRITER, &ecode))){
922     printferror("%s: %s", dbname, est_err_msg(ecode));
923     return 1;
924     }
925     est_db_set_informer(db, dbinform);
926     if(expr && (id = est_db_uri_to_id(db, expr)) < 1){
927     printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
928     est_db_close(db, &ecode);
929     return 1;
930     }
931     if(!est_db_out_doc(db, id, g_outopts)){
932     printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
933     est_db_close(db, &ecode);
934     return 1;
935     }
936     printfinfo("%d: deleted", id);
937     if(!est_db_close(db, &ecode)){
938     printferror("%s: %s", dbname, est_err_msg(ecode));
939     return 1;
940     }
941     return 0;
942     }
943    
944    
945     /* perform the get command */
946     static int procget(const char *dbname, int id, const char *expr, const char *attr){
947     ESTDB *db;
948     ESTDOC *doc;
949     char *draft;
950     int ecode;
951     if(!(db = est_db_open(dbname, ESTDBREADER | ESTDBLCKNB, &ecode))){
952     printferror("%s: %s", dbname, est_err_msg(ecode));
953     return 1;
954     }
955     if(expr && (id = est_db_uri_to_id(db, expr)) < 1){
956     printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
957     est_db_close(db, &ecode);
958     return 1;
959     }
960     if(attr){
961     if(!(draft = est_db_get_doc_attr(db, id, attr))){
962     printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
963     est_db_close(db, &ecode);
964     return 1;
965     }
966     printf("%s\n", draft);
967     free(draft);
968     } else {
969     if(!(doc = est_db_get_doc(db, id, 0))){
970     printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
971     est_db_close(db, &ecode);
972     return 1;
973     }
974     draft = est_doc_dump_draft(doc);
975     printf("%s", draft);
976     free(draft);
977     est_doc_delete(doc);
978     }
979     if(!est_db_close(db, &ecode)){
980     printferror("%s: %s", dbname, est_err_msg(ecode));
981     return 1;
982     }
983     return 0;
984     }
985    
986    
987     /* perform the list command */
988     static int proclist(const char *dbname){
989     ESTDB *db;
990     ESTDOC *doc;
991     const char *vbuf;
992     int ecode, id;
993     if(!(db = est_db_open(dbname, ESTDBREADER | ESTDBLCKNB, &ecode))){
994     printferror("%s: %s", dbname, est_err_msg(ecode));
995     return 1;
996     }
997     if(!est_db_iter_init(db)){
998     printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
999     est_db_close(db, &ecode);
1000     return 1;
1001     }
1002     while((id = est_db_iter_next(db)) > 0){
1003     if((doc = est_db_get_doc(db, id, ESTGDNOTEXT)) != NULL){
1004     if(!(vbuf = est_doc_attr(doc, ESTDATTRURI))) vbuf = "";
1005     printf("%d\t%s\n", id, vbuf);
1006     est_doc_delete(doc);
1007     }
1008     }
1009     if(!est_db_close(db, &ecode)){
1010     printferror("%s: %s", dbname, est_err_msg(ecode));
1011     return 1;
1012     }
1013     return 0;
1014     }
1015    
1016    
1017     /* perform the uriid command */
1018     static int procuriid(const char *dbname, const char *uri){
1019     ESTDB *db;
1020     int ecode, id;
1021     if(!(db = est_db_open(dbname, ESTDBREADER | ESTDBLCKNB, &ecode))){
1022     printferror("%s: %s", dbname, est_err_msg(ecode));
1023     return 1;
1024     }
1025     if((id = est_db_uri_to_id(db, uri)) == -1){
1026     printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
1027     est_db_close(db, &ecode);
1028     return 1;
1029     }
1030     printf("%d\n", id);
1031     if(!est_db_close(db, &ecode)){
1032     printferror("%s: %s", dbname, est_err_msg(ecode));
1033     return 1;
1034     }
1035     return 0;
1036     }
1037    
1038    
1039     /* perform the meta command */
1040     static int procmeta(const char *dbname, const char *mname, const char *mvalue){
1041     ESTDB *db;
1042     CBLIST *names;
1043     char *vbuf;
1044     int i, ecode;
1045     if(!(db = est_db_open(dbname, mvalue ? (ESTDBWRITER | ESTDBCREAT) : (ESTDBREADER | ESTDBLCKNB),
1046     &ecode))){
1047     printferror("%s: %s", dbname, est_err_msg(ecode));
1048     return 1;
1049     }
1050     if(mname){
1051     if(mvalue){
1052     est_db_add_meta(db, mname, mvalue[0] != '\0' ? mvalue : NULL);
1053     } else {
1054     if((vbuf = est_db_meta(db, mname)) != NULL){
1055     printf("%s\n", vbuf);
1056     free(vbuf);
1057     }
1058     }
1059     } else {
1060     names = est_db_meta_names(db);
1061     for(i = 0; i < cblistnum(names); i++){
1062     printf("%s\n", cblistval(names, i, NULL));
1063     }
1064     cblistclose(names);
1065     }
1066     if(!est_db_close(db, &ecode)){
1067     printferror("%s: %s", dbname, est_err_msg(ecode));
1068     return 1;
1069     }
1070     return 0;
1071     }
1072    
1073    
1074     /* perform the inform command */
1075     static int procinform(const char *dbname){
1076     ESTDB *db;
1077     int ecode;
1078     if(!(db = est_db_open(dbname, ESTDBREADER | ESTDBLCKNB, &ecode))){
1079     printferror("%s: %s", dbname, est_err_msg(ecode));
1080     return 1;
1081     }
1082     printf("number of documents: %d\n", est_db_doc_num(db));
1083     printf("number of words: %d\n", est_db_word_num(db));
1084     printf("file size: %.0f\n", est_db_size(db));
1085     if(!est_db_close(db, &ecode)){
1086     printferror("%s: %s", dbname, est_err_msg(ecode));
1087     return 1;
1088     }
1089     return 0;
1090     }
1091    
1092    
1093     /* perform the optimize command */
1094     static int procoptimize(const char *dbname){
1095     ESTDB *db;
1096     char path[URIBUFSIZ];
1097     int ecode;
1098     time_t curtime;
1099     curtime = time(NULL);
1100     if(!(db = est_db_open(dbname, ESTDBWRITER, &ecode))){
1101     printferror("%s: %s", dbname, est_err_msg(ecode));
1102     return 1;
1103     }
1104     est_db_set_informer(db, dbinform);
1105     sprintf(path, "%s%c%s", dbname, ESTPATHCHR, KWDBNAME);
1106     unlink(path);
1107     if(!est_db_optimize(db, g_optopts)){
1108     printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
1109     est_db_close(db, &ecode);
1110     return 1;
1111     }
1112     if(!est_db_close(db, &ecode)){
1113     printferror("%s: %s", dbname, est_err_msg(ecode));
1114     return 1;
1115     }
1116     curtime = time(NULL) - curtime;
1117     printfinfo("finished successfully: elapsed time: %dh %dm %ds",
1118     (int)(curtime / 3600), (int)((curtime / 60) % 60), (int)(curtime % 60));
1119     return 0;
1120     }
1121    
1122    
1123     /* perform the search command */
1124     static int procsearch(const char *dbname, const char *phrase,
1125     const CBLIST *attrs, const char *ord, int max, int sim){
1126     ESTDB *db;
1127     ESTCOND *cond;
1128     ESTDOC *doc;
1129     CURIA *kwdb;
1130     CBDATUM *pbuf;
1131     CBMAP *svmap, *hints, *kwords;
1132     CBLIST *names, *words, *lines;
1133     const char *kbuf, *vbuf, *line;
1134     char *draft, path[URIBUFSIZ], numbuf[NUMBUFSIZ], *word, *pv;
1135     int i, j, ecode, ksiz, vsiz, *res, rnum, id, sc, fin, cnt;
1136     double curtime;
1137     if(!(db = est_db_open(dbname, ESTDBREADER | ESTDBLCKNB, &ecode))){
1138     printferror("%s: %s", dbname, est_err_msg(ecode));
1139     return 1;
1140     }
1141     sprintf(path, "%s%c%s", dbname, ESTPATHCHR, KWDBNAME);
1142     if((kwdb = cropen(path, CR_OREADER, -1, -1)) != NULL)
1143     est_db_set_vectorizer(db, vectorizer, kwdb);
1144     cond = est_cond_new();
1145     if(sim > 0){
1146     svmap = kwdb ? vectorizer(db, sim, kwdb) : NULL;
1147     if(!svmap && (doc = est_db_get_doc(db, sim, 0)) != NULL){
1148     svmap = est_db_etch_doc((g_condopts & ESTCONDNOIDF) ? NULL : db, doc, KWORDNUM);
1149     est_doc_delete(doc);
1150     }
1151     if(svmap){
1152     pbuf = cbdatumopen(ESTOPSIMILAR, -1);
1153     cbmapiterinit(svmap);
1154     while((kbuf = cbmapiternext(svmap, &ksiz)) != NULL){
1155     vbuf = cbmapget(svmap, kbuf, ksiz, &vsiz);
1156     cbdatumcat(pbuf, " WITH ", -1);
1157     cbdatumcat(pbuf, vbuf, vsiz);
1158     cbdatumcat(pbuf, " ", 1);
1159     cbdatumcat(pbuf, kbuf, ksiz);
1160     }
1161     est_cond_set_phrase(cond, cbdatumptr(pbuf));
1162     cbdatumclose(pbuf);
1163     cbmapclose(svmap);
1164     }
1165     } else {
1166     while(*phrase > '\0' && *phrase <= ' '){
1167     phrase++;
1168     }
1169     if(phrase[0] != '\0' || cblistnum(attrs) < 1) est_cond_set_phrase(cond, phrase);
1170     }
1171     for(i = 0; i < cblistnum(attrs); i++){
1172     est_cond_add_attr(cond, cblistval(attrs, i, NULL));
1173     }
1174     if(ord) est_cond_set_order(cond, ord);
1175     if(max >= 0) est_cond_set_max(cond, max);
1176     est_cond_set_options(cond, g_condopts);
1177     hints = cbmapopenex(MINIBNUM);
1178     curtime = est_gettimeofday();
1179     res = est_db_search(db, cond, &rnum, hints);
1180     curtime = est_gettimeofday() - curtime;
1181     if(g_viewmode == VM_XML){
1182     xmlprintf("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
1183     xmlprintf("<estresult version=\"%@\">\n", est_version);
1184     xmlprintf("<meta>\n");
1185     xmlprintf("<hit number=\"%@\"/>\n", cbmapget(hints, "", 0, NULL));
1186     cbmapiterinit(hints);
1187     while((kbuf = cbmapiternext(hints, NULL)) != NULL){
1188     if(kbuf[0] == '\0') continue;
1189     vbuf = cbmapget(hints, kbuf, -1, NULL);
1190     xmlprintf("<hit key=\"%@\" number=\"%@\"/>\n", kbuf, vbuf);
1191     }
1192     xmlprintf("<time time=\"%.3f\"/>\n", curtime / 1000.0);
1193     xmlprintf("<total documents=\"%d\" words=\"%d\"/>\n",
1194     est_db_doc_num(db), est_db_word_num(db));
1195     xmlprintf("</meta>\n");
1196     } else {
1197     printf("%s\n", est_border_str());
1198     printf("VERSION\t%s\n", _EST_PROTVER);
1199     printf("NODE\tlocal\n");
1200     printf("HIT\t%s\n", cbmapget(hints, "", 0, NULL));
1201     cbmapiterinit(hints);
1202     cnt = 1;
1203     while((kbuf = cbmapiternext(hints, NULL)) != NULL){
1204     if(kbuf[0] == '\0') continue;
1205     vbuf = cbmapget(hints, kbuf, -1, NULL);
1206     printf("HINT#%d\t%s\t%s\n", cnt, kbuf, vbuf);
1207     cnt++;
1208     }
1209     printf("TIME\t%.3f\n", curtime / 1000.0);
1210     printf("DOCNUM\t%d\n", est_db_doc_num(db));
1211     printf("WORDNUM\t%d\n", est_db_word_num(db));
1212     switch(g_viewmode){
1213     case VM_ID:
1214     printf("VIEW\tID\n");
1215     break;
1216     case VM_URI:
1217     printf("VIEW\tURI\n");
1218     break;
1219     case VM_ATTR:
1220     printf("VIEW\tATTRIBUTE\n");
1221     break;
1222     case VM_FULL:
1223     printf("VIEW\tFULL\n");
1224     break;
1225     case VM_SNIP:
1226     printf("VIEW\tSNIPPET\n");
1227     break;
1228     case VM_HMRD:
1229     printf("VIEW\tHUMAN\n");
1230     break;
1231     }
1232     printf("\n");
1233     if(g_viewmode == VM_ID || g_viewmode == VM_URI ||
1234     g_viewmode == VM_HMRD || g_viewmode == VM_DUMP) printf("%s\n", est_border_str());
1235     }
1236     for(i = 0; i < rnum ; i++){
1237     id = res[i];
1238     sc = est_cond_score(cond, i);
1239     switch(g_viewmode){
1240     case VM_URI:
1241     if((doc = est_db_get_doc(db, id, ESTGDNOTEXT)) != NULL){
1242     if(!(vbuf = est_doc_attr(doc, ESTDATTRURI))) vbuf = "";
1243     printf("%d\t%s\n", id, vbuf);
1244     est_doc_delete(doc);
1245     }
1246     break;
1247     case VM_ATTR:
1248     if((doc = est_db_get_doc(db, id, ESTGDNOTEXT)) != NULL){
1249     if(sc >= 0){
1250     sprintf(numbuf, "%d", sc);
1251     est_doc_add_attr(doc, DATTRSCORE, numbuf);
1252     }
1253     printf("%s\n", est_border_str());
1254     names = est_doc_attr_names(doc);
1255     for(j = 0; j < cblistnum(names); j++){
1256     kbuf = cblistval(names, j, NULL);
1257     vbuf = est_doc_attr(doc, kbuf);
1258     printf("%s=%s\n", kbuf, vbuf);
1259     }
1260     cblistclose(names);
1261     est_doc_delete(doc);
1262     }
1263     printf("\n");
1264     break;
1265     case VM_FULL:
1266     if((doc = est_db_get_doc(db, id, 0)) != NULL){
1267     if(sc >= 0){
1268     sprintf(numbuf, "%d", sc);
1269     est_doc_add_attr(doc, DATTRSCORE, numbuf);
1270     }
1271     printf("%s\n", est_border_str());
1272     draft = est_doc_dump_draft(doc);
1273     printf("%s", draft);
1274     free(draft);
1275     est_doc_delete(doc);
1276     }
1277     break;
1278     case VM_SNIP:
1279     if((doc = est_db_get_doc(db, id, 0)) != NULL){
1280     if(sc >= 0){
1281     sprintf(numbuf, "%d", sc);
1282     est_doc_add_attr(doc, DATTRSCORE, numbuf);
1283     }
1284     printf("%s\n", est_border_str());
1285     names = est_doc_attr_names(doc);
1286     for(j = 0; j < cblistnum(names); j++){
1287     kbuf = cblistval(names, j, NULL);
1288     vbuf = est_doc_attr(doc, kbuf);
1289     printf("%s=%s\n", kbuf, vbuf);
1290     }
1291     cblistclose(names);
1292     kwords = kwdb ? vectorizer(db, id, kwdb) : NULL;
1293     if(!kwords) kwords = est_db_etch_doc(db, doc, KWORDNUM);
1294     if(cbmaprnum(kwords) > 0){
1295     printf("%s=", DATTRKWORDS);
1296     cbmapiterinit(kwords);
1297     for(j = 0; (kbuf = cbmapiternext(kwords, NULL)) != NULL; j++){
1298     if(j > 0) printf(" ");
1299     printf("%s %s", kbuf, cbmapget(kwords, kbuf, -1, NULL));
1300     }
1301     printf("\n");
1302     }
1303     cbmapclose(kwords);
1304     printf("\n");
1305     words = cbmapkeys(hints);
1306     draft = est_doc_make_snippet(doc, words, SNIPWWIDTH, SNIPHWIDTH, SNIPAWIDTH);
1307     printf("%s", draft);
1308     free(draft);
1309     cblistclose(words);
1310     est_doc_delete(doc);
1311     }
1312     break;
1313     case VM_HMRD:
1314     if((doc = est_db_get_doc(db, id, 0)) != NULL){
1315     if(sc >= 0){
1316     sprintf(numbuf, "%d", sc);
1317     est_doc_add_attr(doc, DATTRSCORE, numbuf);
1318     }
1319     printf("\n");
1320     if((vbuf = est_doc_attr(doc, ESTDATTRURI)) != NULL) printf("URI: %s\n", vbuf);
1321     if((vbuf = est_doc_attr(doc, ESTDATTRTITLE)) != NULL) printf("Title: %s\n", vbuf);
1322     printf(" ");
1323     words = cbmapkeys(hints);
1324     draft = est_doc_make_snippet(doc, words, SNIPWWIDTH, SNIPHWIDTH, SNIPAWIDTH);
1325     lines = cbsplit(draft, -1, "\n");
1326     fin = TRUE;
1327     for(j = 0; j < cblistnum(lines); j++){
1328     line = cblistval(lines, j, NULL);
1329     if(line[0] != '\0'){
1330     word = cbmemdup(line, -1);
1331     if((pv = strchr(word, '\t')) != NULL) *pv = '\0';
1332     printf("%s", word);
1333     free(word);
1334     fin = TRUE;
1335     } else if(fin){
1336     printf(" ... ");
1337     fin = FALSE;
1338     }
1339     }
1340     cblistclose(lines);
1341     free(draft);
1342     cblistclose(words);
1343     printf("\n\n");
1344     est_doc_delete(doc);
1345     }
1346     break;
1347     case VM_XML:
1348     if((doc = est_db_get_doc(db, id, 0)) != NULL){
1349     if(sc >= 0){
1350     sprintf(numbuf, "%d", sc);
1351     est_doc_add_attr(doc, DATTRSCORE, numbuf);
1352     }
1353     if(!(vbuf = est_doc_attr(doc, ESTDATTRURI))) vbuf = "";
1354     xmlprintf("<document id=\"%d\" uri=\"%@\">\n", id, vbuf);
1355     names = est_doc_attr_names(doc);
1356     for(j = 0; j < cblistnum(names); j++){
1357     kbuf = cblistval(names, j, NULL);
1358     if(!strcmp(kbuf, ESTDATTRID) || !strcmp(kbuf, ESTDATTRURI)) continue;
1359     vbuf = est_doc_attr(doc, kbuf);
1360     xmlprintf("<attribute name=\"%@\" value=\"%@\"/>\n", kbuf, vbuf);
1361     }
1362     cblistclose(names);
1363     kwords = kwdb ? vectorizer(db, id, kwdb) : NULL;
1364     if(!kwords) kwords = est_db_etch_doc(db, doc, KWORDNUM);
1365     if(cbmaprnum(kwords) > 0){
1366     xmlprintf("<vector>");
1367     cbmapiterinit(kwords);
1368     for(j = 0; (kbuf = cbmapiternext(kwords, NULL)) != NULL; j++){
1369     xmlprintf("<element key=\"%@\" number=\"%@\"/>",
1370     kbuf, cbmapget(kwords, kbuf, -1, NULL));
1371     }
1372     xmlprintf("</vector>\n");
1373     }
1374     cbmapclose(kwords);
1375     words = cbmapkeys(hints);
1376     draft = est_doc_make_snippet(doc, words, SNIPWWIDTH, SNIPHWIDTH, SNIPAWIDTH);
1377     lines = cbsplit(draft, -1, "\n");
1378     fin = TRUE;
1379     xmlprintf("<snippet>");
1380     for(j = 0; j < cblistnum(lines); j++){
1381     line = cblistval(lines, j, NULL);
1382     if(line[0] != '\0'){
1383     word = cbmemdup(line, -1);
1384     if((pv = strchr(word, '\t')) != NULL){
1385     *pv = '\0';
1386     pv++;
1387     xmlprintf("<key normal=\"%@\">%@</key>", pv, word);
1388     } else {
1389     xmlprintf("%@", word);
1390     }
1391     free(word);
1392     fin = TRUE;
1393     } else if(fin){
1394     xmlprintf("<delimiter/>");
1395     fin = FALSE;
1396     }
1397     }
1398     xmlprintf("</snippet>\n");
1399     cblistclose(lines);
1400     free(draft);
1401     cblistclose(words);
1402     xmlprintf("</document>\n");
1403     est_doc_delete(doc);
1404     }
1405     break;
1406     case VM_DUMP:
1407     if((doc = est_db_get_doc(db, id, 0)) != NULL){
1408     if(sc >= 0){
1409     sprintf(numbuf, "%d", sc);
1410     est_doc_add_attr(doc, DATTRSCORE, numbuf);
1411     }
1412     if(!(vbuf = est_doc_attr(doc, ESTDATTRURI))) vbuf = "";
1413     sprintf(path, "%08d%cest", id, ESTEXTCHR);
1414     printf("%s\t%s\n", path, vbuf);
1415     draft = est_doc_dump_draft(doc);
1416     if(!(cbwritefile(path, draft, -1))) printferror("%s: could not open", path);
1417     free(draft);
1418     est_doc_delete(doc);
1419     }
1420     break;
1421     default:
1422     printf("%d\n", id);
1423     break;
1424     }
1425     }
1426     if(g_viewmode == VM_XML){
1427     xmlprintf("</estresult>\n");
1428     } else {
1429     printf("%s:END\n", est_border_str());
1430     }
1431     free(res);
1432     cbmapclose(hints);
1433     est_cond_delete(cond);
1434     if(kwdb) crclose(kwdb);
1435     if(!est_db_close(db, &ecode)){
1436     printferror("%s: %s", dbname, est_err_msg(ecode));
1437     return 1;
1438     }
1439     return 0;
1440     }
1441    
1442    
1443     /* perform the gather command */
1444     static int procgather(const char *dbname, const char *filename){
1445     ESTDB *db;
1446     CBLIST *list, *clist;
1447     FILE *ifp;
1448     const char *tmp;
1449     char *line, *path;
1450     int i, err, ecode;
1451     time_t curtime;
1452     struct stat sbuf;
1453     curtime = time(NULL);
1454     err = FALSE;
1455     if(stat(filename, &sbuf) != -1 && S_ISDIR(sbuf.st_mode)){
1456     printfinfo("reading list from the directory: %s", filename);
1457     if((db = est_db_open(dbname, ESTDBWRITER | ESTDBCREAT | g_oextmodes, &ecode)) != NULL){
1458     est_db_set_informer(db, dbinform);
1459     if(g_cachesize > 0){
1460     if(g_cachesize > CACHEMAX) g_cachesize = CACHEMAX;
1461     est_db_set_cache_size(db, g_cachesize, -1, -1);
1462     }
1463     list = cblistopen();
1464     cblistunshift(list, filename, -1);
1465     while((line = cblistshift(list, NULL)) != NULL){
1466     if(stat(line, &sbuf) != -1 && S_ISDIR(sbuf.st_mode) && (clist = cbdirlist(line)) != NULL){
1467     cblistsort(clist);
1468     for(i = cblistnum(clist) - 1; i >= 0; i--){
1469     tmp = cblistval(clist, i, NULL);
1470     if(!strcmp(tmp, ESTCDIRSTR) || !strcmp(tmp, ESTPDIRSTR)) continue;
1471     path = cbsprintf("%s%c%s", line, ESTPATHCHR, tmp);
1472     cblistunshift(list, path, -1);
1473     free(path);
1474     }
1475     cblistclose(clist);
1476     } else {
1477     if(!doputdoc(db, line)){
1478     printferror("%s: %s", line, est_err_msg(est_db_error(db)));
1479     err = TRUE;
1480     }
1481     }
1482     free(line);
1483     if(err || g_sigterm) break;
1484     }
1485     cblistclose(list);
1486     if(!est_db_close(db, &ecode)){
1487     printferror("%s: %s", dbname, est_err_msg(ecode));
1488     err = TRUE;
1489     }
1490     } else {
1491     printferror("%s: %s", dbname, est_err_msg(ecode));
1492     err = TRUE;
1493     }
1494     } else {
1495     if(!strcmp(filename, "-")){
1496     ifp = stdin;
1497     printfinfo("reading list from the standard input", filename);
1498     } else if((ifp = fopen(filename, "rb")) != NULL){
1499     printfinfo("reading list from the file: %s", filename);
1500     } else {
1501     printferror("%s: could not open", filename);
1502     return 1;
1503     }
1504     if((db = est_db_open(dbname, ESTDBWRITER | ESTDBCREAT | g_oextmodes, &ecode)) != NULL){
1505     est_db_set_informer(db, dbinform);
1506     if(g_cachesize > 0){
1507     if(g_cachesize > CACHEMAX) g_cachesize = CACHEMAX;
1508     est_db_set_cache_size(db, g_cachesize, -1, -1);
1509     }
1510     while((line = fgetl(ifp)) != NULL){
1511     if(!doputdoc(db, line)){
1512     printferror("%s: %s", line, est_err_msg(est_db_error(db)));
1513     err = TRUE;
1514     }
1515     free(line);
1516     if(err || g_sigterm) break;
1517     }
1518     if(!est_db_close(db, &ecode)){
1519     printferror("%s: %s", dbname, est_err_msg(ecode));
1520     err = TRUE;
1521     }
1522     } else {
1523     printferror("%s: %s", dbname, est_err_msg(ecode));
1524     err = TRUE;
1525     }
1526     if(ifp != stdin) fclose(ifp);
1527     }
1528     curtime = time(NULL) - curtime;
1529     if(!err) printfinfo("finished successfully: elapsed time: %dh %dm %ds",
1530     (int)(curtime / 3600), (int)((curtime / 60) % 60), (int)(curtime % 60));
1531     return err ? 1 : 0;
1532     }
1533    
1534    
1535     /* perform the purge command */
1536     static int procpurge(const char *dbname, const char *prefix){
1537     ESTDB *db;
1538     ESTCOND *cond;
1539     ESTDOC *doc;
1540     const char *luri;
1541     char *attr, *path;
1542     int i, ecode, err, *res, rnum;
1543     time_t curtime;
1544     struct stat sbuf;
1545     curtime = time(NULL);
1546     if(!(db = est_db_open(dbname, ESTDBWRITER, &ecode))){
1547     printferror("%s: %s", dbname, est_err_msg(ecode));
1548     return 1;
1549     }
1550     est_db_set_informer(db, dbinform);
1551     cond = est_cond_new();
1552     attr = cbsprintf("%s STRBW %s", DATTRLPATH, prefix ? prefix : "");
1553     est_cond_add_attr(cond, attr);
1554     res = est_db_search(db, cond, &rnum, NULL);
1555     err = FALSE;
1556     for(i = 0; i < rnum; i++){
1557     if(!(doc = est_db_get_doc(db, res[i], ESTGDNOTEXT))) continue;
1558     if((luri = est_doc_attr(doc, DATTRLPATH)) != NULL){
1559     if(g_doforce){
1560     if(est_db_out_doc(db, res[i], g_outopts)){
1561     printfinfo("%d (%s): deleted", res[i], luri);
1562     } else {
1563     printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
1564     err = TRUE;
1565     }
1566     } else if((path = urltopath(luri)) != NULL){
1567     if(stat(path, &sbuf) != -1){
1568     printfinfo("%s: passed", luri);
1569     } else {
1570     if(est_db_out_doc(db, res[i], g_outopts)){
1571     printfinfo("%d (%s): deleted", res[i], luri);
1572     } else {
1573     printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
1574     err = TRUE;
1575     }
1576     }
1577     } else {
1578     printfinfo("%s: ignored", luri);
1579     }
1580     } else {
1581     printfinfo("(%d): ignored", res[i]);
1582     }
1583     est_doc_delete(doc);
1584     if(err || g_sigterm) break;
1585     }
1586     free(res);
1587     est_cond_delete(cond);
1588     free(attr);
1589     if(!est_db_close(db, &ecode)){
1590     printferror("%s: %s", dbname, est_err_msg(ecode));
1591     return 1;
1592     }
1593     curtime = time(NULL) - curtime;
1594     if(!err) printfinfo("finished successfully: elapsed time: %dh %dm %ds",
1595     (int)(curtime / 3600), (int)((curtime / 60) % 60), (int)(curtime % 60));
1596     return err ? 1 : 0;
1597     }
1598    
1599    
1600     /* perform the extkeys command */
1601     static int procextkeys(const char *dbname, const char *prefix, int ni){
1602     ESTDB *db;
1603     ESTCOND *cond;
1604     ESTDOC *doc;
1605     CURIA *kwdb;
1606     CBMAP *kwords;
1607     const char *uri;
1608     char path[URIBUFSIZ], *attr, *mbuf;
1609     int i, ecode, err, *res, rnum, msiz;
1610     time_t curtime;
1611     curtime = time(NULL);
1612     if(!(db = est_db_open(dbname, ESTDBWRITER, &ecode))){
1613     printferror("%s: %s", dbname, est_err_msg(ecode));
1614     return 1;
1615     }
1616     est_db_set_informer(db, dbinform);
1617     if(!ni && (!prefix || prefix[0] == '\0')) est_db_fill_key_cache(db);
1618     sprintf(path, "%s%c%s", dbname, ESTPATHCHR, KWDBNAME);
1619     if(!(kwdb = cropen(path, CR_OWRITER | CR_OCREAT, KWDBBNUM, KWDBDNUM))){
1620     printferror("%s: the keyword database has some errors", dbname);
1621     est_db_close(db, &ecode);
1622     return 1;
1623     }
1624     crsetalign(kwdb, -4);
1625     cond = est_cond_new();
1626     attr = cbsprintf("%s STRBW %s", DATTRLPATH, prefix ? prefix : "");
1627     est_cond_add_attr(cond, attr);
1628     res = est_db_search(db, cond, &rnum, NULL);
1629     err = FALSE;
1630     for(i = 0; i < rnum; i++){
1631     if(!g_doforce && crvsiz(kwdb, (char *)&(res[i]), sizeof(int)) > 0){
1632     printfinfo("%d: passed", res[i]);
1633     continue;
1634     }
1635     if(!(doc = est_db_get_doc(db, res[i], 0))) continue;
1636     if(!(uri = est_doc_attr(doc, ESTDATTRURI))) uri = "";
1637     kwords = est_db_etch_doc(ni ? NULL : db, doc, g_kwordnum);
1638     mbuf = cbmapdump(kwords, &msiz);
1639     fflush(stdout);
1640     if(crput(kwdb, (char *)&(res[i]), sizeof(int), mbuf, msiz, CR_DOVER)){
1641     printfinfo("%d (%s): extracted", res[i], uri);
1642     } else {
1643     printferror("%s: the keyword database has some errors", dbname);
1644     err = TRUE;
1645     }
1646     free(mbuf);
1647     cbmapclose(kwords);
1648     est_doc_delete(doc);
1649     if(err || g_sigterm) break;
1650     }
1651     free(res);
1652     est_cond_delete(cond);
1653     free(attr);
1654     if(!crclose(kwdb)){
1655     printferror("%s: the keyword database has some errors", dbname);
1656     err = TRUE;
1657     }
1658     if(!est_db_close(db, &ecode)){
1659     printferror("%s: %s", dbname, est_err_msg(ecode));
1660     return 1;
1661     }
1662     curtime = time(NULL) - curtime;
1663     if(!err) printfinfo("finished successfully: elapsed time: %dh %dm %ds",
1664     (int)(curtime / 3600), (int)((curtime / 60) % 60), (int)(curtime % 60));
1665     return err ? 1 : 0;
1666     }
1667    
1668    
1669     /* perform the draft command */
1670     static int procdraft(const char *filename){
1671     ESTDOC *doc;
1672     char *buf, *draft;
1673     int size;
1674     if(!(buf = cbreadfile(filename, &size))){
1675     printferror("%s: could not open", filename ? filename : "(stdin)");
1676     return 1;
1677     }
1678     switch(g_filefmt){
1679     case FF_TEXT:
1680     doc = est_doc_new_from_text(buf, size, g_inputcode, g_inputlang);
1681     break;
1682     case FF_HTML:
1683     doc = est_doc_new_from_html(buf, size, g_inputcode, g_inputlang);
1684     break;
1685     case FF_MIME:
1686     doc = est_doc_new_from_mime(buf, size, g_inputcode, g_inputlang);
1687     break;
1688     default:
1689     doc = est_doc_new_from_draft_enc(buf, size, g_inputcode);
1690     break;
1691     }
1692     draft = est_doc_dump_draft(doc);
1693     printf("%s", draft);
1694     free(draft);
1695     est_doc_delete(doc);
1696     free(buf);
1697     return 0;
1698     }
1699    
1700    
1701     /* perform the break command */
1702     static int procbreak(const char *filename, int wt){
1703     CBLIST *words;
1704     char *str, *phrase;
1705     int i;
1706     if(filename && filename[0] == '@'){
1707     str = cbmemdup(filename + 1, -1);
1708     } else if(!(str = cbreadfile(filename, NULL))){
1709     printferror("%s: could not open", filename ? filename : "(stdin)");
1710     return 1;
1711     }
1712     if(!(phrase = est_iconv(str, -1, g_inputcode, "UTF-8", NULL, NULL))){
1713     printferror("%s: unsupported encoding\n", g_inputcode);
1714     free(str);
1715     return 1;
1716     }
1717     g_inputcode = NULL;
1718     words = cblistopen();
1719     if(g_oextmodes & ESTDBPERFNG){
1720     est_break_text_perfng(phrase, words, TRUE, wt);
1721     } else {
1722     est_break_text(phrase, words, TRUE, wt);
1723     }
1724     for(i = 0; i < cblistnum(words); i++){
1725     printf("%s\n", cblistval(words, i, NULL));
1726     }
1727     cblistclose(words);
1728     free(phrase);
1729     free(str);
1730     return 0;
1731     }
1732    
1733    
1734     /* perform the randput command */
1735     static int procrandput(const char *dbname, int dnum){
1736     ESTDB *db;
1737     ESTDOC *doc;
1738     const char *mode;
1739     char uri[URIBUFSIZ];
1740     int i, ecode, err;
1741     time_t curtime;
1742     curtime = time(NULL);
1743     if(!(db = est_db_open(dbname, ESTDBWRITER | ESTDBCREAT | ESTDBTRUNC, &ecode))){
1744     printferror("%s: %s", dbname, est_err_msg(ecode));
1745     return 1;
1746     }
1747     est_db_set_informer(db, dbinform);
1748     if(g_cachesize > 0){
1749     if(g_cachesize > CACHEMAX) g_cachesize = CACHEMAX;
1750     est_db_set_cache_size(db, g_cachesize, -1, -1);
1751     }
1752     err = FALSE;
1753     for(i = 0; i < dnum; i++){
1754     doc = est_doc_new_from_chaos(RDOCCNUM, RDOCSNUM, g_rdmode);
1755     sprintf(uri, "file:///tmp/randput-%08d-%05d.est", i + 1, getpid());
1756     est_doc_add_attr(doc, ESTDATTRURI, uri);
1757     if(est_db_put_doc(db, doc, 0)){
1758     if(!(mode = est_doc_attr(doc, "mode"))) mode = "unknown";
1759     printfinfo("%d (%s) (%s): registered", est_doc_id(doc), uri, mode);
1760     } else {
1761     printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
1762     err = TRUE;
1763     }
1764     est_doc_delete(doc);
1765     if(err || g_sigterm) break;
1766     }
1767     if(!est_db_close(db, &ecode)){
1768     printferror("%s: %s", dbname, est_err_msg(ecode));
1769     return 1;
1770     }
1771     curtime = time(NULL) - curtime;
1772     if(!err) printfinfo("finished successfully: elapsed time: %dh %dm %ds",
1773     (int)(curtime / 3600), (int)((curtime / 60) % 60), (int)(curtime % 60));
1774     return err ? 1 : 0;
1775     }
1776    
1777    
1778     /* perform the wicked command */
1779     static int procwicked(const char *dbname, int dnum){
1780     ESTDB *db;
1781     ESTDOC *doc;
1782     ESTCOND *cond;
1783     CBLIST *words;
1784     char uri[URIBUFSIZ], *oper, *value, *first, *second, *phrase;
1785     int i, j, ecode, err, *res, rnum;
1786     double rnd;
1787     time_t curtime;
1788     curtime = time(NULL);
1789     if(!(db = est_db_open(dbname, ESTDBWRITER | ESTDBCREAT | ESTDBTRUNC, &ecode))){
1790     printferror("%s: %s", dbname, est_err_msg(ecode));
1791     return 1;
1792     }
1793     est_db_set_informer(db, dbinform);
1794     est_db_set_cache_size(db, 1024 * 1024 * 128, 1024, 256);
1795     est_db_set_special_cache(db, ESTDATTRURI, 128);
1796     err = FALSE;
1797     for(i = 0; i < dnum; i++){
1798     rnd = est_random();
1799     if((int)(rnd * INT_MAX) % dnum < 5){
1800     rnd = est_random();
1801     if(rnd < 0.3){
1802     if(!est_db_close(db, &ecode)){
1803     printferror("%s: %s", dbname, est_err_msg(ecode));
1804     return 1;
1805     }
1806     if(!(db = est_db_open(dbname, ESTDBWRITER, &ecode))){
1807     printferror("%s: %s", dbname, est_err_msg(ecode));
1808     return 1;
1809     }
1810     est_db_set_informer(db, dbinform);
1811     est_db_set_cache_size(db, 1024 * 1024 * 128, 1024, 256);
1812     est_db_set_special_cache(db, ESTDATTRURI, i / 10 + 1);
1813     } else if(rnd < 0.5){
1814     if(!est_db_optimize(db, (int)(est_random() * INT_MAX) % 2 == 0) ? ESTOPTNOPURGE : 0)
1815     err = TRUE;
1816     } else if(rnd < 0.8){
1817     if(!est_db_flush(db, 1024)) err = TRUE;
1818     } else {
1819     if(!est_db_sync(db)) err = TRUE;
1820     }
1821     } else if(rnd < 0.05){
1822     if(est_db_out_doc(db, (int)(est_random() * INT_MAX) % (i + 1) + 1,
1823     ((int)(est_random() * INT_MAX) % 2 == 0) ? ESTODCLEAN : 0)){
1824     printfinfo("[%d:%d]: out", i + 1, est_db_doc_num(db));
1825     } else if(est_db_error(db) != ESTENOITEM){
1826     err = TRUE;
1827     }
1828     } else if(rnd < 0.1){
1829     if((value = est_db_get_doc_attr(db, (int)(est_random() * INT_MAX) % (i + 1) + 1,
1830     ESTDATTRURI)) != NULL){
1831     printfinfo("[%d:%d]: attr: %s", i + 1, est_db_doc_num(db), value);
1832     free(value);
1833     }
1834     } else if(rnd < 0.25){
1835     rnd = est_random();
1836     if(rnd < 0.5){
1837     oper = " OR ";
1838     } else if(rnd < 0.7){
1839     oper = " AND ";
1840     } else if(rnd < 0.8){
1841     oper = " NOTAND ";
1842     } else if(rnd < 0.9){
1843     oper = " ";
1844     } else {
1845     oper = "";
1846     }
1847     first = est_random_str(5, (int)(est_random() * INT_MAX) % RD_RAND);
1848     second = est_random_str(2, (int)(est_random() * INT_MAX) % RD_RAND);
1849     phrase = cbsprintf("%s%s%s", first, oper, second);
1850     cond = est_cond_new();
1851     est_cond_set_phrase(cond, phrase);
1852     if(est_random() < 0.25) est_cond_add_attr(cond, "@uri STREW 0.est");
1853     if(est_random() < 0.25) est_cond_set_order(cond, "@uri STRD");
1854     if(est_random() < 0.05) est_cond_set_options(cond, ESTCONDSURE | ESTCONDSCFB);
1855     if(est_random() < 0.05) est_cond_set_options(cond, ESTCONDAGIT | ESTCONDNOIDF);
1856     res = est_db_search(db, cond, &rnum, NULL);
1857     printfinfo("[%d:%d]: search: %d hits", i + 1, est_db_doc_num(db), rnum);
1858     if(est_random() < 0.05){
1859     for(j = 0; j < rnum && j < 100; j++){
1860     if((doc = est_db_get_doc(db, res[j], 0)) != NULL){
1861     if(i % 10 == 0){
1862     free(est_doc_cat_texts(doc));
1863     free(est_doc_dump_draft(doc));
1864     words = cblistopen();
1865     cblistpush(words, "vw", -1);
1866     cblistpush(words, "xy", -1);
1867     cblistpush(words, "z", -1);
1868     free(est_doc_make_snippet(doc, words, 100, 10, 10));
1869     cblistclose(words);
1870     }
1871     est_doc_delete(doc);
1872     } else if(est_db_error(db) != ESTENOITEM){
1873     err = TRUE;
1874     }
1875     }
1876     }
1877     free(res);
1878     est_cond_delete(cond);
1879     free(phrase);
1880     free(first);
1881     free(second);
1882     } else {
1883     doc = est_doc_new_from_chaos(100, 3, est_random() < 0.5 ? RD_EURO : RD_RAND);
1884     if(est_random() < 0.2){
1885     sprintf(uri, "file:///tmp/wicked-%08d-%05d.est",
1886     (int)(est_random() * INT_MAX) % (i + 1) + 1, getpid());
1887     } else {
1888     sprintf(uri, "file:///tmp/wicked-%08d-%05d.est", i + 1, getpid());
1889     }
1890     est_doc_add_attr(doc, ESTDATTRURI, uri);
1891     if(!est_db_put_doc(db, doc, est_random() < 0.5 ? ESTPDCLEAN : 0)) err = TRUE;
1892     est_doc_delete(doc);
1893     }
1894     if(err || g_sigterm) break;
1895     }
1896     if(err) printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
1897     if(!est_db_close(db, &ecode)){
1898     printferror("%s: %s", dbname, est_err_msg(ecode));
1899     return 1;
1900     }
1901     curtime = time(NULL) - curtime;
1902     if(!err) printfinfo("finished successfully: elapsed time: %dh %dm %ds",
1903     (int)(curtime / 3600), (int)((curtime / 60) % 60), (int)(curtime % 60));
1904     return err ? 1 : 0;
1905     }
1906    
1907    
1908     /* perform the regression command */
1909     static int procregression(const char *dbname){
1910     ESTDB *db;
1911     ESTDOC *doc;
1912     ESTCOND *cond;
1913     int i, ecode, err, *res, rnum;
1914     time_t curtime;
1915     curtime = time(NULL);
1916     printfinfo("# opening the database");
1917     if(!(db = est_db_open(dbname, ESTDBWRITER | ESTDBCREAT | ESTDBTRUNC, &ecode))){
1918     printferror("%s: %s", dbname, est_err_msg(ecode));
1919     return 1;
1920     }
1921     est_db_set_informer(db, dbinform);
1922     err = FALSE;
1923     if(!err){
1924     printfinfo("# checking registration of small documents");
1925     doc = est_doc_new();
1926     est_doc_add_attr(doc, ESTDATTRURI, "file:///small/one");
1927     est_doc_add_text(doc, "One!");
1928     est_doc_add_hidden_text(doc, "(Check it out, come on!)");
1929     if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE;
1930     est_doc_delete(doc);
1931     doc = est_doc_new();
1932     est_doc_add_attr(doc, ESTDATTRURI, "file:///small/two");
1933     est_doc_add_text(doc, "Two!!");
1934     est_doc_add_hidden_text(doc, "(Check it out, come on!)");
1935     if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE;
1936     est_doc_delete(doc);
1937     doc = est_doc_new();
1938     est_doc_add_attr(doc, ESTDATTRURI, "file:///small/three");
1939     est_doc_add_text(doc, "Three!!!");
1940     est_doc_add_hidden_text(doc, "(Check it out, come on!)");
1941     if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE;
1942     est_doc_delete(doc);
1943     doc = est_doc_new();
1944     est_doc_add_attr(doc, ESTDATTRURI, "file:///empty");
1945     if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE;
1946     est_doc_delete(doc);
1947     }
1948     if(!err){
1949     printfinfo("# checking registration of an english document");
1950     doc = est_doc_new();
1951     est_doc_add_attr(doc, ESTDATTRURI, "file:///english");
1952     est_doc_add_attr(doc, ESTDATTRTITLE, "Hyper Estraier");
1953     est_doc_add_text(doc, "% This is a displayed sentence. ;-)");
1954     est_doc_add_text(doc, "Hyper Estraier is a full-text search system for communities.");
1955     est_doc_add_text(doc, "A little suffering is good for the soul.");
1956     est_doc_add_text(doc, "They have been at a great feast of languages, and stolen the scraps.");
1957     est_doc_add_hidden_text(doc, "(Give it up, Yo! Give it up, Yo!)");
1958     est_doc_add_hidden_text(doc, "% This is a hidden sentence. :-<");
1959     est_doc_add_hidden_text(doc, "(Check it out, come on!)");
1960     est_doc_add_hidden_text(doc, "");
1961     if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE;
1962     est_doc_delete(doc);
1963     }
1964     if(!err){
1965     printfinfo("# checking registration of a japanese document");
1966     doc = est_doc_new();
1967     est_doc_add_attr(doc, ESTDATTRURI, "file:///japanese");
1968     est_doc_add_attr(doc, ESTDATTRTITLE, "\xe5\xb9\xb3\xe6\x9e\x97\xe5\xb9\xb9\xe9\x9b\x84");
1969     est_doc_add_text(doc, "\xe6\x9c\xac\xe6\x97\xa5\xe3\x81\xaf\xe6\x99\xb4\xe5\xa4\xa9\xe3"
1970     "\x81\xaa\xe3\x82\x8a\xe3\x80\x82");
1971     est_doc_add_text(doc, "\xe6\x9c\x95\xe3\x81\xaf\xe5\x9b\xbd\xe5\xae\xb6\xe7\xac\xac\xe4"
1972     "\xb8\x80\xe3\x81\xae\xe4\xb8\x8b\xe5\x83\x95\xe3\x81\xa7\xe3\x81"
1973     "\x82\xe3\x82\x8b\xe3\x80\x82");
1974     est_doc_add_hidden_text(doc, "(Check it out, come on!)");
1975     if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE;
1976     est_doc_delete(doc);
1977     }
1978     if(!err){
1979     printfinfo("# checking duplication of documents");
1980     doc = est_doc_new();
1981     est_doc_add_attr(doc, ESTDATTRURI, "file:///duplication");
1982     est_doc_add_text(doc, "Gamble, you gatta chance to make a Rumble!");
1983     est_doc_add_hidden_text(doc, "(Check it out, come on!)");
1984     if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE;
1985     est_doc_delete(doc);
1986     doc = est_doc_new();
1987     est_doc_add_attr(doc, ESTDATTRURI, "file:///duplication");
1988     est_doc_add_text(doc, "bring back hey, one more time!");
1989     est_doc_add_hidden_text(doc, "(Check it out, come on!)");
1990     if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE;
1991     est_doc_delete(doc);
1992     if(est_db_doc_num(db) != 7){
1993     printferror("%s: the number of documents is invalid", dbname);
1994     err = TRUE;
1995     }
1996     }
1997     if(!err){
1998     printfinfo("# checking search for unfixed documents");
1999     cond = est_cond_new();
2000     est_cond_set_phrase(cond, "check");
2001     res = est_db_search(db, cond, &rnum, NULL);
2002     if(rnum != 6){
2003     printferror("%s: the number of result is invalid", dbname);
2004     err = TRUE;
2005     }
2006     free(res);
2007     est_cond_delete(cond);
2008     }
2009     if(!err){
2010     printfinfo("# checking partial flushing of the index");
2011     if(!est_db_flush(db, 32)) err = TRUE;
2012     }
2013     if(!err){
2014     printfinfo("# checking deletion with cleaning of a document");
2015     if(!est_db_out_doc(db, 1, ESTODCLEAN)) err = TRUE;
2016     }
2017     if(!err){
2018     printfinfo("# checking synchronization");
2019     if(!est_db_sync(db)) err = TRUE;
2020     }
2021     if(!err){
2022     printfinfo("# checking deletion without cleaning of a document");
2023     if(!est_db_out_doc(db, 2, 0)) err = TRUE;
2024     }
2025     if(!err){
2026     printfinfo("# checking word search");
2027     cond = est_cond_new();
2028     est_cond_set_phrase(cond, "check it AND on");
2029     res = est_db_search(db, cond, &rnum, NULL);
2030     if(rnum != 5){
2031     printferror("%s: the number of result is invalid", dbname);
2032     err = TRUE;
2033     }
2034     free(res);
2035     est_cond_set_phrase(cond, "RUMBLE OR \xe3\x80\x82");
2036     res = est_db_search(db, cond, &rnum, NULL);
2037     if(rnum != 1){
2038     printferror("%s: the number of result is invalid", dbname);
2039     err = TRUE;
2040     }
2041     free(res);
2042     est_cond_delete(cond);
2043     }
2044     if(!err){
2045     printfinfo("# checking attribute search");
2046     cond = est_cond_new();
2047     est_cond_add_attr(cond, "@uri !ISTRINC SMaLl");
2048     res = est_db_search(db, cond, &rnum, NULL);
2049     if(rnum != est_db_doc_num(db) - 1){
2050     printferror("%s: the number of result is invalid", dbname);
2051     err = TRUE;
2052     }
2053     free(res);
2054     est_cond_delete(cond);
2055     cond = est_cond_new();
2056     est_cond_add_attr(cond, "@uri STRBW file://");
2057     est_cond_add_attr(cond, "@title STRINC \xe5\xb9\xb3");
2058     res = est_db_search(db, cond, &rnum, NULL);
2059     if(rnum != 1){
2060     printferror("%s: the number of result is invalid", dbname);
2061     err = TRUE;
2062     }
2063     free(res);
2064     est_cond_delete(cond);
2065     }
2066     if(!err){
2067     printfinfo("# checking combined search");
2068     cond = est_cond_new();
2069     est_cond_set_phrase(cond, "\xe5\x9b\xbd\xe5\xae\xb6\xe7\xac\xac\xe4\xb8\x80");
2070     est_cond_add_attr(cond, "@uri");
2071     est_cond_set_order(cond, "@title");
2072     res = est_db_search(db, cond, &rnum, NULL);
2073     if(rnum != 1){
2074     printferror("%s: the number of result is invalid", dbname);
2075     err = TRUE;
2076     }
2077     free(res);
2078     est_cond_delete(cond);
2079     cond = est_cond_new();
2080     est_cond_set_phrase(cond, "one | \xe3\x80\x82 | check & check it ! hogehoge");
2081     est_cond_add_attr(cond, "@uri STRBW file://");
2082     est_cond_set_order(cond, "@title STRD");
2083     est_cond_set_options(cond, ESTCONDSURE | ESTCONDNOIDF | ESTCONDSIMPLE);
2084     res = est_db_search(db, cond, &rnum, NULL);
2085     if(rnum != 4){
2086     printferror("%s: the number of result is invalid", dbname);
2087     err = TRUE;
2088     }
2089     free(res);
2090     est_cond_delete(cond);
2091     }
2092     if(!err){
2093     printfinfo("# checking optimization");
2094     if(!est_db_optimize(db, 0)) err = TRUE;
2095     cond = est_cond_new();
2096     est_cond_set_phrase(cond, "check");
2097     res = est_db_search(db, cond, &rnum, NULL);
2098     if(rnum != 4){
2099     printferror("%s: the number of result is invalid", dbname);
2100     err = TRUE;
2101     }
2102     free(res);
2103     est_cond_delete(cond);
2104     }
2105     if(!err){
2106     printfinfo("# checking traversal access");
2107     cond = est_cond_new();
2108     est_cond_set_phrase(cond, "[UVSET]");
2109     res = est_db_search(db, cond, &rnum, NULL);
2110     for(i = 0; i < rnum; i++){
2111     if(!(doc = est_db_get_doc(db, res[i], 0))){
2112     printferror("%s: a document cannot be retrieved", dbname);
2113     err = TRUE;
2114     break;
2115     }
2116     est_doc_delete(doc);
2117     }
2118     free(res);
2119     est_cond_delete(cond);
2120     }
2121     if(err) printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
2122     printfinfo("# closing the database");
2123     if(!est_db_close(db, &ecode)){
2124     printferror("%s: %s", dbname, est_err_msg(ecode));
2125     return 1;
2126     }
2127     curtime = time(NULL) - curtime;
2128     if(!err) printfinfo("# finished successfully: elapsed time: %dh %dm %ds",
2129     (int)(curtime / 3600), (int)((curtime / 60) % 60), (int)(curtime % 60));
2130     return err ? 1 : 0;
2131     }
2132    
2133    
/* output escaped string
   `format' specifies a printf-like format string written to the standard output.  The
   conversions `s', `d', `o', `u', `x', `X', `c', `e', `E', `f', `g', `G' are passed on to
   `printf' together with any flag characters out of "0123456789 .+-".  `@' prints a string
   with `&', `<', `>' and `"' replaced by entity references and some control characters
   dropped.  `?' prints a string with every byte except alphanumerics and "_-." written as
   "%XX".  `%' prints a percent sign.  Any other conversion character prints nothing.
   A NULL string argument is printed as "(null)". */
static void xmlprintf(const char *format, ...){
  va_list ap;
  char *tmp, cbuf[32];
  unsigned char c;
  int cblen;
  va_start(ap, format);
  while(*format != '\0'){
    if(*format == '%'){
      cbuf[0] = '%';
      cblen = 1;
      format++;
      /* collect the flags, leaving room for the conversion character and the terminator */
      while(*format != '\0' && strchr("0123456789 .+-", *format) &&
            cblen < (int)sizeof(cbuf) - 2){
        cbuf[cblen++] = *format;
        format++;
      }
      /* a directive cut off by the end of the format: stop here so that the increment at the
         bottom of the loop does not step past the terminator */
      if(*format == '\0') break;
      cbuf[cblen++] = *format;
      cbuf[cblen] = '\0';
      switch(*format){
      case 's':
        tmp = va_arg(ap, char *);
        if(!tmp) tmp = "(null)";
        printf(cbuf, tmp);
        break;
      case 'd':
        printf(cbuf, va_arg(ap, int));
        break;
      case 'o': case 'u': case 'x': case 'X': case 'c':
        printf(cbuf, va_arg(ap, unsigned int));
        break;
      case 'e': case 'E': case 'f': case 'g': case 'G':
        printf(cbuf, va_arg(ap, double));
        break;
      case '@':
        tmp = va_arg(ap, char *);
        if(!tmp) tmp = "(null)";
        while(*tmp){
          switch(*tmp){
          case '&': printf("&amp;"); break;
          case '<': printf("&lt;"); break;
          case '>': printf("&gt;"); break;
          case '"': printf("&quot;"); break;
          default:
            /* bytes in 0x00-0x08 and 0x0e-0x1f are dropped */
            if(!((*tmp >= 0 && *tmp <= 0x8) || (*tmp >= 0x0e && *tmp <= 0x1f))) putchar(*tmp);
            break;
          }
          tmp++;
        }
        break;
      case '?':
        tmp = va_arg(ap, char *);
        if(!tmp) tmp = "(null)";
        while(*tmp){
          c = *(unsigned char *)tmp;
          if((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') ||
             (c >= '0' && c <= '9') || (c != '\0' && strchr("_-.", c))){
            putchar(c);
          } else {
            printf("%%%02X", c);
          }
          tmp++;
        }
        break;
      case '%':
        putchar('%');
        break;
      }
    } else {
      putchar(*format);
    }
    format++;
  }
  va_end(ap);
}
2208    
2209    
2210     /* get the language value */
2211     static int strtolang(const char *str){
2212     if(!cbstricmp(str, "en")) return ESTLANGEN;
2213     if(!cbstricmp(str, "ja")) return ESTLANGJA;
2214     if(!cbstricmp(str, "zh")) return ESTLANGZH;
2215     if(!cbstricmp(str, "ko")) return ESTLANGKO;
2216     return ESTLANGMISC;
2217     }
2218    
2219    
/* read a line
   `ifp' specifies the input stream.
   Characters are read up to and including the next line feed or NUL character, or up to the
   end of the stream.  Carriage returns are dropped and the line feed is not stored.
   The return value is a newly allocated string of the line, which the caller should free, or
   NULL if the stream was already at its end. */
static char *fgetl(FILE *ifp){
  char *buf;
  int c, len, blen;
  buf = NULL;
  len = 0;
  blen = 1024;
  while((c = fgetc(ifp)) != EOF){
    /* allocate on the first character and grow by doubling, not on every character */
    if(!buf){
      buf = cbrealloc(buf, blen + 1);
    } else if(blen <= len){
      blen *= 2;
      buf = cbrealloc(buf, blen + 1);
    }
    if(c == '\n') c = '\0';
    if(c != '\r') buf[len++] = c;
    if(c == '\0') break;
  }
  if(!buf) return NULL;
  buf[len] = '\0';
  return buf;
}
2238    
2239    
2240     /* register a document */
2241     static int doputdoc(ESTDB *db, const char *path){
2242     ESTDOC *doc, *edoc;
2243     const char *uri, *vbuf, *xcmd;
2244     char *dbuf, *tbuf;
2245     int err, fmt, id, dsiz;
2246     time_t emdate, fmdate;
2247     struct stat sbuf;
2248     xcmd = NULL;
2249     if(cbmaprnum(g_xcmdmap) > 0){
2250     cbmapiterinit(g_xcmdmap);
2251     while((vbuf = cbmapiternext(g_xcmdmap, NULL)) != NULL){
2252     if(cbstrbwimatch(path, vbuf)){
2253     xcmd = cbmapget(g_xcmdmap, vbuf, -1, NULL);
2254     break;
2255     }
2256     }
2257     }
2258     fmt = g_filefmt;
2259     if(g_filefmt == FF_NONE && !xcmd) return TRUE;
2260     if(g_filefmt == FF_AUTO){
2261     if(cbstrbwimatch(path, ESTEXTSTR "est")){
2262     fmt = FF_DRAFT;
2263     } else if(cbstrbwimatch(path, ESTEXTSTR "txt") || cbstrbwimatch(path, ESTEXTSTR "text") ||
2264     cbstrbwimatch(path, ESTEXTSTR "asc")){
2265     fmt = FF_TEXT;
2266     } else if(cbstrbwimatch(path, ESTEXTSTR "html") || cbstrbwimatch(path, ESTEXTSTR "htm") ||
2267     cbstrbwimatch(path, ESTEXTSTR "xhtml") || cbstrbwimatch(path, ESTEXTSTR "xht")){
2268     fmt = FF_HTML;
2269     } else if(cbstrbwimatch(path, ESTEXTSTR "eml") || cbstrbwimatch(path, ESTEXTSTR "mime") ||
2270     cbstrbwimatch(path, ESTEXTSTR "mht") || cbstrbwimatch(path, ESTEXTSTR "mhtml")){
2271     fmt = FF_MIME;
2272     } else if(!xcmd){
2273     return TRUE;
2274     }
2275     }
2276     if(stat(path, &sbuf) == -1 || !S_ISREG(sbuf.st_mode) || !(uri = pathtourl(path))){
2277     printferror("%s: could not open", path);
2278     return TRUE;
2279     }
2280     emdate = -1;
2281     if(g_chkmdate && (id = est_db_uri_to_id(db, uri)) > 0 &&
2282     (edoc = est_db_get_doc(db, id, ESTGDNOTEXT)) != NULL){
2283     if((vbuf = est_doc_attr(edoc, ESTDATTRMDATE)) != NULL) emdate = cbstrmktime(vbuf);
2284     est_doc_delete(edoc);
2285     }
2286     if(g_stdate && emdate >= 0 && emdate >= sbuf.st_mtime){
2287     printfinfo("%s: passed", path);
2288     return TRUE;
2289     }
2290     if(g_filtorig){
2291     dbuf = cbmemdup("", 0);
2292     dsiz = 0;
2293     } else {
2294     if(!(dbuf = cbreadfile(path, &dsiz))){
2295     printferror("%s: could not open", path);
2296     return TRUE;
2297     }
2298     }
2299     if(xcmd){
2300     doc = est_doc_new_with_xcmd(dbuf, dsiz, path, xcmd, est_db_name(db),
2301     g_inputcode, g_inputlang);
2302     } else {
2303     switch(fmt){
2304     case FF_TEXT:
2305     doc = est_doc_new_from_text(dbuf, dsiz, g_inputcode, g_inputlang);
2306     break;
2307     case FF_HTML:
2308     doc = est_doc_new_from_html(dbuf, dsiz, g_inputcode, g_inputlang);
2309     break;
2310     case FF_MIME:
2311     doc = est_doc_new_from_mime(dbuf, dsiz, g_inputcode, g_inputlang);
2312     break;
2313     default:
2314     doc = est_doc_new_from_draft_enc(dbuf, dsiz, g_inputcode);
2315     break;
2316     }
2317     }
2318     if(!est_doc_attr(doc, ESTDATTRURI)) est_doc_add_attr(doc, ESTDATTRURI, uri);
2319     est_doc_add_attr(doc, DATTRLPATH, uri);
2320     est_doc_add_attr(doc, DATTRLFILE, urltofile(uri));
2321     uri = est_doc_attr(doc, ESTDATTRURI);
2322     if(g_stdate){
2323     tbuf = cbdatestrwww(sbuf.st_ctime, 0);
2324     est_doc_add_attr(doc, ESTDATTRCDATE, tbuf);
2325     free(tbuf);
2326     tbuf = cbdatestrwww(sbuf.st_mtime, 0);
2327     est_doc_add_attr(doc, ESTDATTRMDATE, tbuf);
2328     free(tbuf);
2329     }
2330     if(g_chkmdate && emdate == -1 && (id = est_db_uri_to_id(db, uri)) > 0 &&
2331     (edoc = est_db_get_doc(db, id, ESTGDNOTEXT)) != NULL){
2332     if((vbuf = est_doc_attr(edoc, ESTDATTRMDATE)) != NULL) emdate = cbstrmktime(vbuf);
2333     est_doc_delete(edoc);
2334     }
2335     fmdate = -1;
2336     if(g_chkmdate && (vbuf = est_doc_attr(doc, ESTDATTRMDATE)) != NULL) fmdate = cbstrmktime(vbuf);
2337     err = FALSE;
2338     if(emdate >= 0 && emdate >= fmdate){
2339     printfinfo("%s: passed", path);
2340     } else if(est_db_put_doc(db, doc, g_putopts)){
2341     printfinfo("%d (%s): registered", est_doc_id(doc), uri);
2342     } else {
2343     printferror("%s: %s", est_db_name(db), est_err_msg(est_db_error(db)));
2344     err = TRUE;
2345     }
2346     est_doc_delete(doc);
2347     free(dbuf);
2348     return err ? FALSE : TRUE;
2349     }
2350    
2351    
2352     /* get the URL of a path */
2353     static const char *pathtourl(const char *path){
2354     static char pbuf[URIBUFSIZ];
2355     const char *elem;
2356     char *wp, *ebuf;
2357     CBLIST *list;
2358     int i, esiz;
2359     if(strlen(path) >= URIBUFSIZ / 4) return NULL;
2360     if(g_pathcode){
2361     wp = est_realpath(path);
2362     if(!(ebuf = est_iconv(wp, -1, g_pathcode, "UTF-8", &esiz, NULL))){
2363     esiz = strlen(wp);
2364     ebuf = cbmemdup(wp, esiz);
2365     }
2366     list = cbsplit(ebuf, esiz, ESTPATHSTR);
2367     free(ebuf);
2368     free(wp);
2369     for(i = 0; i < cblistnum(list); i++){
2370     elem = cblistval(list, i, &esiz);
2371     if((ebuf = est_iconv(elem, esiz, "UTF-8", g_pathcode, &esiz, NULL)) != NULL){
2372     cblistover(list, i, ebuf, esiz);
2373     free(ebuf);
2374     }
2375     }
2376     } else {
2377     wp = est_realpath(path);
2378     list = cbsplit(wp, -1, ESTPATHSTR);
2379     free(wp);
2380     }
2381     wp = pbuf;
2382     wp += sprintf(wp, "file://");
2383     for(i = 0; i < cblistnum(list); i++){
2384     elem = cblistval(list, i, NULL);
2385     if(elem[0] == '\0') continue;
2386     if(i < 1 && ((elem[0] >= 'A' && elem[0] <= 'Z') || (elem[0] >= 'a' && elem[0] <= 'z')) &&
2387     elem[1] == ':'){
2388     wp += sprintf(wp, "%c|", elem[0]);
2389     continue;
2390     }
2391     ebuf = cburlencode(elem, -1);
2392     wp += sprintf(wp, "/%s", ebuf);
2393     free(ebuf);
2394     }
2395     *wp = '\0';
2396     cblistclose(list);
2397     return pbuf;
2398     }
2399    
2400    
2401     /* get the file name of a URL */
2402     static const char *urltofile(const char *uri){
2403     static char pbuf[URIBUFSIZ];
2404     const char *rp;
2405     char *dbuf, *ebuf;
2406     int dsiz;
2407     if(g_pathfull){
2408     if((rp = strstr(uri, "//")) != NULL){
2409     rp += 2;
2410     if(((rp[0] >= 'A' && rp[0] <= 'Z') || (rp[0] >= 'a' && rp[0] <= 'z')) &&
2411     rp[1] == '|' && rp[2] == '/') rp += 2;
2412     } else {
2413     rp = uri;
2414     }
2415     } else if((rp = strrchr(uri, '/')) != NULL){
2416     rp++;
2417     } else {
2418     rp = uri;
2419     }
2420     dbuf = cburldecode(rp, &dsiz);
2421     if((ebuf = est_iconv(dbuf, dsiz, g_pathcode ? g_pathcode : "ISO-8859-1", "UTF-8", NULL, NULL))
2422     != NULL){
2423     sprintf(pbuf, "%s", ebuf);
2424     free(ebuf);
2425     } else {
2426     sprintf(pbuf, "%s", rp);
2427     }
2428     free(dbuf);
2429     return pbuf;
2430     }
2431    
2432    
2433     /* geth the local path of a URL */
2434     static char *urltopath(const char *uri){
2435     static char pbuf[URIBUFSIZ];
2436     const char *elem;
2437     char *wp, *dbuf;
2438     CBLIST *list;
2439     int i;
2440     if(!cbstrfwimatch(uri, "file://")) return NULL;
2441     if(!(uri = strchr(uri + 7, '/'))) return NULL;
2442     list = cbsplit(uri, -1, "/");
2443     wp = pbuf;
2444     for(i = 0; i < cblistnum(list); i++){
2445     elem = cblistval(list, i, NULL);
2446     if(elem[0] == '\0') continue;
2447     if(i < 1 && ((elem[0] >= 'A' && elem[0] <= 'Z') || (elem[0] >= 'a' && elem[0] <= 'z')) &&
2448     elem[1] == '|'){
2449     wp += sprintf(wp, "%c:", elem[0]);
2450     continue;
2451     }
2452     dbuf = cburldecode(elem, NULL);
2453     wp += sprintf(wp, "%c%s", ESTPATHCHR, dbuf);
2454     free(dbuf);
2455     }
2456     *wp = '\0';
2457     cblistclose(list);
2458     return pbuf;
2459     }
2460    
2461    
2462     /* create a vector of keywords */
2463     static CBMAP *vectorizer(void *db, int id, void *kwdb){
2464     CBMAP *kwords;
2465     char *mbuf;
2466     int msiz;
2467     if(!(mbuf = crget((CURIA *)kwdb, (char *)&id, sizeof(int), 0, -1, &msiz))) return NULL;
2468     kwords = cbmapload(mbuf, msiz);
2469     free(mbuf);
2470     return kwords;
2471     }
2472    
2473    
2474     /* create a document object with an outer command */
2475     static ESTDOC *est_doc_new_with_xcmd(const char *buf, int size, const char *path,
2476     const char *xcmd, const char *tmpdir,
2477     const char *penc, int plang){
2478     ESTDOC *doc;
2479     const char *pv, *ext;
2480     char iname[URIBUFSIZ], oname[URIBUFSIZ], ebuf[URIBUFSIZ], cmd[URIBUFSIZ];
2481     char *rbuf, numbuf[NUMBUFSIZ];
2482     int fmt, rsiz;
2483     assert(buf && size >= 0 && path && xcmd && tmpdir);
2484     sprintf(ebuf, "ESTORIGFILE=%s", path);
2485     ext = NULL;
2486     if((pv = strrchr(path, ESTPATHCHR)) != NULL) path = pv;
2487     if((pv = strrchr(path, ESTEXTCHR)) != NULL) ext = pv;
2488     if(!ext) ext = "";
2489     sprintf(iname, "%s%cxcmd-in-%08d%s", tmpdir, ESTPATHCHR, getpid(), ext);
2490     sprintf(oname, "%s%cxcmd-out-%08d%cest", tmpdir, ESTPATHCHR, getpid(), ESTEXTCHR);
2491     fmt = FF_DRAFT;
2492     if(cbstrfwmatch(xcmd, "T@")){
2493     fmt = FF_TEXT;
2494     xcmd += 2;
2495     } else if(cbstrfwmatch(xcmd, "H@")){
2496     fmt = FF_HTML;
2497     xcmd += 2;
2498     } else if(cbstrfwmatch(xcmd, "M@")){
2499     fmt = FF_MIME;
2500     xcmd += 2;
2501     }
2502     sprintf(cmd, "%s %s %s", xcmd, iname, oname);
2503     if(!g_filtorig) cbwritefile(iname, buf, size);
2504     putenv(ebuf);
2505     system(cmd);
2506     if((rbuf = cbreadfile(oname, &rsiz)) != NULL){
2507     switch(fmt){
2508     case FF_TEXT:
2509     doc = est_doc_new_from_text(rbuf, rsiz, penc, plang);
2510     break;
2511     case FF_HTML:
2512     doc = est_doc_new_from_html(rbuf, rsiz, penc, plang);
2513     break;
2514     case FF_MIME:
2515     doc = est_doc_new_from_mime(rbuf, rsiz, penc, plang);
2516     break;
2517     default:
2518     doc = est_doc_new_from_draft_enc(rbuf, rsiz, penc);
2519     break;
2520     }
2521     free(rbuf);
2522     } else {
2523     doc = est_doc_new();
2524     }
2525     if(fmt != FF_DRAFT){
2526     sprintf(numbuf, "%d", size);
2527     est_doc_add_attr(doc, ESTDATTRSIZE, numbuf);
2528     est_doc_add_attr(doc, ESTDATTRTYPE, est_ext_type(ext));
2529     }
2530     unlink(oname);
2531     unlink(iname);
2532     return doc;
2533     }
2534    
2535    
2536     /* create a document object from draft data in another encoding */
2537     static ESTDOC *est_doc_new_from_draft_enc(const char *buf, int size, const char *enc){
2538     ESTDOC *doc;
2539     char *rbuf;
2540     assert(buf);
2541     if(enc && (rbuf = est_iconv(buf, size, enc, "UTF-8", NULL, NULL)) != NULL){
2542     doc = est_doc_new_from_draft(rbuf);
2543     free(rbuf);
2544     } else {
2545     doc = est_doc_new_from_draft(buf);
2546     }
2547     return doc;
2548     }
2549    
2550    
2551     /* create a document object from plain text */
2552     static ESTDOC *est_doc_new_from_text(const char *buf, int size, const char *penc, int plang){
2553     ESTDOC *doc;
2554     CBLIST *lines;
2555     CBDATUM *datum;
2556     const char *enc, *text, *line;
2557     char *nbuf, numbuf[NUMBUFSIZ];
2558     int i;
2559     assert(buf);
2560     doc = est_doc_new();
2561     enc = penc ? penc : est_enc_name(buf, size, plang);
2562     if(!strcmp(enc, "UTF-8")){
2563     nbuf = NULL;
2564     text = buf;
2565     } else {
2566     text = buf;
2567     nbuf = est_iconv(buf, size, enc, "UTF-8", NULL, NULL);
2568     if(nbuf) text = nbuf;
2569     }
2570     lines = cbsplit(text, -1, "\n");
2571     datum = cbdatumopen("", 0);
2572     for(i = 0; i < CB_LISTNUM(lines); i++){
2573     line = CB_LISTVAL(lines, i, NULL);
2574     while(*line == ' ' || *line == '\t' || *line == '\r'){
2575     line++;
2576     }
2577     if(line[0] == '\0'){
2578     est_doc_add_text(doc, CB_DATUMPTR(datum));
2579     cbdatumsetsize(datum, 0);
2580     } else {
2581     cbdatumcat(datum, " ", 1);
2582     cbdatumcat(datum, line, -1);
2583     }
2584     }
2585     est_doc_add_text(doc, CB_DATUMPTR(datum));
2586     cbdatumclose(datum);
2587     cblistclose(lines);
2588     est_doc_add_attr(doc, ESTDATTRTYPE, "text/plain");
2589     sprintf(numbuf, "%d", size);
2590     est_doc_add_attr(doc, ESTDATTRSIZE, numbuf);
2591     if(nbuf) free(nbuf);
2592     return doc;
2593     }
2594    
2595    
2596     /* create a document object from HTML */
2597     static ESTDOC *est_doc_new_from_html(const char *buf, int size, const char *penc, int plang){
2598     ESTDOC *doc;
2599     CBLIST *elems;
2600     CBMAP *attrs;
2601     CBDATUM *datum;
2602     const char *enc, *html, *elem, *next, *name, *content;
2603     char *nbuf, *nenc, *rbuf, *lbuf, numbuf[NUMBUFSIZ];
2604     int i, esiz;
2605     assert(buf);
2606     doc = est_doc_new();
2607     enc = est_enc_name(buf, size, plang);
2608     html = NULL;
2609     nbuf = NULL;
2610     if(!strcmp(enc, "UTF-16") || !strcmp(enc, "UTF-16BE") || !strcmp(enc, "UTF-16LE")){
2611     nbuf = est_iconv(buf, size, enc, "UTF-8", NULL, NULL);
2612     } else if(!strcmp(enc, "US-ASCII")){
2613     nbuf = NULL;
2614     } else {
2615     if((nenc = penc ? cbmemdup(penc, -1) : est_html_enc(buf)) != NULL){
2616     if(cbstricmp(nenc, "UTF-8")){
2617     nbuf = est_iconv(buf, size, nenc, "UTF-8", NULL, NULL);
2618     if(!nbuf) nbuf = est_iconv(buf, size, enc, "UTF-8", NULL, NULL);
2619     }
2620     free(nenc);
2621     } else {
2622     nbuf = est_iconv(buf, size, enc, "UTF-8", NULL, NULL);
2623     }
2624     }
2625     if(nbuf) html = nbuf;
2626     if(!html) html = buf;
2627     datum = cbdatumopen("", 0);
2628     elems = cbxmlbreak(html, TRUE);
2629     for(i = 0; i < CB_LISTNUM(elems); i++){
2630     elem = CB_LISTVAL2(elems, i, &esiz);
2631     if(!(next = cblistval(elems, i + 1, NULL))) next = "";
2632     if(elem[0] == '<'){
2633     if(cbstrfwimatch(elem, "<meta")){
2634     attrs = cbxmlattrs(elem);
2635     name = cbmapget(attrs, "name", -1, NULL);
2636     if(!name) name = cbmapget(attrs, "Name", -1, NULL);
2637     if(!name) name = cbmapget(attrs, "NAME", -1, NULL);
2638     if(!name) name = cbmapget(attrs, "http-equiv", -1, NULL);
2639     if(!name) name = cbmapget(attrs, "Http-equiv", -1, NULL);
2640     if(!name) name = cbmapget(attrs, "Http-Equiv", -1, NULL);
2641     if(!name) name = cbmapget(attrs, "HTTP-EQUIV", -1, NULL);
2642     content = cbmapget(attrs, "content", -1, NULL);
2643     if(!content) content = cbmapget(attrs, "Content", -1, NULL);
2644     if(!content) content = cbmapget(attrs, "CONTENT", -1, NULL);
2645     if(name && content){
2646     lbuf = cbmemdup(name, -1);
2647     cbstrtolower(lbuf);
2648     cbstrsqzspc(lbuf);
2649     if(!strcmp(lbuf, "author")){
2650     if(strchr(content, '&')){
2651     rbuf = est_html_raw_text(content);
2652     est_doc_add_attr(doc, ESTDATTRAUTHOR, rbuf);
2653     free(rbuf);
2654     } else {
2655     est_doc_add_attr(doc, ESTDATTRAUTHOR, content);
2656     }
2657     }
2658     if(name[0] != '@'){
2659     if(strchr(content, '&')){
2660     rbuf = est_html_raw_text(content);
2661     est_doc_add_attr(doc, lbuf, rbuf);
2662     free(rbuf);
2663     } else {
2664     est_doc_add_attr(doc, lbuf, content);
2665     }
2666     }
2667     free(lbuf);
2668     }
2669     cbmapclose(attrs);
2670     } else if(cbstrfwimatch(elem, "<title") && next[0] != '\0' && next[0] != '<'){
2671     if(strchr(next, '&')){
2672     rbuf = est_html_raw_text(next);
2673     est_doc_add_attr(doc, ESTDATTRTITLE, rbuf);
2674     est_doc_add_hidden_text(doc, rbuf);
2675     free(rbuf);
2676     } else {
2677     est_doc_add_attr(doc, ESTDATTRTITLE, next);
2678     est_doc_add_hidden_text(doc, next);
2679     }
2680     i++;
2681     } else if(cbstrfwimatch(elem, "<style") || cbstrfwimatch(elem, "<script")){
2682     i++;
2683     } else if(cbstrfwimatch(elem, "<h1") || cbstrfwimatch(elem, "<h2") ||
2684     cbstrfwimatch(elem, "<h3") || cbstrfwimatch(elem, "<h4") ||
2685     cbstrfwimatch(elem, "<h5") || cbstrfwimatch(elem, "<h6") ||
2686     cbstrfwimatch(elem, "<p>") || cbstrfwimatch(elem, "<p ") ||
2687     cbstrfwimatch(elem, "<div") || cbstrfwimatch(elem, "<hr") ||
2688     cbstrfwimatch(elem, "<ul") || cbstrfwimatch(elem, "<ol") ||
2689     cbstrfwimatch(elem, "<dl") || cbstrfwimatch(elem, "<li") ||
2690     cbstrfwimatch(elem, "<dt") || cbstrfwimatch(elem, "<dd") ||
2691     cbstrfwimatch(elem, "<th") || cbstrfwimatch(elem, "<td") ||
2692     cbstrfwimatch(elem, "<pre")){
2693     if(strchr(CB_DATUMPTR(datum), '&')){
2694     rbuf = est_html_raw_text(CB_DATUMPTR(datum));
2695     est_doc_add_text(doc, rbuf);
2696     free(rbuf);
2697     } else {
2698     est_doc_add_text(doc, CB_DATUMPTR(datum));
2699     }
2700     cbdatumsetsize(datum, 0);
2701     }
2702     } else {
2703     cbdatumcat(datum, " ", -1);
2704     cbdatumcat(datum, elem, esiz);
2705     }
2706     }
2707     cblistclose(elems);
2708     if(strchr(CB_DATUMPTR(datum), '&')){
2709     rbuf = est_html_raw_text(CB_DATUMPTR(datum));
2710     est_doc_add_text(doc, rbuf);
2711     free(rbuf);
2712     } else {
2713     est_doc_add_text(doc, CB_DATUMPTR(datum));
2714     }
2715     cbdatumclose(datum);
2716     if(nbuf) free(nbuf);
2717     est_doc_add_attr(doc, ESTDATTRTYPE, "text/html");
2718     sprintf(numbuf, "%d", size);
2719     est_doc_add_attr(doc, ESTDATTRSIZE, numbuf);
2720     return doc;
2721     }
2722    
2723    
2724     /* get the encoding of an HTML string */
2725     static char *est_html_enc(const char *str){
2726     CBLIST *elems;
2727     CBMAP *attrs;
2728     const char *elem, *equiv, *content;
2729     char *enc, *pv;
2730     int i;
2731     assert(str);
2732     elems = cbxmlbreak(str, TRUE);
2733     for(i = 0; i < CB_LISTNUM(elems); i++){
2734     elem = CB_LISTVAL(elems, i, NULL);
2735     if(elem[0] != '<' || !cbstrfwimatch(elem, "<meta")) continue;
2736     enc = NULL;
2737     attrs = cbxmlattrs(elem);
2738     equiv = cbmapget(attrs, "http-equiv", -1, NULL);
2739     if(!equiv) equiv = cbmapget(attrs, "HTTP-EQUIV", -1, NULL);
2740     if(!equiv) equiv = cbmapget(attrs, "Http-Equiv", -1, NULL);
2741     if(!equiv) equiv = cbmapget(attrs, "Http-equiv", -1, NULL);
2742     if(equiv && !cbstricmp(equiv, "Content-Type")){
2743     content = cbmapget(attrs, "content", -1, NULL);
2744     if(!content) content = cbmapget(attrs, "Content", -1, NULL);
2745     if(!content) content = cbmapget(attrs, "CONTENT", -1, NULL);
2746     if(content && ((pv = strstr(content, "charset")) != NULL ||
2747     (pv = strstr(content, "Charset")) != NULL ||
2748     (pv = strstr(content, "CHARSET")) != NULL)){
2749     enc = cbmemdup(pv + 8, -1);
2750     if((pv = strchr(enc, ';')) != NULL || (pv = strchr(enc, '\r')) != NULL ||
2751     (pv = strchr(enc, '\n')) != NULL || (pv = strchr(enc, ' ')) != NULL) *pv = '\0';
2752     }
2753     }
2754     cbmapclose(attrs);
2755     if(enc){
2756     cblistclose(elems);
2757     return enc;
2758     }
2759     }
2760     cblistclose(elems);
2761     return NULL;
2762     }
2763    
2764    
2765     /* unescape entity references of HTML */
2766     static char *est_html_raw_text(const char *html){
2767     static const char *pairs[] = {
2768     /* basic symbols */
2769     "&amp;", "&", "&lt;", "<", "&gt;", ">", "&quot;", "\"", "&apos;", "'",
2770     /* ISO-8859-1 */
2771     "&nbsp;", "\xc2\xa0", "&iexcl;", "\xc2\xa1", "&cent;", "\xc2\xa2",
2772     "&pound;", "\xc2\xa3", "&curren;", "\xc2\xa4", "&yen;", "\xc2\xa5",
2773     "&brvbar;", "\xc2\xa6", "&sect;", "\xc2\xa7", "&uml;", "\xc2\xa8",
2774     "&copy;", "\xc2\xa9", "&ordf;", "\xc2\xaa", "&laquo;", "\xc2\xab",
2775     "&not;", "\xc2\xac", "&shy;", "\xc2\xad", "&reg;", "\xc2\xae",
2776     "&macr;", "\xc2\xaf", "&deg;", "\xc2\xb0", "&plusmn;", "\xc2\xb1",
2777     "&sup2;", "\xc2\xb2", "&sup3;", "\xc2\xb3", "&acute;", "\xc2\xb4",
2778     "&micro;", "\xc2\xb5", "&para;", "\xc2\xb6", "&middot;", "\xc2\xb7",
2779     "&cedil;", "\xc2\xb8", "&sup1;", "\xc2\xb9", "&ordm;", "\xc2\xba",
2780     "&raquo;", "\xc2\xbb", "&frac14;", "\xc2\xbc", "&frac12;", "\xc2\xbd",
2781     "&frac34;", "\xc2\xbe", "&iquest;", "\xc2\xbf", "&Agrave;", "\xc3\x80",
2782     "&Aacute;", "\xc3\x81", "&Acirc;", "\xc3\x82", "&Atilde;", "\xc3\x83",
2783     "&Auml;", "\xc3\x84", "&Aring;", "\xc3\x85", "&AElig;", "\xc3\x86",
2784     "&Ccedil;", "\xc3\x87", "&Egrave;", "\xc3\x88", "&Eacute;", "\xc3\x89",
2785     "&Ecirc;", "\xc3\x8a", "&Euml;", "\xc3\x8b", "&Igrave;", "\xc3\x8c",
2786     "&Iacute;", "\xc3\x8d", "&Icirc;", "\xc3\x8e", "&Iuml;", "\xc3\x8f",
2787     "&ETH;", "\xc3\x90", "&Ntilde;", "\xc3\x91", "&Ograve;", "\xc3\x92",
2788     "&Oacute;", "\xc3\x93", "&Ocirc;", "\xc3\x94", "&Otilde;", "\xc3\x95",
2789     "&Ouml;", "\xc3\x96", "&times;", "\xc3\x97", "&Oslash;", "\xc3\x98",
2790     "&Ugrave;", "\xc3\x99", "&Uacute;", "\xc3\x9a", "&Ucirc;", "\xc3\x9b",
2791     "&Uuml;", "\xc3\x9c", "&Yacute;", "\xc3\x9d", "&THORN;", "\xc3\x9e",
2792     "&szlig;", "\xc3\x9f", "&agrave;", "\xc3\xa0", "&aacute;", "\xc3\xa1",
2793     "&acirc;", "\xc3\xa2", "&atilde;", "\xc3\xa3", "&auml;", "\xc3\xa4",
2794     "&aring;", "\xc3\xa5", "&aelig;", "\xc3\xa6", "&ccedil;", "\xc3\xa7",
2795     "&egrave;", "\xc3\xa8", "&eacute;", "\xc3\xa9", "&ecirc;", "\xc3\xaa",
2796     "&euml;", "\xc3\xab", "&igrave;", "\xc3\xac", "&iacute;", "\xc3\xad",
2797     "&icirc;", "\xc3\xae", "&iuml;", "\xc3\xaf", "&eth;", "\xc3\xb0",
2798     "&ntilde;", "\xc3\xb1", "&ograve;", "\xc3\xb2", "&oacute;", "\xc3\xb3",
2799     "&ocirc;", "\xc3\xb4", "&otilde;", "\xc3\xb5", "&ouml;", "\xc3\xb6",
2800     "&divide;", "\xc3\xb7", "&oslash;", "\xc3\xb8", "&ugrave;", "\xc3\xb9",
2801     "&uacute;", "\xc3\xba", "&ucirc;", "\xc3\xbb", "&uuml;", "\xc3\xbc",
2802     "&yacute;", "\xc3\xbd", "&thorn;", "\xc3\xbe", "&yuml;", "\xc3\xbf",
2803     /* ISO-10646 */
2804     "&fnof;", "\xc6\x92", "&Alpha;", "\xce\x91", "&Beta;", "\xce\x92",
2805     "&Gamma;", "\xce\x93", "&Delta;", "\xce\x94", "&Epsilon;", "\xce\x95",
2806     "&Zeta;", "\xce\x96", "&Eta;", "\xce\x97", "&Theta;", "\xce\x98",
2807     "&Iota;", "\xce\x99", "&Kappa;", "\xce\x9a", "&Lambda;", "\xce\x9b",
2808     "&Mu;", "\xce\x9c", "&Nu;", "\xce\x9d", "&Xi;", "\xce\x9e",
2809     "&Omicron;", "\xce\x9f", "&Pi;", "\xce\xa0", "&Rho;", "\xce\xa1",
2810     "&Sigma;", "\xce\xa3", "&Tau;", "\xce\xa4", "&Upsilon;", "\xce\xa5",
2811     "&Phi;", "\xce\xa6", "&Chi;", "\xce\xa7", "&Psi;", "\xce\xa8",
2812     "&Omega;", "\xce\xa9", "&alpha;", "\xce\xb1", "&beta;", "\xce\xb2",
2813     "&gamma;", "\xce\xb3", "&delta;", "\xce\xb4", "&epsilon;", "\xce\xb5",
2814     "&zeta;", "\xce\xb6", "&eta;", "\xce\xb7", "&theta;", "\xce\xb8",
2815     "&iota;", "\xce\xb9", "&kappa;", "\xce\xba", "&lambda;", "\xce\xbb",
2816     "&mu;", "\xce\xbc", "&nu;", "\xce\xbd", "&xi;", "\xce\xbe",
2817     "&omicron;", "\xce\xbf", "&pi;", "\xcf\x80", "&rho;", "\xcf\x81",
2818     "&sigmaf;", "\xcf\x82", "&sigma;", "\xcf\x83", "&tau;", "\xcf\x84",
2819     "&upsilon;", "\xcf\x85", "&phi;", "\xcf\x86", "&chi;", "\xcf\x87",
2820     "&psi;", "\xcf\x88", "&omega;", "\xcf\x89", "&thetasym;", "\xcf\x91",
2821     "&upsih;", "\xcf\x92", "&piv;", "\xcf\x96", "&bull;", "\xe2\x80\xa2",
2822     "&hellip;", "\xe2\x80\xa6", "&prime;", "\xe2\x80\xb2", "&Prime;", "\xe2\x80\xb3",
2823     "&oline;", "\xe2\x80\xbe", "&frasl;", "\xe2\x81\x84", "&weierp;", "\xe2\x84\x98",
2824     "&image;", "\xe2\x84\x91", "&real;", "\xe2\x84\x9c", "&trade;", "\xe2\x84\xa2",
2825     "&alefsym;", "\xe2\x84\xb5", "&larr;", "\xe2\x86\x90", "&uarr;", "\xe2\x86\x91",
2826     "&rarr;", "\xe2\x86\x92", "&darr;", "\xe2\x86\x93", "&harr;", "\xe2\x86\x94",
2827     "&crarr;", "\xe2\x86\xb5", "&lArr;", "\xe2\x87\x90", "&uArr;", "\xe2\x87\x91",
2828     "&rArr;", "\xe2\x87\x92", "&dArr;", "\xe2\x87\x93", "&hArr;", "\xe2\x87\x94",
2829     "&forall;", "\xe2\x88\x80", "&part;", "\xe2\x88\x82", "&exist;", "\xe2\x88\x83",
2830     "&empty;", "\xe2\x88\x85", "&nabla;", "\xe2\x88\x87", "&isin;", "\xe2\x88\x88",
2831     "&notin;", "\xe2\x88\x89", "&ni;", "\xe2\x88\x8b", "&prod;", "\xe2\x88\x8f",
2832     "&sum;", "\xe2\x88\x91", "&minus;", "\xe2\x88\x92", "&lowast;", "\xe2\x88\x97",
2833     "&radic;", "\xe2\x88\x9a", "&prop;", "\xe2\x88\x9d", "&infin;", "\xe2\x88\x9e",
2834     "&ang;", "\xe2\x88\xa0", "&and;", "\xe2\x88\xa7", "&or;", "\xe2\x88\xa8",
2835     "&cap;", "\xe2\x88\xa9", "&cup;", "\xe2\x88\xaa", "&int;", "\xe2\x88\xab",
2836     "&there4;", "\xe2\x88\xb4", "&sim;", "\xe2\x88\xbc", "&cong;", "\xe2\x89\x85",
2837     "&asymp;", "\xe2\x89\x88", "&ne;", "\xe2\x89\xa0", "&equiv;", "\xe2\x89\xa1",
2838     "&le;", "\xe2\x89\xa4", "&ge;", "\xe2\x89\xa5", "&sub;", "\xe2\x8a\x82",
2839     "&sup;", "\xe2\x8a\x83", "&nsub;", "\xe2\x8a\x84", "&sube;", "\xe2\x8a\x86",
2840     "&supe;", "\xe2\x8a\x87", "&oplus;", "\xe2\x8a\x95", "&otimes;", "\xe2\x8a\x97",
2841     "&perp;", "\xe2\x8a\xa5", "&sdot;", "\xe2\x8b\x85", "&lceil;", "\xe2\x8c\x88",
2842     "&rceil;", "\xe2\x8c\x89", "&lfloor;", "\xe2\x8c\x8a", "&rfloor;", "\xe2\x8c\x8b",
2843     "&lang;", "\xe2\x8c\xa9", "&rang;", "\xe2\x8c\xaa", "&loz;", "\xe2\x97\x8a",
2844     "&spades;", "\xe2\x99\xa0", "&clubs;", "\xe2\x99\xa3", "&hearts;", "\xe2\x99\xa5",
2845     "&diams;", "\xe2\x99\xa6", "&OElig;", "\xc5\x92", "&oelig;", "\xc5\x93",
2846     "&Scaron;", "\xc5\xa0", "&scaron;", "\xc5\xa1", "&Yuml;", "\xc5\xb8",
2847     "&circ;", "\xcb\x86", "&tilde;", "\xcb\x9c", "&ensp;", "\xe2\x80\x82",
2848     "&emsp;", "\xe2\x80\x83", "&thinsp;", "\xe2\x80\x89", "&zwnj;", "\xe2\x80\x8c",
2849     "&zwj;", "\xe2\x80\x8d", "&lrm;", "\xe2\x80\x8e", "&rlm;", "\xe2\x80\x8f",
2850     "&ndash;", "\xe2\x80\x93", "&mdash;", "\xe2\x80\x94", "&lsquo;", "\xe2\x80\x98",
2851     "&rsquo;", "\xe2\x80\x99", "&sbquo;", "\xe2\x80\x9a", "&ldquo;", "\xe2\x80\x9c",
2852     "&rdquo;", "\xe2\x80\x9d", "&bdquo;", "\xe2\x80\x9e", "&dagger;", "\xe2\x80\xa0",
2853     "&Dagger;", "\xe2\x80\xa1", "&permil;", "\xe2\x80\xb0", "&lsaquo;", "\xe2\x80\xb9",
2854     "&rsaquo;", "\xe2\x80\xba", "&euro;", "\xe2\x82\xac",
2855     NULL
2856     };
2857     char *raw, *wp, buf[2], *tmp;
2858     int i, j, hit, num, tsiz;
2859     assert(html);
2860     CB_MALLOC(raw, strlen(html) * 3 + 1);
2861     wp = raw;
2862     while(*html != '\0'){
2863     if(*html == '&'){
2864     if(*(html + 1) == '#'){
2865     if(*(html + 2) == 'x' || *(html + 2) == 'X'){
2866     num = strtol(html + 3, NULL, 16);
2867     } else {
2868     num = atoi(html + 2);
2869     }
2870     buf[0] = num / 256;
2871     buf[1] = num % 256;
2872     if((tmp = est_uconv_out(buf, 2, &tsiz)) != NULL){
2873     for(j = 0; j < tsiz; j++){
2874     *wp = ((unsigned char *)tmp)[j];
2875     wp++;
2876     }
2877     free(tmp);
2878     }
2879     while(*html != ';' && *html != ' ' && *html != '\n' && *html != '\0'){
2880     html++;
2881     }
2882     if(*html == ';') html++;
2883     } else {
2884     hit = FALSE;
2885     for(i = 0; pairs[i] != NULL; i += 2){
2886     if(cbstrfwmatch(html, pairs[i])){
2887     wp += sprintf(wp, "%s", pairs[i+1]);
2888     html += strlen(pairs[i]);
2889     hit = TRUE;
2890     break;
2891     }
2892     }
2893     if(!hit){
2894     *wp = *html;
2895     wp++;
2896     html++;
2897     }
2898     }
2899     } else {
2900     *wp = *html;
2901     wp++;
2902     html++;
2903     }
2904     }
2905     *wp = '\0';
2906     return raw;
2907     }
2908    
2909    
2910     /* create a document object from MIME */
2911     static ESTDOC *est_doc_new_from_mime(const char *buf, int size, const char *penc, int plang){
2912     ESTDOC *doc, *tdoc;
2913     CBMAP *attrs;
2914     const CBLIST *texts;
2915     CBLIST *parts, *lines;
2916     CBDATUM *datum;
2917     const char *key, *val, *bound, *part, *text, *line;
2918     char *body, *swap, numbuf[NUMBUFSIZ];
2919     int i, j, bsiz, psiz, ssiz, mht;
2920     assert(buf);
2921     doc = est_doc_new();
2922     attrs = cbmapopenex(MINIBNUM);
2923     body = cbmimebreak(buf, size, attrs, &bsiz);
2924     if((val = cbmapget(attrs, "subject", -1, NULL)) != NULL){
2925     est_doc_add_attr_mime(doc, ESTDATTRTITLE, val);
2926     if((val = est_doc_attr(doc, ESTDATTRTITLE)) != NULL) est_doc_add_hidden_text(doc, val);
2927     }
2928     if((val = cbmapget(attrs, "from", -1, NULL)) != NULL)
2929     est_doc_add_attr_mime(doc, ESTDATTRAUTHOR, val);
2930     if((val = cbmapget(attrs, "date", -1, NULL)) != NULL){
2931     est_doc_add_attr_mime(doc, ESTDATTRCDATE, val);
2932     est_doc_add_attr_mime(doc, ESTDATTRMDATE, val);
2933     }
2934     est_doc_add_attr(doc, ESTDATTRTYPE, "message/rfc822");
2935     sprintf(numbuf, "%d", size);
2936     est_doc_add_attr(doc, ESTDATTRSIZE, numbuf);
2937     cbmapiterinit(attrs);
2938     while((key = cbmapiternext(attrs, NULL)) != NULL){
2939     if((key[0] >= 'A' && key[0] <= 'Z') || key[0] == '@') continue;
2940     val = cbmapget(attrs, key, -1, NULL);
2941     est_doc_add_attr_mime(doc, key, val);
2942     }
2943     if((key = cbmapget(attrs, "TYPE", -1, NULL)) != NULL && cbstrfwimatch(key, "multipart/")){
2944     mht = cbstrfwimatch(key, "multipart/related");
2945     if((bound = cbmapget(attrs, "BOUNDARY", -1, NULL)) != NULL){
2946     parts = cbmimeparts(body, bsiz, bound);
2947     for(i = 0; i < CB_LISTNUM(parts) && i < 8; i++){
2948     part = CB_LISTVAL2(parts, i, &psiz);
2949     tdoc = est_doc_new_from_mime(part, psiz, penc, plang);
2950     if(mht){
2951     if((text = est_doc_attr(tdoc, ESTDATTRTITLE)) != NULL)
2952     est_doc_add_attr(doc, ESTDATTRTITLE, text);
2953     if((text = est_doc_attr(tdoc, ESTDATTRAUTHOR)) != NULL)
2954     est_doc_add_attr(doc, ESTDATTRAUTHOR, text);
2955     }
2956     texts = est_doc_texts(tdoc);
2957     for(j = 0; j < CB_LISTNUM(texts); j++){
2958     text = CB_LISTVAL(texts, j, NULL);
2959     est_doc_add_text(doc, text);
2960     }
2961     est_doc_delete(tdoc);
2962     }
2963     cblistclose(parts);
2964     }
2965     } else {
2966     if((key = cbmapget(attrs, "content-transfer-encoding", -1, NULL)) != NULL &&
2967     cbstrfwimatch(key, "base64")){
2968     swap = cbbasedecode(body, &ssiz);
2969     free(body);
2970     body = swap;
2971     bsiz = ssiz;
2972     } else if((key = cbmapget(attrs, "content-transfer-encoding", -1, NULL)) != NULL &&
2973     cbstrfwimatch(key, "quoted-printable")){
2974     swap = cbquotedecode(body, &ssiz);
2975     free(body);
2976     body = swap;
2977     bsiz = ssiz;
2978     }
2979     if(!(key = cbmapget(attrs, "TYPE", -1, NULL)) || cbstrfwimatch(key, "text/plain")){
2980     if(penc && (swap = est_iconv(body, bsiz, penc, "UTF-8", &ssiz, NULL)) != NULL){
2981     free(body);
2982     body = swap;
2983     bsiz = ssiz;
2984     } else if((key = cbmapget(attrs, "CHARSET", -1, NULL)) != NULL &&
2985     (swap = est_iconv(body, bsiz, key, "UTF-8", &ssiz, NULL)) != NULL){
2986     free(body);
2987     body = swap;
2988     bsiz = ssiz;
2989     }
2990     lines = cbsplit(body, bsiz, "\n");
2991     datum = cbdatumopen("", 0);
2992     for(i = 0; i < CB_LISTNUM(lines); i++){
2993     line = CB_LISTVAL(lines, i, NULL);
2994     while(*line == ' ' || *line == '>' || *line == '|' || *line == '\t' || *line == '\r'){
2995     line++;
2996     }
2997     if(line[0] == '\0'){
2998     est_doc_add_text(doc, CB_DATUMPTR(datum));
2999     cbdatumsetsize(datum, 0);
3000     } else {
3001     cbdatumcat(datum, " ", 1);
3002     cbdatumcat(datum, line, -1);
3003     }
3004     }
3005     est_doc_add_text(doc, CB_DATUMPTR(datum));
3006     cbdatumclose(datum);
3007     cblistclose(lines);
3008     } else if(cbstrfwimatch(key, "text/html")){
3009     tdoc = est_doc_new_from_html(body, bsiz, penc, plang);
3010     if((text = est_doc_attr(tdoc, ESTDATTRTITLE)) != NULL){
3011     if(!est_doc_attr(doc, ESTDATTRTITLE)) est_doc_add_attr(doc, ESTDATTRTITLE, text);
3012     est_doc_add_text(doc, text);
3013     }
3014     if((text = est_doc_attr(tdoc, ESTDATTRAUTHOR)) != NULL){
3015     if(!est_doc_attr(doc, ESTDATTRAUTHOR)) est_doc_add_attr(doc, ESTDATTRAUTHOR, text);
3016     est_doc_add_text(doc, text);
3017     }
3018     texts = est_doc_texts(tdoc);
3019     for(i = 0; i < CB_LISTNUM(texts); i++){
3020     text = CB_LISTVAL(texts, i, NULL);
3021     est_doc_add_text(doc, text);
3022     }
3023     est_doc_delete(tdoc);
3024     } else if(cbstrfwimatch(key, "message/rfc822")){
3025     tdoc = est_doc_new_from_mime(body, bsiz, penc, plang);
3026     if((text = est_doc_attr(tdoc, ESTDATTRTITLE)) != NULL){
3027     if(!est_doc_attr(doc, ESTDATTRTITLE)) est_doc_add_attr(doc, ESTDATTRTITLE, text);
3028     est_doc_add_text(doc, text);
3029     }
3030     if((text = est_doc_attr(tdoc, ESTDATTRAUTHOR)) != NULL){
3031     if(!est_doc_attr(doc, ESTDATTRAUTHOR)) est_doc_add_attr(doc, ESTDATTRAUTHOR, text);
3032     est_doc_add_text(doc, text);
3033     }
3034     texts = est_doc_texts(tdoc);
3035     for(i = 0; i < CB_LISTNUM(texts); i++){
3036     text = CB_LISTVAL(texts, i, NULL);
3037     est_doc_add_text(doc, text);
3038     }
3039     est_doc_delete(tdoc);
3040     } else if(cbstrfwimatch(key, "text/")){
3041     tdoc = est_doc_new_from_text(body, bsiz, penc, plang);
3042     texts = est_doc_texts(tdoc);
3043     for(i = 0; i < CB_LISTNUM(texts); i++){
3044     text = CB_LISTVAL(texts, i, NULL);
3045     est_doc_add_text(doc, text);
3046     }
3047     est_doc_delete(tdoc);
3048     }
3049     }
3050     free(body);
3051     cbmapclose(attrs);
3052     return doc;
3053     }
3054    
3055    
3056     /* set mime value as an attribute of a document */
3057     static void est_doc_add_attr_mime(ESTDOC *doc, const char *name, const char *value){
3058     char enc[64], *ebuf, *rbuf;
3059     assert(doc && name && value);
3060     ebuf = cbmimedecode(value, enc);
3061     if((rbuf = est_iconv(ebuf, -1, enc, "UTF-8", NULL, NULL)) != NULL){
3062     est_doc_add_attr(doc, name, rbuf);
3063     free(rbuf);
3064     }
3065     free(ebuf);
3066     }
3067    
3068    
3069     /* generate a document with random text */
3070     static ESTDOC *est_doc_new_from_chaos(int cnum, int snum, int mode){
3071     ESTDOC *doc;
3072     char *str;
3073     int i;
3074     doc = est_doc_new();
3075     snum *= pow(est_random_nd() + 0.5, 3.0);
3076     if(mode == RD_RAND){
3077     mode = est_random() * 100;
3078     if(mode < 20){
3079     mode = RD_ENG;
3080     est_doc_add_attr(doc, "mode", "english");
3081     } else if(mode < 40){
3082     mode = RD_LAT;
3083     est_doc_add_attr(doc, "mode", "latin");
3084     } else if(mode < 60){
3085     mode = RD_EURO;
3086     est_doc_add_attr(doc, "mode", "euromix");
3087     } else if(mode < 65){
3088     mode = RD_ORI;
3089     est_doc_add_attr(doc, "mode", "oriental");
3090     } else if(mode < 95){
3091     mode = RD_JPN;
3092     est_doc_add_attr(doc, "mode", "japanese");
3093     } else {
3094     mode = RD_CHAO;
3095     est_doc_add_attr(doc, "mode", "chaos");
3096     }
3097     }
3098     switch(mode){
3099     case RD_ENG: est_doc_add_attr(doc, "mode", "english"); break;
3100     case RD_LAT: est_doc_add_attr(doc, "mode", "latin"); break;
3101     case RD_ORI: est_doc_add_attr(doc, "mode", "oriental"); break;
3102     case RD_JPN: est_doc_add_attr(doc, "mode", "japanese"); break;
3103     case RD_EURO: est_doc_add_attr(doc, "mode", "euromix"); break;
3104     case RD_CHAO: est_doc_add_attr(doc, "mode", "chaos"); break;
3105     }
3106     for(i = 0; i <= snum; i++){
3107     str = est_random_str(cnum, mode);
3108     if(est_random() < 0.05){
3109     est_doc_add_hidden_text(doc, str);
3110     } else {
3111     est_doc_add_text(doc, str);
3112     }
3113     free(str);
3114     }
3115     return doc;
3116     }
3117    
3118    
3119     /* generate random string */
3120     static char *est_random_str(int cnum, int mode){
3121     const char echrs[] = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
3122     CBDATUM *buf;
3123     char wc[2], *str;
3124     int i, c, wlen, dec, mm, big, n;
3125     buf = cbdatumopen("", 0);
3126     cnum *= pow(est_random_nd() + 0.5, 3.0);
3127     wlen = est_random_nd() * 8 + 4;
3128     dec = (int)(est_random() * INT_MAX) % 10;
3129     big = (((int)(est_random() * INT_MAX) % 0x29)) * 0x100;
3130     for(i = 0; i < cnum; i++){
3131     switch(mode){
3132     case RD_ENG: case RD_LAT: case RD_EURO:
3133     mm = (int)(est_random() * INT_MAX) % 100;
3134     if((mode == RD_LAT || mode == RD_EURO) && mm < 5){
3135     c = 0x00a1 + (int)(pow(est_random_nd(), 2.0) * (0x00ff - 0x00a0));
3136     } else if(mode == RD_EURO && (mm < 30 || dec > 8)){
3137     if(dec % 2 == 0){
3138     c = 0x0391 + (int)(pow(est_random_nd(), 2.0) * (0x03d6 - 0x0391));
3139     } else {
3140     c = 0x0400 + (int)(pow(est_random_nd(), 2.0) * (0x045f - 0x0400));
3141     }
3142     } else if(mm < 95){
3143     if((n = est_random_nd() * (sizeof(echrs) - 1)) == (sizeof(echrs) - 1)) n = 0;
3144     c = echrs[n];
3145     } else {
3146     c = (int)(est_random() * ('@' - ' ')) + ' ';
3147     }
3148     if(--wlen < 1){
3149     c = ' ';
3150     wlen = pow(est_random_nd(), 3.0) * 8 + 4;
3151     dec = (int)(est_random() * INT_MAX) % 10;
3152     }
3153     break;
3154     case RD_ORI:
3155     c = big + est_random_nd() * 0x100;
3156     if(--wlen < 1){
3157     wlen = pow(est_random_nd(), 3.0) * 12 + 6;
3158     big = (((int)(est_random() * INT_MAX) % 0x29)) * 0x100;
3159     }
3160     break;
3161     case RD_JPN:
3162     if(dec < 4){
3163     c = 0x3041 + pow(est_random_nd(), 3.0) * (0x3094 - 0x3041);
3164     } else if(dec < 7){
3165     c = 0x30a1 + pow(est_random_nd(), 3.0) * (0x30fe - 0x30a1);
3166     } else if(dec < 9){
3167     c = 0x4e00 + pow(est_random_nd(), 3.0) * (0x9faf - 0x4e00);
3168     } else {
3169     if(est_random() < 0.7){
3170     c = 0x00a1 + (int)(pow(est_random_nd(), 2.0) * (0x00ff - 0x00a0));
3171     } else {
3172     c = 0x3041 + est_random() * (0xffef - 0x3041);
3173     }
3174     }
3175     if(--wlen < 1){
3176     wlen = pow(est_random_nd(), 3.0) * 12 + 6;
3177     dec = (int)(est_random() * INT_MAX) % 10;
3178     }
3179     break;
3180     default:
3181     if(est_random() < 0.2){
3182     c = 0x00a1 + (int)est_random() * (0x00ff - 0x00a0);
3183     } else {
3184     c = (int)(est_random() * 0x10000);
3185     }
3186     break;
3187     }
3188     if(c <= 0 || c >= 0x10000) c = 0x0020;
3189     wc[0] = c / 0x100;
3190     wc[1] = c % 0x100;
3191     cbdatumcat(buf, wc, 2);
3192     }
3193     str = est_iconv(CB_DATUMPTR(buf), CB_DATUMSIZE(buf), "UTF-16BE", "UTF-8", NULL, NULL);
3194     cbdatumclose(buf);
3195     return str;
3196     }
3197    
3198    
3199    
3200     /* END OF FILE */

  ViewVC Help
Powered by ViewVC 1.1.26