/[hyperestraier]/upstream/0.5.3/estcmd.c
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /upstream/0.5.3/estcmd.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 10 - (hide annotations)
Wed Aug 3 15:25:48 2005 UTC (18 years, 10 months ago) by dpavlin
File MIME type: text/plain
File size: 106455 byte(s)
import of upstream 0.5.3

1 dpavlin 2 /*************************************************************************************************
2     * The command line interface for the core API
3     * Copyright (C) 2004-2005 Mikio Hirabayashi
4     * This file is part of Hyper Estraier.
5     * Hyper Estraier is free software; you can redistribute it and/or modify it under the terms of
6     * the GNU Lesser General Public License as published by the Free Software Foundation; either
7     * version 2.1 of the License or any later version. Hyper Estraier is distributed in the hope
8     * that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
9     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
10     * License for more details.
11     * You should have received a copy of the GNU Lesser General Public License along with Hyper
12     * Estraier; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
13     * Boston, MA 02111-1307 USA.
14     *************************************************************************************************/
15    
16    
17     #include "estraier.h"
18     #include "myconf.h"
19    
20     #define NUMBUFSIZ 32 /* size of a buffer for a number */
21     #define URIBUFSIZ 8192 /* size of a buffer for an URI */
22     #define MINIBNUM 31 /* bucket number of a small map */
23     #define SEARCHMAX 10 /* maximum number of shown documents */
24     #define SNIPWWIDTH 480 /* whole width of the snippet */
25     #define SNIPHWIDTH 96 /* width of beginning of the text */
26     #define SNIPAWIDTH 96 /* width around each highlighted word */
27     #define CACHEMAX (512*1024*1024) /* max cache size by mega bytes */
28     #define DATTRLPATH "_lpath" /* name of the attribute of the local path */
29     #define DATTRLFILE "_lfile" /* name of the attribute of the local file name */
30     #define DATTRSCORE "#score" /* name of the pseudo-attribute of score */
31     #define DATTRKWORDS "#kwords" /* name of the pseudo-attribute of keywords */
32     #define KWDBNAME "kwords" /* name of the database for keywords */
33     #define KWDBBNUM 122869 /* bucket number of the keyword database */
34     #define KWDBDNUM 3 /* division number of the keyword database */
35     #define KWORDNUM 32 /* number of shown keywords */
36     #define RDOCSNUM 6 /* number of sections of a random document */
37     #define RDOCCNUM 256 /* number of characters in a section */
38    
39     enum { /* enumeration for viewing modes (stored in g_viewmode) */
40     VM_ID, /* ID only */
41     VM_URI, /* ID and URI */
42     VM_ATTR, /* all attributes */
43     VM_FULL, /* all attributes and body text */
44     VM_SNIP, /* all attributes and snippet */
45     VM_HMRD, /* human readable */
46     VM_XML, /* XML */
47     VM_DUMP /* dump draft files */
48     };
49    
50     enum { /* enumeration for file formats (stored in g_filefmt) */
51     FF_AUTO, /* automatic detection */
52     FF_DRAFT, /* draft */
53     FF_TEXT, /* plain text */
54     FF_HTML, /* HTML */
55     FF_MIME, /* MIME */
56     FF_NONE /* ignored */
57     };
58    
59     enum { /* enumeration for test documents (stored in g_rdmode) */
60     RD_ENG, /* English */
61     RD_LAT, /* Latin */
62     RD_EURO, /* European mix */
63     RD_ORI, /* Oriental */
64     RD_JPN, /* Japanese */
65     RD_CHAO, /* chaos */
66     RD_RAND /* selected at random */
67     };
68    
69    
70     /* global variables */
71     const char *g_progname; /* program name */
72     int g_sigterm = FALSE; /* flag for termination signal */
73     int g_putopts = 0; /* options of registration */
74     int g_outopts = 0; /* options of deletion */
75     int g_optopts = 0; /* options of optimization */
76     const char *g_inputcode = "UTF-8"; /* input encoding */
77     int g_inputlang = ESTLANGEN; /* preferred language */
78     const char *g_pathcode = NULL; /* path encoding */
79     int g_pathfull = FALSE; /* whether to record full paths */
80 dpavlin 10 CBLIST *g_pathattrs = NULL; /* names of elements in path extension */
81 dpavlin 2 int g_oextmodes = 0; /* extra open modes */
82     int g_viewmode = VM_ID; /* viewing mode */
83     int g_filefmt = FF_AUTO; /* file format */
84     CBMAP *g_xcmdmap = NULL; /* map of suffixes and filter commands */
85     int g_filtorig = FALSE; /* whether to use filter for original files */
86     int g_stdate = FALSE; /* whether to adopt date by stat */
87     int g_chkmdate = FALSE; /* whether to check modification date */
88     double g_cachesize = -1; /* size of the cache (the -cs value multiplied by 1024 * 1024) */
89     int g_doforce = FALSE; /* whether to force purging or extracting */
90     int g_kwordnum = KWORDNUM; /* number of keywords */
91     int g_rdmode = RD_RAND; /* mode of random documents */
92    
93    
94     /* function prototypes: each run* function parses the arguments of a sub-command and each proc* function performs it */
95     int main(int argc, char **argv);
96     static void printferror(const char *format, ...);
97     static void printfinfo(const char *format, ...);
98     static void dbinform(const char *msg);
99     static void setsignals(void);
100     static void sigtermhandler(int num);
101     static void usage(void);
102     static int runput(int argc, char **argv);
103     static int runout(int argc, char **argv);
104     static int runget(int argc, char **argv);
105     static int runlist(int argc, char **argv);
106     static int runuriid(int argc, char **argv);
107     static int runmeta(int argc, char **argv);
108     static int runinform(int argc, char **argv);
109     static int runoptimize(int argc, char **argv);
110     static int runsearch(int argc, char **argv);
111     static int rungather(int argc, char **argv);
112     static int runpurge(int argc, char **argv);
113     static int runextkeys(int argc, char **argv);
114     static int rundraft(int argc, char **argv);
115     static int runbreak(int argc, char **argv);
116     static int runrandput(int argc, char **argv);
117     static int runwicked(int argc, char **argv);
118     static int runregression(int argc, char **argv);
119     static int procput(const char *dbname, const char *filename);
120     static int procout(const char *dbname, int id, const char *expr);
121     static int procget(const char *dbname, int id, const char *expr, const char *attr);
122     static int proclist(const char *dbname);
123     static int procuriid(const char *dbname, const char *uri);
124     static int procmeta(const char *dbname, const char *mname, const char *mvalue);
125     static int procinform(const char *dbname);
126     static int procoptimize(const char *dbname);
127     static int procsearch(const char *dbname, const char *phrase,
128 dpavlin 10 const CBLIST *attrs, const char *ord, int max, int opts, int sim);
129 dpavlin 2 static int procgather(const char *dbname, const char *filename);
130     static int procpurge(const char *dbname, const char *prefix);
131     static int procextkeys(const char *dbname, const char *prefix, int ni);
132     static int procdraft(const char *filename);
133     static int procbreak(const char *filename, int wt);
134     static int procrandput(const char *dbname, int dnum);
135     static int procwicked(const char *dbname, int dnum);
136     static int procregression(const char *dbname);
137     static void xmlprintf(const char *format, ...);
138     static int strtolang(const char *str);
139     static char *fgetl(FILE *ifp);
140 dpavlin 10 static int doputdoc(ESTDB *db, const char *path, const CBLIST *attrs);
141 dpavlin 2 static const char *pathtourl(const char *path);
142     static const char *urltofile(const char *uri);
143     static char *urltopath(const char *uri);
144     static CBMAP *vectorizer(void *db, int id, void *kwdb);
145     static ESTDOC *est_doc_new_with_xcmd(const char *buf, int size, const char *path,
146     const char *xcmd, const char *tmpdir,
147     const char *penc, int plang);
148     static ESTDOC *est_doc_new_from_draft_enc(const char *buf, int size, const char *enc);
149     static ESTDOC *est_doc_new_from_text(const char *buf, int size, const char *penc, int plang);
150     static ESTDOC *est_doc_new_from_html(const char *buf, int size, const char *penc, int plang);
151     static char *est_html_enc(const char *str);
152     static char *est_html_raw_text(const char *html);
153     static ESTDOC *est_doc_new_from_mime(const char *buf, int size, const char *penc, int plang);
154     static void est_doc_add_attr_mime(ESTDOC *doc, const char *name, const char *value);
155     static ESTDOC *est_doc_new_from_chaos(int cnum, int snum, int mode);
156     static char *est_random_str(int cnum, int mode);
158    
159     /* main routine */
160     int main(int argc, char **argv){
161     const char *tmp;
162     int rv;
163     if((tmp = getenv("ESTDBGFD")) != NULL) dpdbgfd = atoi(tmp);
164     cbstdiobin();
165     g_progname = argv[0];
166     g_sigterm = FALSE;
167     if(argc < 2) usage();
168     rv = 0;
169     if(!strcmp(argv[1], "put")){
170     setsignals();
171     rv = runput(argc, argv);
172     } else if(!strcmp(argv[1], "out")){
173     setsignals();
174     rv = runout(argc, argv);
175     } else if(!strcmp(argv[1], "get")){
176     rv = runget(argc, argv);
177     } else if(!strcmp(argv[1], "list")){
178     rv = runlist(argc, argv);
179     } else if(!strcmp(argv[1], "uriid")){
180     rv = runuriid(argc, argv);
181     } else if(!strcmp(argv[1], "meta")){
182     setsignals();
183     rv = runmeta(argc, argv);
184     } else if(!strcmp(argv[1], "inform")){
185     rv = runinform(argc, argv);
186     } else if(!strcmp(argv[1], "optimize")){
187     setsignals();
188     rv = runoptimize(argc, argv);
189     } else if(!strcmp(argv[1], "search")){
190     rv = runsearch(argc, argv);
191     } else if(!strcmp(argv[1], "gather")){
192     setsignals();
193     rv = rungather(argc, argv);
194     } else if(!strcmp(argv[1], "purge")){
195     setsignals();
196     rv = runpurge(argc, argv);
197     } else if(!strcmp(argv[1], "extkeys")){
198     setsignals();
199     rv = runextkeys(argc, argv);
200     } else if(!strcmp(argv[1], "draft")){
201     rv = rundraft(argc, argv);
202     } else if(!strcmp(argv[1], "break")){
203     rv = runbreak(argc, argv);
204     } else if(!strcmp(argv[1], "randput")){
205     setsignals();
206     rv = runrandput(argc, argv);
207     } else if(!strcmp(argv[1], "wicked")){
208     setsignals();
209     rv = runwicked(argc, argv);
210     } else if(!strcmp(argv[1], "regression")){
211     setsignals();
212     rv = runregression(argc, argv);
213     } else if(!strcmp(argv[1], "version") || !strcmp(argv[1], "--version")){
214     printf("Hyper Estraier %s on %s\n", est_version, ESTSYSNAME);
215     printf("Copyright (C) 2004-2005 Mikio Hirabayashi.\n");
216     rv = 0;
217     } else {
218     usage();
219     }
220     return rv;
221     }
222    
223    
224     /* print formatted error string and flush the buffer */
225     static void printferror(const char *format, ...){
226     va_list ap;
227     va_start(ap, format);
228     fprintf(stderr, "%s: ERROR: ", g_progname);
229     vfprintf(stderr, format, ap);
230     fputc('\n', stderr);
231     fflush(stderr);
232     va_end(ap);
233     }
234    
235    
236     /* print formatted information string and flush the buffer */
237     static void printfinfo(const char *format, ...){
238     va_list ap;
239     va_start(ap, format);
240     printf("%s: INFO: ", g_progname);
241     vprintf(format, ap);
242     putchar('\n');
243     fflush(stdout);
244     va_end(ap);
245     }
246    
247    
/* callback function for database events
   `msg' is printed verbatim as an information message; it is passed as an argument of
   "%s" so that it is never interpreted as a format string. */
static void dbinform(const char *msg)
{
  printfinfo("%s", msg);
}
252    
253    
254     /* set signal handlers */
255     static void setsignals(void){
256     signal(1, sigtermhandler);
257     signal(2, sigtermhandler);
258     signal(3, sigtermhandler);
259     signal(13, sigtermhandler);
260     signal(15, sigtermhandler);
261     }
262    
263    
264     /* handler of termination signal */
265     static void sigtermhandler(int num){
266     static int tries = 0;
267     if(tries++ <= 4){
268     signal(num, sigtermhandler);
269     } else {
270     signal(num, SIG_DFL);
271     }
272     g_sigterm = TRUE;
273     printfinfo("the termination signal %d catched", num);
274     }
275    
276    
277     /* print the usage and exit */
278     static void usage(void){
279     fprintf(stderr, "%s: command line utility for the core API of Hyper Estraier\n", g_progname);
280     fprintf(stderr, "\n");
281     fprintf(stderr, "usage:\n");
282     fprintf(stderr, " %s put [-cl] db [file]\n", g_progname);
283     fprintf(stderr, " %s out [-cl] db expr\n", g_progname);
284     fprintf(stderr, " %s get db expr\n", g_progname);
285     fprintf(stderr, " %s list db\n", g_progname);
286     fprintf(stderr, " %s uriid db uri\n", g_progname);
287     fprintf(stderr, " %s meta db [name [value]]\n", g_progname);
288     fprintf(stderr, " %s inform db\n", g_progname);
289     fprintf(stderr, " %s optimize [-onp] [-ond] db\n", g_progname);
290     fprintf(stderr, " %s search [-ic enc] [-vu|-va|-vf|-vs|-vh|-vx|-dd] [-gs|-gf|-ga]"
291     " [-ni] [-sf] [-hs] [-attr expr] [-ord expr] [-max num] [-sim id] db [phrase]\n",
292     g_progname);
293     fprintf(stderr, " %s gather [-cl] [-fe|-ft|-fh|-fm] [-fx sufs cmd] [-fz] [-fo]"
294 dpavlin 10 " [-ic enc] [-il lang] [-pc enc] [-pf] [-px name] [-apn] [-sd] [-cm] [-cs num]"
295     " db [file|dir]\n", g_progname);
296 dpavlin 2 fprintf(stderr, " %s purge [-cl] [-fc] db [prefix]\n", g_progname);
297     fprintf(stderr, " %s extkeys [-fc] [-ni] [-kn num] db [prefix]\n", g_progname);
298     fprintf(stderr, " %s draft [-ft|-fh|-fm] [-ic enc] [-il lang] [file]\n", g_progname);
299     fprintf(stderr, " %s break [-ic enc] [-il lang] [-apn] [-wt] [file]\n", g_progname);
300     fprintf(stderr, " %s randput [-ren|-rla|-reu|-ror|-rjp|-rch] [-cs num] db dnum\n",
301     g_progname);
302     fprintf(stderr, " %s wicked db dnum\n", g_progname);
303     fprintf(stderr, " %s regression db\n", g_progname);
304     fprintf(stderr, " %s version\n", g_progname);
305     fprintf(stderr, "\n");
306     exit(1);
307     }
308    
309    
310     /* parse arguments of the put command */
311     static int runput(int argc, char **argv){
312     char *dbname, *filename;
313     int i, rv;
314     dbname = NULL;
315     filename = NULL;
316     for(i = 2; i < argc; i++){
317     if(!dbname && argv[i][0] == '-'){
318     if(!strcmp(argv[i], "-cl")){
319     g_putopts |= ESTPDCLEAN;
320     } else {
321     usage();
322     }
323     } else if(!dbname){
324     dbname = argv[i];
325     } else if(!filename){
326     filename = argv[i];
327     } else {
328     usage();
329     }
330     }
331     if(!dbname) usage();
332     rv = procput(dbname, filename);
333     return rv;
334     }
335    
336    
337     /* parse arguments of the out command */
338     static int runout(int argc, char **argv){
339     char *dbname, *expr;
340     int i, id, rv;
341     dbname = NULL;
342     expr = NULL;
343     for(i = 2; i < argc; i++){
344     if(!dbname && argv[i][0] == '-'){
345     if(!strcmp(argv[i], "-cl")){
346     g_outopts |= ESTODCLEAN;
347     } else {
348     usage();
349     }
350     } else if(!dbname){
351     dbname = argv[i];
352     } else if(!expr){
353     expr = argv[i];
354     } else {
355     usage();
356     }
357     }
358     if(!dbname || !expr) usage();
359     if((id = atoi(expr)) > 0) expr = NULL;
360     rv = procout(dbname, id, expr);
361     return rv;
362     }
363    
364    
/* parse arguments of the get command
   Accepts the database name, an expression naming the document and an optional
   attribute name; no options are defined.  The expression is taken as a document ID
   only when the WHOLE string is a positive decimal number; otherwise it is passed on
   as a URI.  (Parsing with atoi would misread a URI that merely starts with digits,
   such as "3com.html", as the ID 3.)
   The return value is the result of procget. */
static int runget(int argc, char **argv){
  char *dbname = NULL, *expr = NULL, *attr = NULL, *end;
  long num;
  int i, id;
  for(i = 2; i < argc; i++){
    if(!dbname){
      if(argv[i][0] == '-') usage();
      dbname = argv[i];
    } else if(!expr){
      expr = argv[i];
    } else if(!attr){
      attr = argv[i];
    } else {
      usage();
    }
  }
  if(!dbname || !expr) usage();
  id = 0;
  num = strtol(expr, &end, 10);
  /* require at least one digit, no trailing characters, and a value that fits in int */
  if(end != expr && *end == '\0' && num > 0 && num == (long)(int)num){
    id = (int)num;
    expr = NULL;
  }
  return procget(dbname, id, expr, attr);
}
390    
391    
392     /* parse arguments of the list command */
393     static int runlist(int argc, char **argv){
394     char *dbname;
395     int i, rv;
396     dbname = NULL;
397     for(i = 2; i < argc; i++){
398     if(!dbname && argv[i][0] == '-'){
399     usage();
400     } else if(!dbname){
401     dbname = argv[i];
402     } else {
403     usage();
404     }
405     }
406     if(!dbname) usage();
407     rv = proclist(dbname);
408     return rv;
409     }
410    
411    
412     /* parse arguments of the uriid command */
413     static int runuriid(int argc, char **argv){
414     char *dbname, *uri;
415     int i, rv;
416     dbname = NULL;
417     uri = NULL;
418     for(i = 2; i < argc; i++){
419     if(!dbname && argv[i][0] == '-'){
420     usage();
421     } else if(!dbname){
422     dbname = argv[i];
423     } else if(!uri){
424     uri = argv[i];
425     } else {
426     usage();
427     }
428     }
429     if(!dbname || !uri) usage();
430     rv = procuriid(dbname, uri);
431     return rv;
432     }
433    
434    
435     /* parse arguments of the meta command */
436     static int runmeta(int argc, char **argv){
437     char *dbname, *mname, *mvalue;
438     int i, del, rv;
439     dbname = NULL;
440     mname = NULL;
441     mvalue = NULL;
442     del = FALSE;
443     for(i = 2; i < argc; i++){
444     if(!dbname && argv[i][0] == '-'){
445     usage();
446     } else if(!dbname){
447     dbname = argv[i];
448     } else if(!mname){
449     mname = argv[i];
450     } else if(!mvalue){
451     mvalue = argv[i];
452     } else {
453     usage();
454     }
455     }
456     if(!dbname) usage();
457     rv = procmeta(dbname, mname, mvalue);
458     return rv;
459     }
460    
461    
462     /* parse arguments of the inform command */
463     static int runinform(int argc, char **argv){
464     char *dbname;
465     int i, rv;
466     dbname = NULL;
467     for(i = 2; i < argc; i++){
468     if(!dbname && argv[i][0] == '-'){
469     usage();
470     } else if(!dbname){
471     dbname = argv[i];
472     } else {
473     usage();
474     }
475     }
476     if(!dbname) usage();
477     rv = procinform(dbname);
478     return rv;
479     }
480    
481    
482     /* parse arguments of the optimize command */
483     static int runoptimize(int argc, char **argv){
484     char *dbname;
485     int i, rv;
486     dbname = NULL;
487     for(i = 2; i < argc; i++){
488     if(!dbname && argv[i][0] == '-'){
489     if(!strcmp(argv[i], "-onp")){
490     g_optopts |= ESTOPTNOPURGE;
491     } else if(!strcmp(argv[i], "-ond")){
492     g_optopts |= ESTOPTNODBOPT;
493     } else {
494     usage();
495     }
496     } else if(!dbname){
497     dbname = argv[i];
498     } else {
499     usage();
500     }
501     }
502     if(!dbname) usage();
503     rv = procoptimize(dbname);
504     return rv;
505     }
506    
507    
508     /* parse arguments of the search command */
509     static int runsearch(int argc, char **argv){
510     CBDATUM *pbuf;
511     CBLIST *attrs;
512     char *dbname, *ord, *phrase, *tmp;
513 dpavlin 10 int i, max, opts, sim, rv;
514 dpavlin 2 dbname = NULL;
515     ord = NULL;
516     max = SEARCHMAX;
517 dpavlin 10 opts = 0;
518 dpavlin 2 sim = -1;
519     pbuf = cbdatumopen("", 0);
520     cbglobalgc(pbuf, (void (*)(void *))cbdatumclose);
521     attrs = cblistopen();
522     cbglobalgc(attrs, (void (*)(void *))cblistclose);
523     for(i = 2; i < argc; i++){
524     if(!dbname && argv[i][0] == '-'){
525     if(!strcmp(argv[i], "-ic")){
526     if(++i >= argc) usage();
527     g_inputcode = argv[i];
528     } else if(!strcmp(argv[i], "-gs")){
529 dpavlin 10 opts |= ESTCONDSURE;
530 dpavlin 2 } else if(!strcmp(argv[i], "-gf")){
531 dpavlin 10 opts |= ESTCONDFAST;
532 dpavlin 2 } else if(!strcmp(argv[i], "-ga")){
533 dpavlin 10 opts |= ESTCONDAGIT;
534 dpavlin 2 } else if(!strcmp(argv[i], "-ni")){
535 dpavlin 10 opts |= ESTCONDNOIDF;
536 dpavlin 2 } else if(!strcmp(argv[i], "-sf")){
537 dpavlin 10 opts |= ESTCONDSIMPLE;
538 dpavlin 2 } else if(!strcmp(argv[i], "-hs")){
539 dpavlin 10 opts |= ESTCONDSCFB;
540 dpavlin 2 } else if(!strcmp(argv[i], "-vu")){
541     g_viewmode = VM_URI;
542     } else if(!strcmp(argv[i], "-va")){
543     g_viewmode = VM_ATTR;
544     } else if(!strcmp(argv[i], "-vf")){
545     g_viewmode = VM_FULL;
546     } else if(!strcmp(argv[i], "-vs")){
547     g_viewmode = VM_SNIP;
548     } else if(!strcmp(argv[i], "-vh")){
549     g_viewmode = VM_HMRD;
550     } else if(!strcmp(argv[i], "-vx")){
551     g_viewmode = VM_XML;
552     } else if(!strcmp(argv[i], "-dd")){
553     g_viewmode = VM_DUMP;
554     } else if(!strcmp(argv[i], "-attr")){
555     if(++i >= argc) usage();
556     cblistpush(attrs, argv[i], -1);
557     } else if(!strcmp(argv[i], "-ord")){
558     if(++i >= argc) usage();
559     ord = argv[i];
560     } else if(!strcmp(argv[i], "-max")){
561     if(++i >= argc) usage();
562     max = atoi(argv[i]);
563     } else if(!strcmp(argv[i], "-sim")){
564     if(++i >= argc) usage();
565     sim = atoi(argv[i]);
566     } else {
567     usage();
568     }
569     } else if(!dbname){
570     dbname = argv[i];
571     } else {
572     if(cbdatumsize(pbuf) > 0) cbdatumcat(pbuf, " ", 1);
573     cbdatumcat(pbuf, argv[i], -1);
574     }
575     }
576     if(!dbname) usage();
577     if(!(phrase = est_iconv(cbdatumptr(pbuf), -1, g_inputcode, "UTF-8", NULL, NULL))){
578     printferror("%s: unsupported encoding\n", g_inputcode);
579     return 1;
580     }
581     cbstrtrim(phrase);
582     for(i = 0; i < cblistnum(attrs); i++){
583     if((tmp = est_iconv(cblistval(attrs, i, NULL), -1, g_inputcode, "UTF-8", NULL, NULL)) != NULL){
584     cblistover(attrs, i, tmp, -1);
585     free(tmp);
586     }
587     }
588 dpavlin 10 rv = procsearch(dbname, phrase, attrs, ord, max, opts, sim);
589 dpavlin 2 free(phrase);
590     return rv;
591     }
592    
593    
594     /* parse arguments of the gather command */
595     static int rungather(int argc, char **argv){
596     CBLIST *list;
597     const char *elem;
598     char *dbname, *filename;
599     int i, j, rv;
600 dpavlin 10 g_pathattrs = cblistopen();
601     cbglobalgc(g_pathattrs, (void (*)(void *))cblistclose);
602 dpavlin 2 g_xcmdmap = cbmapopenex(MINIBNUM);
603     cbglobalgc(g_xcmdmap, (void (*)(void *))cbmapclose);
604     dbname = NULL;
605     filename = NULL;
606     g_inputcode = NULL;
607     for(i = 2; i < argc; i++){
608     if(!dbname && argv[i][0] == '-'){
609     if(!strcmp(argv[i], "-cl")){
610     g_putopts |= ESTPDCLEAN;
611     } else if(!strcmp(argv[i], "-fe")){
612     g_filefmt = FF_DRAFT;
613     } else if(!strcmp(argv[i], "-ft")){
614     g_filefmt = FF_TEXT;
615     } else if(!strcmp(argv[i], "-fh")){
616     g_filefmt = FF_HTML;
617     } else if(!strcmp(argv[i], "-fm")){
618     g_filefmt = FF_MIME;
619     } else if(!strcmp(argv[i], "-fx")){
620     if((i += 2) >= argc) usage();
621     list = cbsplit(argv[i-1], -1, ",");
622     for(j = 0; j < cblistnum(list); j++){
623     elem = cblistval(list, j, NULL);
624     if(elem[0] != '\0') cbmapput(g_xcmdmap, elem, -1, argv[i], -1, FALSE);
625     }
626     cblistclose(list);
627     } else if(!strcmp(argv[i], "-fz")){
628     g_filefmt = FF_NONE;
629     } else if(!strcmp(argv[i], "-fo")){
630     g_filtorig = TRUE;
631     } else if(!strcmp(argv[i], "-ic")){
632     if(++i >= argc) usage();
633     g_inputcode = argv[i];
634     } else if(!strcmp(argv[i], "-il")){
635     if(++i >= argc) usage();
636     g_inputlang = strtolang(argv[i]);
637     } else if(!strcmp(argv[i], "-pc")){
638     if(++i >= argc) usage();
639     g_pathcode = argv[i];
640     } else if(!strcmp(argv[i], "-pf")){
641     g_pathfull = TRUE;
642 dpavlin 10 } else if(!strcmp(argv[i], "-px")){
643     if(++i >= argc) usage();
644     cblistpush(g_pathattrs, argv[i], -1);
645 dpavlin 2 } else if(!strcmp(argv[i], "-apn")){
646     g_oextmodes |= ESTDBPERFNG;
647     } else if(!strcmp(argv[i], "-sd")){
648     g_stdate = TRUE;
649     } else if(!strcmp(argv[i], "-cm")){
650     g_chkmdate = TRUE;
651     } else if(!strcmp(argv[i], "-cs")){
652     if(++i >= argc) usage();
653     g_cachesize = strtod(argv[i], NULL) * 1024 * 1024;
654     } else {
655     usage();
656     }
657     } else if(!dbname){
658     dbname = argv[i];
659     } else if(!filename){
660     filename = argv[i];
661     } else {
662     usage();
663     }
664     }
665     if(!dbname || !filename) usage();
666     rv = procgather(dbname, filename);
667     return rv;
668     }
669    
670    
671     /* parse arguments of the purge command */
672     static int runpurge(int argc, char **argv){
673     char *dbname, *prefix;
674     int i, rv;
675     dbname = NULL;
676     prefix = NULL;
677     for(i = 2; i < argc; i++){
678     if(!dbname && argv[i][0] == '-'){
679     if(!strcmp(argv[i], "-cl")){
680     g_outopts |= ESTODCLEAN;
681     } else if(!strcmp(argv[i], "-fc")){
682     g_doforce = TRUE;
683     } else {
684     usage();
685     }
686     } else if(!dbname){
687     dbname = argv[i];
688     } else if(!prefix){
689     prefix = argv[i];
690     } else {
691     usage();
692     }
693     }
694     if(!dbname) usage();
695     rv = procpurge(dbname, prefix);
696     return rv;
697     }
698    
699    
700     /* parse arguments of the extkeys command */
701     static int runextkeys(int argc, char **argv){
702     char *dbname, *prefix;
703     int i, ni, rv;
704     dbname = NULL;
705     prefix = NULL;
706     ni = FALSE;
707     for(i = 2; i < argc; i++){
708     if(!dbname && argv[i][0] == '-'){
709     if(!strcmp(argv[i], "-fc")){
710     g_doforce = TRUE;
711     } else if(!strcmp(argv[i], "-ni")){
712     ni = TRUE;
713     } else if(!strcmp(argv[i], "-kn")){
714     if(++i >= argc) usage();
715     g_kwordnum = atoi(argv[i]);
716     } else {
717     usage();
718     }
719     } else if(!dbname){
720     dbname = argv[i];
721     } else if(!prefix){
722     prefix = argv[i];
723     } else {
724     usage();
725     }
726     }
727     if(!dbname || g_kwordnum < 1) usage();
728     rv = procextkeys(dbname, prefix, ni);
729     return rv;
730     }
731    
732    
733     /* parse arguments of the draft command */
734     static int rundraft(int argc, char **argv){
735     char *filename;
736     int i, rv;
737     filename = NULL;
738     g_filefmt = FF_DRAFT;
739     g_inputcode = NULL;
740     for(i = 2; i < argc; i++){
741     if(!filename && argv[i][0] == '-'){
742     if(!strcmp(argv[i], "-ft")){
743     g_filefmt = FF_TEXT;
744     } else if(!strcmp(argv[i], "-fh")){
745     g_filefmt = FF_HTML;
746     } else if(!strcmp(argv[i], "-fm")){
747     g_filefmt = FF_MIME;
748     } else if(!strcmp(argv[i], "-ic")){
749     if(++i >= argc) usage();
750     g_inputcode = argv[i];
751     } else if(!strcmp(argv[i], "-il")){
752     if(++i >= argc) usage();
753     g_inputlang = strtolang(argv[i]);
754     } else {
755     usage();
756     }
757     } else if(!filename){
758     filename = argv[i];
759     } else {
760     usage();
761     }
762     }
763     rv = procdraft(filename);
764     return rv;
765     }
766    
767    
768     /* parse arguments of the break command */
769     static int runbreak(int argc, char **argv){
770     char *filename;
771     int i, wt, rv;
772     filename = NULL;
773     wt = FALSE;
774     for(i = 2; i < argc; i++){
775     if(!filename && argv[i][0] == '-'){
776     if(!strcmp(argv[i], "-ic")){
777     if(++i >= argc) usage();
778     g_inputcode = argv[i];
779     } else if(!strcmp(argv[i], "-il")){
780     if(++i >= argc) usage();
781     g_inputlang = strtolang(argv[i]);
782     } else if(!strcmp(argv[i], "-apn")){
783     g_oextmodes |= ESTDBPERFNG;
784     } else if(!strcmp(argv[i], "-wt")){
785     wt = TRUE;
786     } else {
787     usage();
788     }
789     } else if(!filename){
790     filename = argv[i];
791     } else {
792     usage();
793     }
794     }
795     rv = procbreak(filename, wt);
796     return rv;
797     }
798    
799    
800     /* parse arguments of the randput command */
801     static int runrandput(int argc, char **argv){
802     char *dbname, *dnstr;
803     int i, dnum, rv;
804     dbname = NULL;
805     dnstr = NULL;
806     for(i = 2; i < argc; i++){
807     if(!dbname && argv[i][0] == '-'){
808     if(!strcmp(argv[i], "-ren")){
809     g_rdmode = RD_ENG;
810     } else if(!strcmp(argv[i], "-rla")){
811     g_rdmode = RD_LAT;
812     } else if(!strcmp(argv[i], "-reu")){
813     g_rdmode = RD_EURO;
814     } else if(!strcmp(argv[i], "-ror")){
815     g_rdmode = RD_ORI;
816     } else if(!strcmp(argv[i], "-rjp")){
817     g_rdmode = RD_JPN;
818     } else if(!strcmp(argv[i], "-rch")){
819     g_rdmode = RD_CHAO;
820     } else if(!strcmp(argv[i], "-cs")){
821     if(++i >= argc) usage();
822     g_cachesize = strtod(argv[i], NULL) * 1024 * 1024;
823     } else {
824     usage();
825     }
826     } else if(!dbname){
827     dbname = argv[i];
828     } else if(!dnstr){
829     dnstr = argv[i];
830     } else {
831     usage();
832     }
833     }
834     if(!dbname || !dnstr) usage();
835     if((dnum = atoi(dnstr)) < 1) usage();
836     rv = procrandput(dbname, dnum);
837     return rv;
838     }
839    
840    
/* parse arguments of the wicked command
   Both the database name and the number of documents are required; the number must
   be at least 1.  No options are defined.
   The return value is the result of procwicked. */
static int runwicked(int argc, char **argv){
  char *dbname = NULL, *dnstr = NULL;
  int i, dnum;
  for(i = 2; i < argc; i++){
    if(!dbname){
      if(argv[i][0] == '-') usage();
      dbname = argv[i];
    } else if(!dnstr){
      dnstr = argv[i];
    } else {
      usage();
    }
  }
  if(!dbname || !dnstr) usage();
  dnum = atoi(dnstr);
  if(dnum < 1) usage();
  return procwicked(dbname, dnum);
}
863    
864    
865     /* parse arguments of the regression command */
866     static int runregression(int argc, char **argv){
867     char *dbname;
868     int i, rv;
869     dbname = NULL;
870     for(i = 2; i < argc; i++){
871     if(!dbname && argv[i][0] == '-'){
872     usage();
873     } else if(!dbname){
874     dbname = argv[i];
875     } else {
876     usage();
877     }
878     }
879     if(!dbname) usage();
880     rv = procregression(dbname);
881     return rv;
882     }
883    
884    
885     /* perform the put command */
886     static int procput(const char *dbname, const char *filename){
887     ESTDB *db;
888     ESTDOC *doc;
889     const char *uri;
890     char *draft;
891     int ecode;
892     if(!(draft = cbreadfile(filename, NULL))){
893     printferror("%s: could not open", filename ? filename : "(stdin)");
894     return 1;
895     }
896     if(!(db = est_db_open(dbname, ESTDBWRITER | ESTDBCREAT, &ecode))){
897     printferror("%s: %s", dbname, est_err_msg(ecode));
898     free(draft);
899     return 1;
900     }
901     est_db_set_informer(db, dbinform);
902     doc = est_doc_new_from_draft(draft);
903     if(!est_db_put_doc(db, doc, g_putopts)){
904     printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
905     est_doc_delete(doc);
906     est_db_close(db, &ecode);
907     free(draft);
908     return 1;
909     }
910     if(!(uri = est_doc_attr(doc, ESTDATTRURI))) uri = "";
911     printfinfo("%d (%s): registered", est_doc_id(doc), uri);
912     est_doc_delete(doc);
913     if(!est_db_close(db, &ecode)){
914     printferror("%s: %s", dbname, est_err_msg(ecode));
915     free(draft);
916     return 1;
917     }
918     free(draft);
919     return 0;
920     }
921    
922    
923     /* perform the out command */
924     static int procout(const char *dbname, int id, const char *expr){
925     ESTDB *db;
926     int ecode;
927     if(!(db = est_db_open(dbname, ESTDBWRITER, &ecode))){
928     printferror("%s: %s", dbname, est_err_msg(ecode));
929     return 1;
930     }
931     est_db_set_informer(db, dbinform);
932     if(expr && (id = est_db_uri_to_id(db, expr)) < 1){
933     printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
934     est_db_close(db, &ecode);
935     return 1;
936     }
937     if(!est_db_out_doc(db, id, g_outopts)){
938     printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
939     est_db_close(db, &ecode);
940     return 1;
941     }
942     printfinfo("%d: deleted", id);
943     if(!est_db_close(db, &ecode)){
944     printferror("%s: %s", dbname, est_err_msg(ecode));
945     return 1;
946     }
947     return 0;
948     }
949    
950    
951     /* perform the get command */
952     static int procget(const char *dbname, int id, const char *expr, const char *attr){
953     ESTDB *db;
954     ESTDOC *doc;
955     char *draft;
956     int ecode;
957     if(!(db = est_db_open(dbname, ESTDBREADER | ESTDBLCKNB, &ecode))){
958     printferror("%s: %s", dbname, est_err_msg(ecode));
959     return 1;
960     }
961     if(expr && (id = est_db_uri_to_id(db, expr)) < 1){
962     printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
963     est_db_close(db, &ecode);
964     return 1;
965     }
966     if(attr){
967     if(!(draft = est_db_get_doc_attr(db, id, attr))){
968     printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
969     est_db_close(db, &ecode);
970     return 1;
971     }
972     printf("%s\n", draft);
973     free(draft);
974     } else {
975     if(!(doc = est_db_get_doc(db, id, 0))){
976     printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
977     est_db_close(db, &ecode);
978     return 1;
979     }
980     draft = est_doc_dump_draft(doc);
981     printf("%s", draft);
982     free(draft);
983     est_doc_delete(doc);
984     }
985     if(!est_db_close(db, &ecode)){
986     printferror("%s: %s", dbname, est_err_msg(ecode));
987     return 1;
988     }
989     return 0;
990     }
991    
992    
993     /* perform the list command */
994     static int proclist(const char *dbname){
995     ESTDB *db;
996     ESTDOC *doc;
997     const char *vbuf;
998     int ecode, id;
999     if(!(db = est_db_open(dbname, ESTDBREADER | ESTDBLCKNB, &ecode))){
1000     printferror("%s: %s", dbname, est_err_msg(ecode));
1001     return 1;
1002     }
1003     if(!est_db_iter_init(db)){
1004     printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
1005     est_db_close(db, &ecode);
1006     return 1;
1007     }
1008     while((id = est_db_iter_next(db)) > 0){
1009     if((doc = est_db_get_doc(db, id, ESTGDNOTEXT)) != NULL){
1010     if(!(vbuf = est_doc_attr(doc, ESTDATTRURI))) vbuf = "";
1011     printf("%d\t%s\n", id, vbuf);
1012     est_doc_delete(doc);
1013     }
1014     }
1015     if(!est_db_close(db, &ecode)){
1016     printferror("%s: %s", dbname, est_err_msg(ecode));
1017     return 1;
1018     }
1019     return 0;
1020     }
1021    
1022    
1023     /* perform the uriid command */
1024     static int procuriid(const char *dbname, const char *uri){
1025     ESTDB *db;
1026     int ecode, id;
1027     if(!(db = est_db_open(dbname, ESTDBREADER | ESTDBLCKNB, &ecode))){
1028     printferror("%s: %s", dbname, est_err_msg(ecode));
1029     return 1;
1030     }
1031     if((id = est_db_uri_to_id(db, uri)) == -1){
1032     printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
1033     est_db_close(db, &ecode);
1034     return 1;
1035     }
1036     printf("%d\n", id);
1037     if(!est_db_close(db, &ecode)){
1038     printferror("%s: %s", dbname, est_err_msg(ecode));
1039     return 1;
1040     }
1041     return 0;
1042     }
1043    
1044    
1045     /* perform the meta command */
1046     static int procmeta(const char *dbname, const char *mname, const char *mvalue){
1047     ESTDB *db;
1048     CBLIST *names;
1049     char *vbuf;
1050     int i, ecode;
1051     if(!(db = est_db_open(dbname, mvalue ? (ESTDBWRITER | ESTDBCREAT) : (ESTDBREADER | ESTDBLCKNB),
1052     &ecode))){
1053     printferror("%s: %s", dbname, est_err_msg(ecode));
1054     return 1;
1055     }
1056     if(mname){
1057     if(mvalue){
1058     est_db_add_meta(db, mname, mvalue[0] != '\0' ? mvalue : NULL);
1059     } else {
1060     if((vbuf = est_db_meta(db, mname)) != NULL){
1061     printf("%s\n", vbuf);
1062     free(vbuf);
1063     }
1064     }
1065     } else {
1066     names = est_db_meta_names(db);
1067     for(i = 0; i < cblistnum(names); i++){
1068     printf("%s\n", cblistval(names, i, NULL));
1069     }
1070     cblistclose(names);
1071     }
1072     if(!est_db_close(db, &ecode)){
1073     printferror("%s: %s", dbname, est_err_msg(ecode));
1074     return 1;
1075     }
1076     return 0;
1077     }
1078    
1079    
1080     /* perform the inform command */
1081     static int procinform(const char *dbname){
1082     ESTDB *db;
1083     int ecode;
1084     if(!(db = est_db_open(dbname, ESTDBREADER | ESTDBLCKNB, &ecode))){
1085     printferror("%s: %s", dbname, est_err_msg(ecode));
1086     return 1;
1087     }
1088     printf("number of documents: %d\n", est_db_doc_num(db));
1089     printf("number of words: %d\n", est_db_word_num(db));
1090     printf("file size: %.0f\n", est_db_size(db));
1091     if(!est_db_close(db, &ecode)){
1092     printferror("%s: %s", dbname, est_err_msg(ecode));
1093     return 1;
1094     }
1095     return 0;
1096     }
1097    
1098    
1099     /* perform the optimize command */
1100     static int procoptimize(const char *dbname){
1101     ESTDB *db;
1102     char path[URIBUFSIZ];
1103     int ecode;
1104     time_t curtime;
1105     curtime = time(NULL);
1106     if(!(db = est_db_open(dbname, ESTDBWRITER, &ecode))){
1107     printferror("%s: %s", dbname, est_err_msg(ecode));
1108     return 1;
1109     }
1110     est_db_set_informer(db, dbinform);
1111     sprintf(path, "%s%c%s", dbname, ESTPATHCHR, KWDBNAME);
1112     unlink(path);
1113     if(!est_db_optimize(db, g_optopts)){
1114     printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
1115     est_db_close(db, &ecode);
1116     return 1;
1117     }
1118     if(!est_db_close(db, &ecode)){
1119     printferror("%s: %s", dbname, est_err_msg(ecode));
1120     return 1;
1121     }
1122     curtime = time(NULL) - curtime;
1123     printfinfo("finished successfully: elapsed time: %dh %dm %ds",
1124     (int)(curtime / 3600), (int)((curtime / 60) % 60), (int)(curtime % 60));
1125     return 0;
1126     }
1127    
1128    
1129     /* perform the search command */
1130     static int procsearch(const char *dbname, const char *phrase,
1131 dpavlin 10 const CBLIST *attrs, const char *ord, int max, int opts, int sim){
1132 dpavlin 2 ESTDB *db;
1133     ESTCOND *cond;
1134     ESTDOC *doc;
1135     CURIA *kwdb;
1136     CBDATUM *pbuf;
1137     CBMAP *svmap, *hints, *kwords;
1138     CBLIST *names, *words, *lines;
1139     const char *kbuf, *vbuf, *line;
1140     char *draft, path[URIBUFSIZ], numbuf[NUMBUFSIZ], *word, *pv;
1141     int i, j, ecode, ksiz, vsiz, *res, rnum, id, sc, fin, cnt;
1142     double curtime;
1143     if(!(db = est_db_open(dbname, ESTDBREADER | ESTDBLCKNB, &ecode))){
1144     printferror("%s: %s", dbname, est_err_msg(ecode));
1145     return 1;
1146     }
1147     sprintf(path, "%s%c%s", dbname, ESTPATHCHR, KWDBNAME);
1148     if((kwdb = cropen(path, CR_OREADER, -1, -1)) != NULL)
1149     est_db_set_vectorizer(db, vectorizer, kwdb);
1150     cond = est_cond_new();
1151     if(sim > 0){
1152     svmap = kwdb ? vectorizer(db, sim, kwdb) : NULL;
1153     if(!svmap && (doc = est_db_get_doc(db, sim, 0)) != NULL){
1154 dpavlin 10 svmap = est_db_etch_doc((opts & ESTCONDNOIDF) ? NULL : db, doc, KWORDNUM);
1155 dpavlin 2 est_doc_delete(doc);
1156     }
1157     if(svmap){
1158     pbuf = cbdatumopen(ESTOPSIMILAR, -1);
1159     cbmapiterinit(svmap);
1160     while((kbuf = cbmapiternext(svmap, &ksiz)) != NULL){
1161     vbuf = cbmapget(svmap, kbuf, ksiz, &vsiz);
1162     cbdatumcat(pbuf, " WITH ", -1);
1163     cbdatumcat(pbuf, vbuf, vsiz);
1164     cbdatumcat(pbuf, " ", 1);
1165     cbdatumcat(pbuf, kbuf, ksiz);
1166     }
1167     est_cond_set_phrase(cond, cbdatumptr(pbuf));
1168     cbdatumclose(pbuf);
1169     cbmapclose(svmap);
1170     }
1171     } else {
1172     while(*phrase > '\0' && *phrase <= ' '){
1173     phrase++;
1174     }
1175     if(phrase[0] != '\0' || cblistnum(attrs) < 1) est_cond_set_phrase(cond, phrase);
1176     }
1177     for(i = 0; i < cblistnum(attrs); i++){
1178     est_cond_add_attr(cond, cblistval(attrs, i, NULL));
1179     }
1180     if(ord) est_cond_set_order(cond, ord);
1181     if(max >= 0) est_cond_set_max(cond, max);
1182 dpavlin 10 est_cond_set_options(cond, opts);
1183 dpavlin 2 hints = cbmapopenex(MINIBNUM);
1184     curtime = est_gettimeofday();
1185     res = est_db_search(db, cond, &rnum, hints);
1186     curtime = est_gettimeofday() - curtime;
1187     if(g_viewmode == VM_XML){
1188     xmlprintf("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
1189     xmlprintf("<estresult version=\"%@\">\n", est_version);
1190     xmlprintf("<meta>\n");
1191     xmlprintf("<hit number=\"%@\"/>\n", cbmapget(hints, "", 0, NULL));
1192     cbmapiterinit(hints);
1193     while((kbuf = cbmapiternext(hints, NULL)) != NULL){
1194     if(kbuf[0] == '\0') continue;
1195     vbuf = cbmapget(hints, kbuf, -1, NULL);
1196     xmlprintf("<hit key=\"%@\" number=\"%@\"/>\n", kbuf, vbuf);
1197     }
1198     xmlprintf("<time time=\"%.3f\"/>\n", curtime / 1000.0);
1199     xmlprintf("<total documents=\"%d\" words=\"%d\"/>\n",
1200     est_db_doc_num(db), est_db_word_num(db));
1201     xmlprintf("</meta>\n");
1202     } else {
1203     printf("%s\n", est_border_str());
1204     printf("VERSION\t%s\n", _EST_PROTVER);
1205     printf("NODE\tlocal\n");
1206     printf("HIT\t%s\n", cbmapget(hints, "", 0, NULL));
1207     cbmapiterinit(hints);
1208     cnt = 1;
1209     while((kbuf = cbmapiternext(hints, NULL)) != NULL){
1210     if(kbuf[0] == '\0') continue;
1211     vbuf = cbmapget(hints, kbuf, -1, NULL);
1212     printf("HINT#%d\t%s\t%s\n", cnt, kbuf, vbuf);
1213     cnt++;
1214     }
1215     printf("TIME\t%.3f\n", curtime / 1000.0);
1216     printf("DOCNUM\t%d\n", est_db_doc_num(db));
1217     printf("WORDNUM\t%d\n", est_db_word_num(db));
1218     switch(g_viewmode){
1219     case VM_ID:
1220     printf("VIEW\tID\n");
1221     break;
1222     case VM_URI:
1223     printf("VIEW\tURI\n");
1224     break;
1225     case VM_ATTR:
1226     printf("VIEW\tATTRIBUTE\n");
1227     break;
1228     case VM_FULL:
1229     printf("VIEW\tFULL\n");
1230     break;
1231     case VM_SNIP:
1232     printf("VIEW\tSNIPPET\n");
1233     break;
1234     case VM_HMRD:
1235     printf("VIEW\tHUMAN\n");
1236     break;
1237     }
1238     printf("\n");
1239     if(g_viewmode == VM_ID || g_viewmode == VM_URI ||
1240     g_viewmode == VM_HMRD || g_viewmode == VM_DUMP) printf("%s\n", est_border_str());
1241     }
1242     for(i = 0; i < rnum ; i++){
1243     id = res[i];
1244     sc = est_cond_score(cond, i);
1245     switch(g_viewmode){
1246     case VM_URI:
1247     if((doc = est_db_get_doc(db, id, ESTGDNOTEXT)) != NULL){
1248     if(!(vbuf = est_doc_attr(doc, ESTDATTRURI))) vbuf = "";
1249     printf("%d\t%s\n", id, vbuf);
1250     est_doc_delete(doc);
1251     }
1252     break;
1253     case VM_ATTR:
1254     if((doc = est_db_get_doc(db, id, ESTGDNOTEXT)) != NULL){
1255     if(sc >= 0){
1256     sprintf(numbuf, "%d", sc);
1257     est_doc_add_attr(doc, DATTRSCORE, numbuf);
1258     }
1259     printf("%s\n", est_border_str());
1260     names = est_doc_attr_names(doc);
1261     for(j = 0; j < cblistnum(names); j++){
1262     kbuf = cblistval(names, j, NULL);
1263     vbuf = est_doc_attr(doc, kbuf);
1264     printf("%s=%s\n", kbuf, vbuf);
1265     }
1266     cblistclose(names);
1267     est_doc_delete(doc);
1268     }
1269     printf("\n");
1270     break;
1271     case VM_FULL:
1272     if((doc = est_db_get_doc(db, id, 0)) != NULL){
1273     if(sc >= 0){
1274     sprintf(numbuf, "%d", sc);
1275     est_doc_add_attr(doc, DATTRSCORE, numbuf);
1276     }
1277     printf("%s\n", est_border_str());
1278     draft = est_doc_dump_draft(doc);
1279     printf("%s", draft);
1280     free(draft);
1281     est_doc_delete(doc);
1282     }
1283     break;
1284     case VM_SNIP:
1285     if((doc = est_db_get_doc(db, id, 0)) != NULL){
1286     if(sc >= 0){
1287     sprintf(numbuf, "%d", sc);
1288     est_doc_add_attr(doc, DATTRSCORE, numbuf);
1289     }
1290     printf("%s\n", est_border_str());
1291     names = est_doc_attr_names(doc);
1292     for(j = 0; j < cblistnum(names); j++){
1293     kbuf = cblistval(names, j, NULL);
1294     vbuf = est_doc_attr(doc, kbuf);
1295     printf("%s=%s\n", kbuf, vbuf);
1296     }
1297     cblistclose(names);
1298     kwords = kwdb ? vectorizer(db, id, kwdb) : NULL;
1299     if(!kwords) kwords = est_db_etch_doc(db, doc, KWORDNUM);
1300     if(cbmaprnum(kwords) > 0){
1301     printf("%s=", DATTRKWORDS);
1302     cbmapiterinit(kwords);
1303     for(j = 0; (kbuf = cbmapiternext(kwords, NULL)) != NULL; j++){
1304     if(j > 0) printf(" ");
1305     printf("%s %s", kbuf, cbmapget(kwords, kbuf, -1, NULL));
1306     }
1307     printf("\n");
1308     }
1309     cbmapclose(kwords);
1310     printf("\n");
1311     words = cbmapkeys(hints);
1312     draft = est_doc_make_snippet(doc, words, SNIPWWIDTH, SNIPHWIDTH, SNIPAWIDTH);
1313     printf("%s", draft);
1314     free(draft);
1315     cblistclose(words);
1316     est_doc_delete(doc);
1317     }
1318     break;
1319     case VM_HMRD:
1320     if((doc = est_db_get_doc(db, id, 0)) != NULL){
1321     if(sc >= 0){
1322     sprintf(numbuf, "%d", sc);
1323     est_doc_add_attr(doc, DATTRSCORE, numbuf);
1324     }
1325     printf("\n");
1326     if((vbuf = est_doc_attr(doc, ESTDATTRURI)) != NULL) printf("URI: %s\n", vbuf);
1327     if((vbuf = est_doc_attr(doc, ESTDATTRTITLE)) != NULL) printf("Title: %s\n", vbuf);
1328     printf(" ");
1329     words = cbmapkeys(hints);
1330     draft = est_doc_make_snippet(doc, words, SNIPWWIDTH, SNIPHWIDTH, SNIPAWIDTH);
1331     lines = cbsplit(draft, -1, "\n");
1332     fin = TRUE;
1333     for(j = 0; j < cblistnum(lines); j++){
1334     line = cblistval(lines, j, NULL);
1335     if(line[0] != '\0'){
1336     word = cbmemdup(line, -1);
1337     if((pv = strchr(word, '\t')) != NULL) *pv = '\0';
1338     printf("%s", word);
1339     free(word);
1340     fin = TRUE;
1341     } else if(fin){
1342     printf(" ... ");
1343     fin = FALSE;
1344     }
1345     }
1346     cblistclose(lines);
1347     free(draft);
1348     cblistclose(words);
1349     printf("\n\n");
1350     est_doc_delete(doc);
1351     }
1352     break;
1353     case VM_XML:
1354     if((doc = est_db_get_doc(db, id, 0)) != NULL){
1355     if(sc >= 0){
1356     sprintf(numbuf, "%d", sc);
1357     est_doc_add_attr(doc, DATTRSCORE, numbuf);
1358     }
1359     if(!(vbuf = est_doc_attr(doc, ESTDATTRURI))) vbuf = "";
1360     xmlprintf("<document id=\"%d\" uri=\"%@\">\n", id, vbuf);
1361     names = est_doc_attr_names(doc);
1362     for(j = 0; j < cblistnum(names); j++){
1363     kbuf = cblistval(names, j, NULL);
1364     if(!strcmp(kbuf, ESTDATTRID) || !strcmp(kbuf, ESTDATTRURI)) continue;
1365     vbuf = est_doc_attr(doc, kbuf);
1366     xmlprintf("<attribute name=\"%@\" value=\"%@\"/>\n", kbuf, vbuf);
1367     }
1368     cblistclose(names);
1369     kwords = kwdb ? vectorizer(db, id, kwdb) : NULL;
1370     if(!kwords) kwords = est_db_etch_doc(db, doc, KWORDNUM);
1371     if(cbmaprnum(kwords) > 0){
1372     xmlprintf("<vector>");
1373     cbmapiterinit(kwords);
1374     for(j = 0; (kbuf = cbmapiternext(kwords, NULL)) != NULL; j++){
1375     xmlprintf("<element key=\"%@\" number=\"%@\"/>",
1376     kbuf, cbmapget(kwords, kbuf, -1, NULL));
1377     }
1378     xmlprintf("</vector>\n");
1379     }
1380     cbmapclose(kwords);
1381     words = cbmapkeys(hints);
1382     draft = est_doc_make_snippet(doc, words, SNIPWWIDTH, SNIPHWIDTH, SNIPAWIDTH);
1383     lines = cbsplit(draft, -1, "\n");
1384     fin = TRUE;
1385     xmlprintf("<snippet>");
1386     for(j = 0; j < cblistnum(lines); j++){
1387     line = cblistval(lines, j, NULL);
1388     if(line[0] != '\0'){
1389     word = cbmemdup(line, -1);
1390     if((pv = strchr(word, '\t')) != NULL){
1391     *pv = '\0';
1392     pv++;
1393     xmlprintf("<key normal=\"%@\">%@</key>", pv, word);
1394     } else {
1395     xmlprintf("%@", word);
1396     }
1397     free(word);
1398     fin = TRUE;
1399     } else if(fin){
1400     xmlprintf("<delimiter/>");
1401     fin = FALSE;
1402     }
1403     }
1404     xmlprintf("</snippet>\n");
1405     cblistclose(lines);
1406     free(draft);
1407     cblistclose(words);
1408     xmlprintf("</document>\n");
1409     est_doc_delete(doc);
1410     }
1411     break;
1412     case VM_DUMP:
1413     if((doc = est_db_get_doc(db, id, 0)) != NULL){
1414     if(sc >= 0){
1415     sprintf(numbuf, "%d", sc);
1416     est_doc_add_attr(doc, DATTRSCORE, numbuf);
1417     }
1418     if(!(vbuf = est_doc_attr(doc, ESTDATTRURI))) vbuf = "";
1419     sprintf(path, "%08d%cest", id, ESTEXTCHR);
1420     printf("%s\t%s\n", path, vbuf);
1421     draft = est_doc_dump_draft(doc);
1422     if(!(cbwritefile(path, draft, -1))) printferror("%s: could not open", path);
1423     free(draft);
1424     est_doc_delete(doc);
1425     }
1426     break;
1427     default:
1428     printf("%d\n", id);
1429     break;
1430     }
1431     }
1432     if(g_viewmode == VM_XML){
1433     xmlprintf("</estresult>\n");
1434     } else {
1435     printf("%s:END\n", est_border_str());
1436     }
1437     free(res);
1438     cbmapclose(hints);
1439     est_cond_delete(cond);
1440     if(kwdb) crclose(kwdb);
1441     if(!est_db_close(db, &ecode)){
1442     printferror("%s: %s", dbname, est_err_msg(ecode));
1443     return 1;
1444     }
1445     return 0;
1446     }
1447    
1448    
1449     /* perform the gather command */
1450     static int procgather(const char *dbname, const char *filename){
1451     ESTDB *db;
1452 dpavlin 10 CBLIST *list, *clist, *attrs;
1453 dpavlin 2 FILE *ifp;
1454     const char *tmp;
1455     char *line, *path;
1456     int i, err, ecode;
1457     time_t curtime;
1458     struct stat sbuf;
1459     curtime = time(NULL);
1460     err = FALSE;
1461     if(stat(filename, &sbuf) != -1 && S_ISDIR(sbuf.st_mode)){
1462     printfinfo("reading list from the directory: %s", filename);
1463     if((db = est_db_open(dbname, ESTDBWRITER | ESTDBCREAT | g_oextmodes, &ecode)) != NULL){
1464     est_db_set_informer(db, dbinform);
1465     if(g_cachesize > 0){
1466     if(g_cachesize > CACHEMAX) g_cachesize = CACHEMAX;
1467     est_db_set_cache_size(db, g_cachesize, -1, -1);
1468     }
1469     list = cblistopen();
1470     cblistunshift(list, filename, -1);
1471     while((line = cblistshift(list, NULL)) != NULL){
1472     if(stat(line, &sbuf) != -1 && S_ISDIR(sbuf.st_mode) && (clist = cbdirlist(line)) != NULL){
1473     cblistsort(clist);
1474     for(i = cblistnum(clist) - 1; i >= 0; i--){
1475     tmp = cblistval(clist, i, NULL);
1476     if(!strcmp(tmp, ESTCDIRSTR) || !strcmp(tmp, ESTPDIRSTR)) continue;
1477     path = cbsprintf("%s%c%s", line, ESTPATHCHR, tmp);
1478     cblistunshift(list, path, -1);
1479     free(path);
1480     }
1481     cblistclose(clist);
1482     } else {
1483 dpavlin 10 if(!doputdoc(db, line, NULL)){
1484 dpavlin 2 printferror("%s: %s", line, est_err_msg(est_db_error(db)));
1485     err = TRUE;
1486     }
1487     }
1488     free(line);
1489     if(err || g_sigterm) break;
1490     }
1491     cblistclose(list);
1492     if(!est_db_close(db, &ecode)){
1493     printferror("%s: %s", dbname, est_err_msg(ecode));
1494     err = TRUE;
1495     }
1496     } else {
1497     printferror("%s: %s", dbname, est_err_msg(ecode));
1498     err = TRUE;
1499     }
1500     } else {
1501     if(!strcmp(filename, "-")){
1502     ifp = stdin;
1503     printfinfo("reading list from the standard input", filename);
1504     } else if((ifp = fopen(filename, "rb")) != NULL){
1505     printfinfo("reading list from the file: %s", filename);
1506     } else {
1507     printferror("%s: could not open", filename);
1508     return 1;
1509     }
1510     if((db = est_db_open(dbname, ESTDBWRITER | ESTDBCREAT | g_oextmodes, &ecode)) != NULL){
1511     est_db_set_informer(db, dbinform);
1512     if(g_cachesize > 0){
1513     if(g_cachesize > CACHEMAX) g_cachesize = CACHEMAX;
1514     est_db_set_cache_size(db, g_cachesize, -1, -1);
1515     }
1516     while((line = fgetl(ifp)) != NULL){
1517 dpavlin 10 if(line[0] == '\0'){
1518     free(line);
1519     continue;
1520 dpavlin 2 }
1521 dpavlin 10 if(cblistnum(g_pathattrs) > 0){
1522     attrs = cbsplit(line, -1, "\t");
1523     path = cblistshift(attrs, NULL);
1524     if(!doputdoc(db, path, attrs)){
1525     printferror("%s: %s", path, est_err_msg(est_db_error(db)));
1526     err = TRUE;
1527     }
1528     free(path);
1529     cblistclose(attrs);
1530     } else {
1531     if(!doputdoc(db, line, NULL)){
1532     printferror("%s: %s", line, est_err_msg(est_db_error(db)));
1533     err = TRUE;
1534     }
1535     }
1536 dpavlin 2 free(line);
1537     if(err || g_sigterm) break;
1538     }
1539     if(!est_db_close(db, &ecode)){
1540     printferror("%s: %s", dbname, est_err_msg(ecode));
1541     err = TRUE;
1542     }
1543     } else {
1544     printferror("%s: %s", dbname, est_err_msg(ecode));
1545     err = TRUE;
1546     }
1547     if(ifp != stdin) fclose(ifp);
1548     }
1549     curtime = time(NULL) - curtime;
1550     if(!err) printfinfo("finished successfully: elapsed time: %dh %dm %ds",
1551     (int)(curtime / 3600), (int)((curtime / 60) % 60), (int)(curtime % 60));
1552     return err ? 1 : 0;
1553     }
1554    
1555    
1556     /* perform the purge command */
1557     static int procpurge(const char *dbname, const char *prefix){
1558     ESTDB *db;
1559     ESTCOND *cond;
1560     ESTDOC *doc;
1561     const char *luri;
1562     char *attr, *path;
1563     int i, ecode, err, *res, rnum;
1564     time_t curtime;
1565     struct stat sbuf;
1566     curtime = time(NULL);
1567     if(!(db = est_db_open(dbname, ESTDBWRITER, &ecode))){
1568     printferror("%s: %s", dbname, est_err_msg(ecode));
1569     return 1;
1570     }
1571     est_db_set_informer(db, dbinform);
1572     cond = est_cond_new();
1573     attr = cbsprintf("%s STRBW %s", DATTRLPATH, prefix ? prefix : "");
1574     est_cond_add_attr(cond, attr);
1575     res = est_db_search(db, cond, &rnum, NULL);
1576     err = FALSE;
1577     for(i = 0; i < rnum; i++){
1578     if(!(doc = est_db_get_doc(db, res[i], ESTGDNOTEXT))) continue;
1579     if((luri = est_doc_attr(doc, DATTRLPATH)) != NULL){
1580     if(g_doforce){
1581     if(est_db_out_doc(db, res[i], g_outopts)){
1582     printfinfo("%d (%s): deleted", res[i], luri);
1583     } else {
1584     printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
1585     err = TRUE;
1586     }
1587     } else if((path = urltopath(luri)) != NULL){
1588     if(stat(path, &sbuf) != -1){
1589     printfinfo("%s: passed", luri);
1590     } else {
1591     if(est_db_out_doc(db, res[i], g_outopts)){
1592     printfinfo("%d (%s): deleted", res[i], luri);
1593     } else {
1594     printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
1595     err = TRUE;
1596     }
1597     }
1598     } else {
1599     printfinfo("%s: ignored", luri);
1600     }
1601     } else {
1602     printfinfo("(%d): ignored", res[i]);
1603     }
1604     est_doc_delete(doc);
1605     if(err || g_sigterm) break;
1606     }
1607     free(res);
1608     est_cond_delete(cond);
1609     free(attr);
1610     if(!est_db_close(db, &ecode)){
1611     printferror("%s: %s", dbname, est_err_msg(ecode));
1612     return 1;
1613     }
1614     curtime = time(NULL) - curtime;
1615     if(!err) printfinfo("finished successfully: elapsed time: %dh %dm %ds",
1616     (int)(curtime / 3600), (int)((curtime / 60) % 60), (int)(curtime % 60));
1617     return err ? 1 : 0;
1618     }
1619    
1620    
1621     /* perform the extkeys command */
1622     static int procextkeys(const char *dbname, const char *prefix, int ni){
1623     ESTDB *db;
1624     ESTCOND *cond;
1625     ESTDOC *doc;
1626     CURIA *kwdb;
1627     CBMAP *kwords;
1628     const char *uri;
1629     char path[URIBUFSIZ], *attr, *mbuf;
1630     int i, ecode, err, *res, rnum, msiz;
1631     time_t curtime;
1632     curtime = time(NULL);
1633     if(!(db = est_db_open(dbname, ESTDBWRITER, &ecode))){
1634     printferror("%s: %s", dbname, est_err_msg(ecode));
1635     return 1;
1636     }
1637     est_db_set_informer(db, dbinform);
1638     if(!ni && (!prefix || prefix[0] == '\0')) est_db_fill_key_cache(db);
1639     sprintf(path, "%s%c%s", dbname, ESTPATHCHR, KWDBNAME);
1640     if(!(kwdb = cropen(path, CR_OWRITER | CR_OCREAT, KWDBBNUM, KWDBDNUM))){
1641     printferror("%s: the keyword database has some errors", dbname);
1642     est_db_close(db, &ecode);
1643     return 1;
1644     }
1645     crsetalign(kwdb, -4);
1646     cond = est_cond_new();
1647     attr = cbsprintf("%s STRBW %s", DATTRLPATH, prefix ? prefix : "");
1648     est_cond_add_attr(cond, attr);
1649     res = est_db_search(db, cond, &rnum, NULL);
1650     err = FALSE;
1651     for(i = 0; i < rnum; i++){
1652     if(!g_doforce && crvsiz(kwdb, (char *)&(res[i]), sizeof(int)) > 0){
1653     printfinfo("%d: passed", res[i]);
1654     continue;
1655     }
1656     if(!(doc = est_db_get_doc(db, res[i], 0))) continue;
1657     if(!(uri = est_doc_attr(doc, ESTDATTRURI))) uri = "";
1658     kwords = est_db_etch_doc(ni ? NULL : db, doc, g_kwordnum);
1659     mbuf = cbmapdump(kwords, &msiz);
1660     fflush(stdout);
1661     if(crput(kwdb, (char *)&(res[i]), sizeof(int), mbuf, msiz, CR_DOVER)){
1662     printfinfo("%d (%s): extracted", res[i], uri);
1663     } else {
1664     printferror("%s: the keyword database has some errors", dbname);
1665     err = TRUE;
1666     }
1667     free(mbuf);
1668     cbmapclose(kwords);
1669     est_doc_delete(doc);
1670     if(err || g_sigterm) break;
1671     }
1672     free(res);
1673     est_cond_delete(cond);
1674     free(attr);
1675     if(!crclose(kwdb)){
1676     printferror("%s: the keyword database has some errors", dbname);
1677     err = TRUE;
1678     }
1679     if(!est_db_close(db, &ecode)){
1680     printferror("%s: %s", dbname, est_err_msg(ecode));
1681     return 1;
1682     }
1683     curtime = time(NULL) - curtime;
1684     if(!err) printfinfo("finished successfully: elapsed time: %dh %dm %ds",
1685     (int)(curtime / 3600), (int)((curtime / 60) % 60), (int)(curtime % 60));
1686     return err ? 1 : 0;
1687     }
1688    
1689    
1690     /* perform the draft command */
1691     static int procdraft(const char *filename){
1692     ESTDOC *doc;
1693     char *buf, *draft;
1694     int size;
1695     if(!(buf = cbreadfile(filename, &size))){
1696     printferror("%s: could not open", filename ? filename : "(stdin)");
1697     return 1;
1698     }
1699     switch(g_filefmt){
1700     case FF_TEXT:
1701     doc = est_doc_new_from_text(buf, size, g_inputcode, g_inputlang);
1702     break;
1703     case FF_HTML:
1704     doc = est_doc_new_from_html(buf, size, g_inputcode, g_inputlang);
1705     break;
1706     case FF_MIME:
1707     doc = est_doc_new_from_mime(buf, size, g_inputcode, g_inputlang);
1708     break;
1709     default:
1710     doc = est_doc_new_from_draft_enc(buf, size, g_inputcode);
1711     break;
1712     }
1713     draft = est_doc_dump_draft(doc);
1714     printf("%s", draft);
1715     free(draft);
1716     est_doc_delete(doc);
1717     free(buf);
1718     return 0;
1719     }
1720    
1721    
1722     /* perform the break command */
1723     static int procbreak(const char *filename, int wt){
1724     CBLIST *words;
1725     char *str, *phrase;
1726     int i;
1727     if(filename && filename[0] == '@'){
1728     str = cbmemdup(filename + 1, -1);
1729     } else if(!(str = cbreadfile(filename, NULL))){
1730     printferror("%s: could not open", filename ? filename : "(stdin)");
1731     return 1;
1732     }
1733     if(!(phrase = est_iconv(str, -1, g_inputcode, "UTF-8", NULL, NULL))){
1734     printferror("%s: unsupported encoding\n", g_inputcode);
1735     free(str);
1736     return 1;
1737     }
1738     g_inputcode = NULL;
1739     words = cblistopen();
1740     if(g_oextmodes & ESTDBPERFNG){
1741     est_break_text_perfng(phrase, words, TRUE, wt);
1742     } else {
1743     est_break_text(phrase, words, TRUE, wt);
1744     }
1745     for(i = 0; i < cblistnum(words); i++){
1746     printf("%s\n", cblistval(words, i, NULL));
1747     }
1748     cblistclose(words);
1749     free(phrase);
1750     free(str);
1751     return 0;
1752     }
1753    
1754    
1755     /* perform the randput command */
1756     static int procrandput(const char *dbname, int dnum){
1757     ESTDB *db;
1758     ESTDOC *doc;
1759     const char *mode;
1760     char uri[URIBUFSIZ];
1761     int i, ecode, err;
1762     time_t curtime;
1763     curtime = time(NULL);
1764     if(!(db = est_db_open(dbname, ESTDBWRITER | ESTDBCREAT | ESTDBTRUNC, &ecode))){
1765     printferror("%s: %s", dbname, est_err_msg(ecode));
1766     return 1;
1767     }
1768     est_db_set_informer(db, dbinform);
1769     if(g_cachesize > 0){
1770     if(g_cachesize > CACHEMAX) g_cachesize = CACHEMAX;
1771     est_db_set_cache_size(db, g_cachesize, -1, -1);
1772     }
1773     err = FALSE;
1774     for(i = 0; i < dnum; i++){
1775     doc = est_doc_new_from_chaos(RDOCCNUM, RDOCSNUM, g_rdmode);
1776     sprintf(uri, "file:///tmp/randput-%08d-%05d.est", i + 1, getpid());
1777     est_doc_add_attr(doc, ESTDATTRURI, uri);
1778     if(est_db_put_doc(db, doc, 0)){
1779     if(!(mode = est_doc_attr(doc, "mode"))) mode = "unknown";
1780     printfinfo("%d (%s) (%s): registered", est_doc_id(doc), uri, mode);
1781     } else {
1782     printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
1783     err = TRUE;
1784     }
1785     est_doc_delete(doc);
1786     if(err || g_sigterm) break;
1787     }
1788     if(!est_db_close(db, &ecode)){
1789     printferror("%s: %s", dbname, est_err_msg(ecode));
1790     return 1;
1791     }
1792     curtime = time(NULL) - curtime;
1793     if(!err) printfinfo("finished successfully: elapsed time: %dh %dm %ds",
1794     (int)(curtime / 3600), (int)((curtime / 60) % 60), (int)(curtime % 60));
1795     return err ? 1 : 0;
1796     }
1797    
1798    
1799     /* perform the wicked command */
1800     static int procwicked(const char *dbname, int dnum){
1801     ESTDB *db;
1802     ESTDOC *doc;
1803     ESTCOND *cond;
1804     CBLIST *words;
1805     char uri[URIBUFSIZ], *oper, *value, *first, *second, *phrase;
1806     int i, j, ecode, err, *res, rnum;
1807     double rnd;
1808     time_t curtime;
1809     curtime = time(NULL);
1810     if(!(db = est_db_open(dbname, ESTDBWRITER | ESTDBCREAT | ESTDBTRUNC, &ecode))){
1811     printferror("%s: %s", dbname, est_err_msg(ecode));
1812     return 1;
1813     }
1814     est_db_set_informer(db, dbinform);
1815     est_db_set_cache_size(db, 1024 * 1024 * 128, 1024, 256);
1816     est_db_set_special_cache(db, ESTDATTRURI, 128);
1817     err = FALSE;
1818     for(i = 0; i < dnum; i++){
1819     rnd = est_random();
1820     if((int)(rnd * INT_MAX) % dnum < 5){
1821     rnd = est_random();
1822     if(rnd < 0.3){
1823     if(!est_db_close(db, &ecode)){
1824     printferror("%s: %s", dbname, est_err_msg(ecode));
1825     return 1;
1826     }
1827     if(!(db = est_db_open(dbname, ESTDBWRITER, &ecode))){
1828     printferror("%s: %s", dbname, est_err_msg(ecode));
1829     return 1;
1830     }
1831     est_db_set_informer(db, dbinform);
1832     est_db_set_cache_size(db, 1024 * 1024 * 128, 1024, 256);
1833     est_db_set_special_cache(db, ESTDATTRURI, i / 10 + 1);
1834     } else if(rnd < 0.5){
1835     if(!est_db_optimize(db, (int)(est_random() * INT_MAX) % 2 == 0) ? ESTOPTNOPURGE : 0)
1836     err = TRUE;
1837     } else if(rnd < 0.8){
1838     if(!est_db_flush(db, 1024)) err = TRUE;
1839     } else {
1840     if(!est_db_sync(db)) err = TRUE;
1841     }
1842     } else if(rnd < 0.05){
1843     if(est_db_out_doc(db, (int)(est_random() * INT_MAX) % (i + 1) + 1,
1844     ((int)(est_random() * INT_MAX) % 2 == 0) ? ESTODCLEAN : 0)){
1845     printfinfo("[%d:%d]: out", i + 1, est_db_doc_num(db));
1846     } else if(est_db_error(db) != ESTENOITEM){
1847     err = TRUE;
1848     }
1849     } else if(rnd < 0.1){
1850     if((value = est_db_get_doc_attr(db, (int)(est_random() * INT_MAX) % (i + 1) + 1,
1851     ESTDATTRURI)) != NULL){
1852     printfinfo("[%d:%d]: attr: %s", i + 1, est_db_doc_num(db), value);
1853     free(value);
1854     }
1855     } else if(rnd < 0.25){
1856     rnd = est_random();
1857     if(rnd < 0.5){
1858     oper = " OR ";
1859     } else if(rnd < 0.7){
1860     oper = " AND ";
1861     } else if(rnd < 0.8){
1862     oper = " NOTAND ";
1863     } else if(rnd < 0.9){
1864     oper = " ";
1865     } else {
1866     oper = "";
1867     }
1868     first = est_random_str(5, (int)(est_random() * INT_MAX) % RD_RAND);
1869     second = est_random_str(2, (int)(est_random() * INT_MAX) % RD_RAND);
1870     phrase = cbsprintf("%s%s%s", first, oper, second);
1871     cond = est_cond_new();
1872     est_cond_set_phrase(cond, phrase);
1873     if(est_random() < 0.25) est_cond_add_attr(cond, "@uri STREW 0.est");
1874     if(est_random() < 0.25) est_cond_set_order(cond, "@uri STRD");
1875     if(est_random() < 0.05) est_cond_set_options(cond, ESTCONDSURE | ESTCONDSCFB);
1876     if(est_random() < 0.05) est_cond_set_options(cond, ESTCONDAGIT | ESTCONDNOIDF);
1877     res = est_db_search(db, cond, &rnum, NULL);
1878     printfinfo("[%d:%d]: search: %d hits", i + 1, est_db_doc_num(db), rnum);
1879     if(est_random() < 0.05){
1880     for(j = 0; j < rnum && j < 100; j++){
1881     if((doc = est_db_get_doc(db, res[j], 0)) != NULL){
1882     if(i % 10 == 0){
1883     free(est_doc_cat_texts(doc));
1884     free(est_doc_dump_draft(doc));
1885     words = cblistopen();
1886     cblistpush(words, "vw", -1);
1887     cblistpush(words, "xy", -1);
1888     cblistpush(words, "z", -1);
1889     free(est_doc_make_snippet(doc, words, 100, 10, 10));
1890     cblistclose(words);
1891     }
1892     est_doc_delete(doc);
1893     } else if(est_db_error(db) != ESTENOITEM){
1894     err = TRUE;
1895     }
1896     }
1897     }
1898     free(res);
1899     est_cond_delete(cond);
1900     free(phrase);
1901     free(first);
1902     free(second);
1903     } else {
1904     doc = est_doc_new_from_chaos(100, 3, est_random() < 0.5 ? RD_EURO : RD_RAND);
1905     if(est_random() < 0.2){
1906     sprintf(uri, "file:///tmp/wicked-%08d-%05d.est",
1907     (int)(est_random() * INT_MAX) % (i + 1) + 1, getpid());
1908     } else {
1909     sprintf(uri, "file:///tmp/wicked-%08d-%05d.est", i + 1, getpid());
1910     }
1911     est_doc_add_attr(doc, ESTDATTRURI, uri);
1912     if(!est_db_put_doc(db, doc, est_random() < 0.5 ? ESTPDCLEAN : 0)) err = TRUE;
1913     est_doc_delete(doc);
1914     }
1915     if(err || g_sigterm) break;
1916     }
1917     if(err) printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
1918     if(!est_db_close(db, &ecode)){
1919     printferror("%s: %s", dbname, est_err_msg(ecode));
1920     return 1;
1921     }
1922     curtime = time(NULL) - curtime;
1923     if(!err) printfinfo("finished successfully: elapsed time: %dh %dm %ds",
1924     (int)(curtime / 3600), (int)((curtime / 60) % 60), (int)(curtime % 60));
1925     return err ? 1 : 0;
1926     }
1927    
1928    
1929     /* perform the regression command */
1930     static int procregression(const char *dbname){
1931     ESTDB *db;
1932     ESTDOC *doc;
1933     ESTCOND *cond;
1934     int i, ecode, err, *res, rnum;
1935     time_t curtime;
1936     curtime = time(NULL);
1937     printfinfo("# opening the database");
1938     if(!(db = est_db_open(dbname, ESTDBWRITER | ESTDBCREAT | ESTDBTRUNC, &ecode))){
1939     printferror("%s: %s", dbname, est_err_msg(ecode));
1940     return 1;
1941     }
1942     est_db_set_informer(db, dbinform);
1943     err = FALSE;
1944     if(!err){
1945     printfinfo("# checking registration of small documents");
1946     doc = est_doc_new();
1947     est_doc_add_attr(doc, ESTDATTRURI, "file:///small/one");
1948     est_doc_add_text(doc, "One!");
1949     est_doc_add_hidden_text(doc, "(Check it out, come on!)");
1950     if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE;
1951     est_doc_delete(doc);
1952     doc = est_doc_new();
1953     est_doc_add_attr(doc, ESTDATTRURI, "file:///small/two");
1954     est_doc_add_text(doc, "Two!!");
1955     est_doc_add_hidden_text(doc, "(Check it out, come on!)");
1956     if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE;
1957     est_doc_delete(doc);
1958     doc = est_doc_new();
1959     est_doc_add_attr(doc, ESTDATTRURI, "file:///small/three");
1960     est_doc_add_text(doc, "Three!!!");
1961     est_doc_add_hidden_text(doc, "(Check it out, come on!)");
1962     if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE;
1963     est_doc_delete(doc);
1964     doc = est_doc_new();
1965     est_doc_add_attr(doc, ESTDATTRURI, "file:///empty");
1966     if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE;
1967     est_doc_delete(doc);
1968     }
1969     if(!err){
1970     printfinfo("# checking registration of an english document");
1971     doc = est_doc_new();
1972     est_doc_add_attr(doc, ESTDATTRURI, "file:///english");
1973     est_doc_add_attr(doc, ESTDATTRTITLE, "Hyper Estraier");
1974     est_doc_add_text(doc, "% This is a displayed sentence. ;-)");
1975     est_doc_add_text(doc, "Hyper Estraier is a full-text search system for communities.");
1976     est_doc_add_text(doc, "A little suffering is good for the soul.");
1977     est_doc_add_text(doc, "They have been at a great feast of languages, and stolen the scraps.");
1978     est_doc_add_hidden_text(doc, "(Give it up, Yo! Give it up, Yo!)");
1979     est_doc_add_hidden_text(doc, "% This is a hidden sentence. :-<");
1980     est_doc_add_hidden_text(doc, "(Check it out, come on!)");
1981     est_doc_add_hidden_text(doc, "");
1982     if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE;
1983     est_doc_delete(doc);
1984     }
1985     if(!err){
1986     printfinfo("# checking registration of a japanese document");
1987     doc = est_doc_new();
1988     est_doc_add_attr(doc, ESTDATTRURI, "file:///japanese");
1989     est_doc_add_attr(doc, ESTDATTRTITLE, "\xe5\xb9\xb3\xe6\x9e\x97\xe5\xb9\xb9\xe9\x9b\x84");
1990     est_doc_add_text(doc, "\xe6\x9c\xac\xe6\x97\xa5\xe3\x81\xaf\xe6\x99\xb4\xe5\xa4\xa9\xe3"
1991     "\x81\xaa\xe3\x82\x8a\xe3\x80\x82");
1992     est_doc_add_text(doc, "\xe6\x9c\x95\xe3\x81\xaf\xe5\x9b\xbd\xe5\xae\xb6\xe7\xac\xac\xe4"
1993     "\xb8\x80\xe3\x81\xae\xe4\xb8\x8b\xe5\x83\x95\xe3\x81\xa7\xe3\x81"
1994     "\x82\xe3\x82\x8b\xe3\x80\x82");
1995     est_doc_add_hidden_text(doc, "(Check it out, come on!)");
1996     if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE;
1997     est_doc_delete(doc);
1998     }
1999     if(!err){
2000     printfinfo("# checking duplication of documents");
2001     doc = est_doc_new();
2002     est_doc_add_attr(doc, ESTDATTRURI, "file:///duplication");
2003     est_doc_add_text(doc, "Gamble, you gatta chance to make a Rumble!");
2004     est_doc_add_hidden_text(doc, "(Check it out, come on!)");
2005     if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE;
2006     est_doc_delete(doc);
2007     doc = est_doc_new();
2008     est_doc_add_attr(doc, ESTDATTRURI, "file:///duplication");
2009     est_doc_add_text(doc, "bring back hey, one more time!");
2010     est_doc_add_hidden_text(doc, "(Check it out, come on!)");
2011     if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE;
2012     est_doc_delete(doc);
2013     if(est_db_doc_num(db) != 7){
2014     printferror("%s: the number of documents is invalid", dbname);
2015     err = TRUE;
2016     }
2017     }
2018     if(!err){
2019     printfinfo("# checking search for unfixed documents");
2020     cond = est_cond_new();
2021     est_cond_set_phrase(cond, "check");
2022     res = est_db_search(db, cond, &rnum, NULL);
2023     if(rnum != 6){
2024     printferror("%s: the number of result is invalid", dbname);
2025     err = TRUE;
2026     }
2027     free(res);
2028     est_cond_delete(cond);
2029     }
2030     if(!err){
2031     printfinfo("# checking partial flushing of the index");
2032     if(!est_db_flush(db, 32)) err = TRUE;
2033     }
2034     if(!err){
2035     printfinfo("# checking deletion with cleaning of a document");
2036     if(!est_db_out_doc(db, 1, ESTODCLEAN)) err = TRUE;
2037     }
2038     if(!err){
2039     printfinfo("# checking synchronization");
2040     if(!est_db_sync(db)) err = TRUE;
2041     }
2042     if(!err){
2043     printfinfo("# checking deletion without cleaning of a document");
2044     if(!est_db_out_doc(db, 2, 0)) err = TRUE;
2045     }
2046     if(!err){
2047     printfinfo("# checking word search");
2048     cond = est_cond_new();
2049     est_cond_set_phrase(cond, "check it AND on");
2050     res = est_db_search(db, cond, &rnum, NULL);
2051     if(rnum != 5){
2052     printferror("%s: the number of result is invalid", dbname);
2053     err = TRUE;
2054     }
2055     free(res);
2056     est_cond_set_phrase(cond, "RUMBLE OR \xe3\x80\x82");
2057     res = est_db_search(db, cond, &rnum, NULL);
2058     if(rnum != 1){
2059     printferror("%s: the number of result is invalid", dbname);
2060     err = TRUE;
2061     }
2062     free(res);
2063     est_cond_delete(cond);
2064     }
2065     if(!err){
2066     printfinfo("# checking attribute search");
2067     cond = est_cond_new();
2068     est_cond_add_attr(cond, "@uri !ISTRINC SMaLl");
2069     res = est_db_search(db, cond, &rnum, NULL);
2070     if(rnum != est_db_doc_num(db) - 1){
2071     printferror("%s: the number of result is invalid", dbname);
2072     err = TRUE;
2073     }
2074     free(res);
2075     est_cond_delete(cond);
2076     cond = est_cond_new();
2077     est_cond_add_attr(cond, "@uri STRBW file://");
2078     est_cond_add_attr(cond, "@title STRINC \xe5\xb9\xb3");
2079     res = est_db_search(db, cond, &rnum, NULL);
2080     if(rnum != 1){
2081     printferror("%s: the number of result is invalid", dbname);
2082     err = TRUE;
2083     }
2084     free(res);
2085     est_cond_delete(cond);
2086     }
2087     if(!err){
2088     printfinfo("# checking combined search");
2089     cond = est_cond_new();
2090     est_cond_set_phrase(cond, "\xe5\x9b\xbd\xe5\xae\xb6\xe7\xac\xac\xe4\xb8\x80");
2091     est_cond_add_attr(cond, "@uri");
2092     est_cond_set_order(cond, "@title");
2093     res = est_db_search(db, cond, &rnum, NULL);
2094     if(rnum != 1){
2095     printferror("%s: the number of result is invalid", dbname);
2096     err = TRUE;
2097     }
2098     free(res);
2099     est_cond_delete(cond);
2100     cond = est_cond_new();
2101     est_cond_set_phrase(cond, "one | \xe3\x80\x82 | check & check it ! hogehoge");
2102     est_cond_add_attr(cond, "@uri STRBW file://");
2103     est_cond_set_order(cond, "@title STRD");
2104     est_cond_set_options(cond, ESTCONDSURE | ESTCONDNOIDF | ESTCONDSIMPLE);
2105     res = est_db_search(db, cond, &rnum, NULL);
2106     if(rnum != 4){
2107     printferror("%s: the number of result is invalid", dbname);
2108     err = TRUE;
2109     }
2110     free(res);
2111     est_cond_delete(cond);
2112     }
2113     if(!err){
2114     printfinfo("# checking optimization");
2115     if(!est_db_optimize(db, 0)) err = TRUE;
2116     cond = est_cond_new();
2117     est_cond_set_phrase(cond, "check");
2118     res = est_db_search(db, cond, &rnum, NULL);
2119     if(rnum != 4){
2120     printferror("%s: the number of result is invalid", dbname);
2121     err = TRUE;
2122     }
2123     free(res);
2124     est_cond_delete(cond);
2125     }
2126     if(!err){
2127     printfinfo("# checking traversal access");
2128     cond = est_cond_new();
2129     est_cond_set_phrase(cond, "[UVSET]");
2130     res = est_db_search(db, cond, &rnum, NULL);
2131     for(i = 0; i < rnum; i++){
2132     if(!(doc = est_db_get_doc(db, res[i], 0))){
2133     printferror("%s: a document cannot be retrieved", dbname);
2134     err = TRUE;
2135     break;
2136     }
2137     est_doc_delete(doc);
2138     }
2139     free(res);
2140     est_cond_delete(cond);
2141     }
2142     if(err) printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
2143     printfinfo("# closing the database");
2144     if(!est_db_close(db, &ecode)){
2145     printferror("%s: %s", dbname, est_err_msg(ecode));
2146     return 1;
2147     }
2148     curtime = time(NULL) - curtime;
2149     if(!err) printfinfo("# finished successfully: elapsed time: %dh %dm %ds",
2150     (int)(curtime / 3600), (int)((curtime / 60) % 60), (int)(curtime % 60));
2151     return err ? 1 : 0;
2152     }
2153    
2154    
/* output escaped string */
/* Small printf work-alike writing to the standard output.
   A conversion is `%', then any run of the characters "0123456789 .+-", then one conversion
   character.  `s', `d', `c', `o', `u', `x', `X', `e', `E', `f', `g', `G' are passed on to
   printf, and `%%' prints a percent sign.  Two extensions exist:
     `%@' prints a string with `&', `<', `>' and `"' replaced by XML entities and with the
          control characters 0x01-0x08 and 0x0e-0x1f dropped;
     `%?' prints a string URL-encoded, where only ASCII alphanumerics and "_-." stay verbatim.
   A NULL string argument is printed as "(null)".  Any other conversion character is skipped
   without consuming an argument.  A lone `%' at the end of the format is ignored. */
static void xmlprintf(const char *format, ...){
  va_list ap;
  const char *tmp;
  char cbuf[32];
  unsigned char c;
  int cblen;
  va_start(ap, format);
  while(*format != '\0'){
    if(*format != '%'){
      putchar(*format);
      format++;
      continue;
    }
    cbuf[0] = '%';
    cblen = 1;
    format++;
    /* keep two bytes free: one for the conversion character and one for the terminator */
    while(*format != '\0' && strchr("0123456789 .+-", *format) &&
          cblen < (int)sizeof(cbuf) - 2){
      cbuf[cblen++] = *format;
      format++;
    }
    /* an unfinished conversion at the end of the format: do not step over the terminator */
    if(*format == '\0') break;
    cbuf[cblen++] = *format;
    cbuf[cblen] = '\0';
    switch(*format){
    case 's':
      tmp = va_arg(ap, char *);
      if(!tmp) tmp = "(null)";
      printf(cbuf, tmp);
      break;
    case 'd':
      printf(cbuf, va_arg(ap, int));
      break;
    case 'c':
      /* a character argument is promoted to int, which is also what `%c' expects */
      printf(cbuf, va_arg(ap, int));
      break;
    case 'o': case 'u': case 'x': case 'X':
      printf(cbuf, va_arg(ap, unsigned int));
      break;
    case 'e': case 'E': case 'f': case 'g': case 'G':
      printf(cbuf, va_arg(ap, double));
      break;
    case '@':
      tmp = va_arg(ap, char *);
      if(!tmp) tmp = "(null)";
      for(; *tmp != '\0'; tmp++){
        c = *(const unsigned char *)tmp;
        switch(c){
        case '&': printf("&amp;"); break;
        case '<': printf("&lt;"); break;
        case '>': printf("&gt;"); break;
        case '"': printf("&quot;"); break;
        default:
          /* tab, line feed, vertical tab, form feed and carriage return pass through */
          if(!(c <= 0x08 || (c >= 0x0e && c <= 0x1f))) putchar(c);
          break;
        }
      }
      break;
    case '?':
      tmp = va_arg(ap, char *);
      if(!tmp) tmp = "(null)";
      for(; *tmp != '\0'; tmp++){
        c = *(const unsigned char *)tmp;
        if((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') ||
           (c >= '0' && c <= '9') || strchr("_-.", c)){
          putchar(c);
        } else {
          printf("%%%02X", c);
        }
      }
      break;
    case '%':
      putchar('%');
      break;
    default:
      break;
    }
    format++;
  }
  va_end(ap);
}
2229    
2230    
2231     /* get the language value */
2232     static int strtolang(const char *str){
2233     if(!cbstricmp(str, "en")) return ESTLANGEN;
2234     if(!cbstricmp(str, "ja")) return ESTLANGJA;
2235     if(!cbstricmp(str, "zh")) return ESTLANGZH;
2236     if(!cbstricmp(str, "ko")) return ESTLANGKO;
2237     return ESTLANGMISC;
2238     }
2239    
2240    
/* read a line */
/* Read one line from the stream `ifp' into a newly allocated, NUL-terminated buffer.
   Carriage returns are dropped.  The line ends at a line feed, at a NUL byte in the input, or
   at the end of the stream; the line feed itself is not part of the result.
   The return value is the buffer, or NULL if no byte could be read.  The buffer comes from
   `cbrealloc'; presumably the caller releases it with `free' (confirm against the callers). */
static char *fgetl(FILE *ifp){
  char *buf;
  int c, len, blen;
  buf = NULL;
  len = 0;
  blen = 1024;
  while((c = fgetc(ifp)) != EOF){
    /* allocate on the first byte and grow only when the buffer is full, rather than calling
       the allocator once per input byte */
    if(!buf || blen <= len){
      if(blen <= len) blen *= 2;
      buf = cbrealloc(buf, blen + 1);
    }
    if(c == '\n') c = '\0';
    if(c != '\r') buf[len++] = c;
    if(c == '\0') break;
  }
  if(!buf) return NULL;
  /* `len' never exceeds `blen', and `blen + 1' bytes are allocated */
  buf[len] = '\0';
  return buf;
}
2259    
2260    
2261     /* register a document */
2262 dpavlin 10 static int doputdoc(ESTDB *db, const char *path, const CBLIST *attrs){
2263 dpavlin 2 ESTDOC *doc, *edoc;
2264     const char *uri, *vbuf, *xcmd;
2265     char *dbuf, *tbuf;
2266 dpavlin 10 int i, err, fmt, id, dsiz;
2267 dpavlin 2 time_t emdate, fmdate;
2268     struct stat sbuf;
2269     xcmd = NULL;
2270     if(cbmaprnum(g_xcmdmap) > 0){
2271     cbmapiterinit(g_xcmdmap);
2272     while((vbuf = cbmapiternext(g_xcmdmap, NULL)) != NULL){
2273     if(cbstrbwimatch(path, vbuf)){
2274     xcmd = cbmapget(g_xcmdmap, vbuf, -1, NULL);
2275     break;
2276     }
2277     }
2278     }
2279     fmt = g_filefmt;
2280     if(g_filefmt == FF_NONE && !xcmd) return TRUE;
2281     if(g_filefmt == FF_AUTO){
2282     if(cbstrbwimatch(path, ESTEXTSTR "est")){
2283     fmt = FF_DRAFT;
2284     } else if(cbstrbwimatch(path, ESTEXTSTR "txt") || cbstrbwimatch(path, ESTEXTSTR "text") ||
2285     cbstrbwimatch(path, ESTEXTSTR "asc")){
2286     fmt = FF_TEXT;
2287     } else if(cbstrbwimatch(path, ESTEXTSTR "html") || cbstrbwimatch(path, ESTEXTSTR "htm") ||
2288     cbstrbwimatch(path, ESTEXTSTR "xhtml") || cbstrbwimatch(path, ESTEXTSTR "xht")){
2289     fmt = FF_HTML;
2290     } else if(cbstrbwimatch(path, ESTEXTSTR "eml") || cbstrbwimatch(path, ESTEXTSTR "mime") ||
2291     cbstrbwimatch(path, ESTEXTSTR "mht") || cbstrbwimatch(path, ESTEXTSTR "mhtml")){
2292     fmt = FF_MIME;
2293     } else if(!xcmd){
2294     return TRUE;
2295     }
2296     }
2297     if(stat(path, &sbuf) == -1 || !S_ISREG(sbuf.st_mode) || !(uri = pathtourl(path))){
2298     printferror("%s: could not open", path);
2299     return TRUE;
2300     }
2301     emdate = -1;
2302     if(g_chkmdate && (id = est_db_uri_to_id(db, uri)) > 0 &&
2303     (edoc = est_db_get_doc(db, id, ESTGDNOTEXT)) != NULL){
2304     if((vbuf = est_doc_attr(edoc, ESTDATTRMDATE)) != NULL) emdate = cbstrmktime(vbuf);
2305     est_doc_delete(edoc);
2306     }
2307     if(g_stdate && emdate >= 0 && emdate >= sbuf.st_mtime){
2308     printfinfo("%s: passed", path);
2309     return TRUE;
2310     }
2311     if(g_filtorig){
2312     dbuf = cbmemdup("", 0);
2313     dsiz = 0;
2314     } else {
2315     if(!(dbuf = cbreadfile(path, &dsiz))){
2316     printferror("%s: could not open", path);
2317     return TRUE;
2318     }
2319     }
2320     if(xcmd){
2321     doc = est_doc_new_with_xcmd(dbuf, dsiz, path, xcmd, est_db_name(db),
2322     g_inputcode, g_inputlang);
2323     } else {
2324     switch(fmt){
2325     case FF_TEXT:
2326     doc = est_doc_new_from_text(dbuf, dsiz, g_inputcode, g_inputlang);
2327     break;
2328     case FF_HTML:
2329     doc = est_doc_new_from_html(dbuf, dsiz, g_inputcode, g_inputlang);
2330     break;
2331     case FF_MIME:
2332     doc = est_doc_new_from_mime(dbuf, dsiz, g_inputcode, g_inputlang);
2333     break;
2334     default:
2335     doc = est_doc_new_from_draft_enc(dbuf, dsiz, g_inputcode);
2336     break;
2337     }
2338     }
2339 dpavlin 10 if(attrs){
2340     for(i = 0; i < cblistnum(g_pathattrs) && i < cblistnum(attrs); i++){
2341     est_doc_add_attr(doc, cblistval(g_pathattrs, i, NULL), cblistval(attrs, i, NULL));
2342     }
2343     }
2344 dpavlin 2 if(!est_doc_attr(doc, ESTDATTRURI)) est_doc_add_attr(doc, ESTDATTRURI, uri);
2345     est_doc_add_attr(doc, DATTRLPATH, uri);
2346     est_doc_add_attr(doc, DATTRLFILE, urltofile(uri));
2347     uri = est_doc_attr(doc, ESTDATTRURI);
2348     if(g_stdate){
2349     tbuf = cbdatestrwww(sbuf.st_ctime, 0);
2350     est_doc_add_attr(doc, ESTDATTRCDATE, tbuf);
2351     free(tbuf);
2352     tbuf = cbdatestrwww(sbuf.st_mtime, 0);
2353     est_doc_add_attr(doc, ESTDATTRMDATE, tbuf);
2354     free(tbuf);
2355     }
2356     if(g_chkmdate && emdate == -1 && (id = est_db_uri_to_id(db, uri)) > 0 &&
2357     (edoc = est_db_get_doc(db, id, ESTGDNOTEXT)) != NULL){
2358     if((vbuf = est_doc_attr(edoc, ESTDATTRMDATE)) != NULL) emdate = cbstrmktime(vbuf);
2359     est_doc_delete(edoc);
2360     }
2361     fmdate = -1;
2362     if(g_chkmdate && (vbuf = est_doc_attr(doc, ESTDATTRMDATE)) != NULL) fmdate = cbstrmktime(vbuf);
2363     err = FALSE;
2364     if(emdate >= 0 && emdate >= fmdate){
2365     printfinfo("%s: passed", path);
2366     } else if(est_db_put_doc(db, doc, g_putopts)){
2367     printfinfo("%d (%s): registered", est_doc_id(doc), uri);
2368     } else {
2369     printferror("%s: %s", est_db_name(db), est_err_msg(est_db_error(db)));
2370     err = TRUE;
2371     }
2372     est_doc_delete(doc);
2373     free(dbuf);
2374     return err ? FALSE : TRUE;
2375     }
2376    
2377    
2378     /* get the URL of a path */
2379     static const char *pathtourl(const char *path){
2380     static char pbuf[URIBUFSIZ];
2381     const char *elem;
2382     char *wp, *ebuf;
2383     CBLIST *list;
2384     int i, esiz;
2385     if(strlen(path) >= URIBUFSIZ / 4) return NULL;
2386     if(g_pathcode){
2387     wp = est_realpath(path);
2388     if(!(ebuf = est_iconv(wp, -1, g_pathcode, "UTF-8", &esiz, NULL))){
2389     esiz = strlen(wp);
2390     ebuf = cbmemdup(wp, esiz);
2391     }
2392     list = cbsplit(ebuf, esiz, ESTPATHSTR);
2393     free(ebuf);
2394     free(wp);
2395     for(i = 0; i < cblistnum(list); i++){
2396     elem = cblistval(list, i, &esiz);
2397     if((ebuf = est_iconv(elem, esiz, "UTF-8", g_pathcode, &esiz, NULL)) != NULL){
2398     cblistover(list, i, ebuf, esiz);
2399     free(ebuf);
2400     }
2401     }
2402     } else {
2403     wp = est_realpath(path);
2404     list = cbsplit(wp, -1, ESTPATHSTR);
2405     free(wp);
2406     }
2407     wp = pbuf;
2408     wp += sprintf(wp, "file://");
2409     for(i = 0; i < cblistnum(list); i++){
2410     elem = cblistval(list, i, NULL);
2411     if(elem[0] == '\0') continue;
2412     if(i < 1 && ((elem[0] >= 'A' && elem[0] <= 'Z') || (elem[0] >= 'a' && elem[0] <= 'z')) &&
2413     elem[1] == ':'){
2414     wp += sprintf(wp, "%c|", elem[0]);
2415     continue;
2416     }
2417     ebuf = cburlencode(elem, -1);
2418     wp += sprintf(wp, "/%s", ebuf);
2419     free(ebuf);
2420     }
2421     *wp = '\0';
2422     cblistclose(list);
2423     return pbuf;
2424     }
2425    
2426    
2427     /* get the file name of a URL */
2428     static const char *urltofile(const char *uri){
2429     static char pbuf[URIBUFSIZ];
2430     const char *rp;
2431     char *dbuf, *ebuf;
2432     int dsiz;
2433     if(g_pathfull){
2434     if((rp = strstr(uri, "//")) != NULL){
2435     rp += 2;
2436     if(((rp[0] >= 'A' && rp[0] <= 'Z') || (rp[0] >= 'a' && rp[0] <= 'z')) &&
2437     rp[1] == '|' && rp[2] == '/') rp += 2;
2438     } else {
2439     rp = uri;
2440     }
2441     } else if((rp = strrchr(uri, '/')) != NULL){
2442     rp++;
2443     } else {
2444     rp = uri;
2445     }
2446     dbuf = cburldecode(rp, &dsiz);
2447     if((ebuf = est_iconv(dbuf, dsiz, g_pathcode ? g_pathcode : "ISO-8859-1", "UTF-8", NULL, NULL))
2448     != NULL){
2449     sprintf(pbuf, "%s", ebuf);
2450     free(ebuf);
2451     } else {
2452     sprintf(pbuf, "%s", rp);
2453     }
2454     free(dbuf);
2455     return pbuf;
2456     }
2457    
2458    
2459     /* geth the local path of a URL */
2460     static char *urltopath(const char *uri){
2461     static char pbuf[URIBUFSIZ];
2462     const char *elem;
2463     char *wp, *dbuf;
2464     CBLIST *list;
2465     int i;
2466     if(!cbstrfwimatch(uri, "file://")) return NULL;
2467     if(!(uri = strchr(uri + 7, '/'))) return NULL;
2468     list = cbsplit(uri, -1, "/");
2469     wp = pbuf;
2470     for(i = 0; i < cblistnum(list); i++){
2471     elem = cblistval(list, i, NULL);
2472     if(elem[0] == '\0') continue;
2473     if(i < 1 && ((elem[0] >= 'A' && elem[0] <= 'Z') || (elem[0] >= 'a' && elem[0] <= 'z')) &&
2474     elem[1] == '|'){
2475     wp += sprintf(wp, "%c:", elem[0]);
2476     continue;
2477     }
2478     dbuf = cburldecode(elem, NULL);
2479     wp += sprintf(wp, "%c%s", ESTPATHCHR, dbuf);
2480     free(dbuf);
2481     }
2482     *wp = '\0';
2483     cblistclose(list);
2484     return pbuf;
2485     }
2486    
2487    
2488     /* create a vector of keywords */
2489     static CBMAP *vectorizer(void *db, int id, void *kwdb){
2490     CBMAP *kwords;
2491     char *mbuf;
2492     int msiz;
2493     if(!(mbuf = crget((CURIA *)kwdb, (char *)&id, sizeof(int), 0, -1, &msiz))) return NULL;
2494     kwords = cbmapload(mbuf, msiz);
2495     free(mbuf);
2496     return kwords;
2497     }
2498    
2499    
2500     /* create a document object with an outer command */
2501     static ESTDOC *est_doc_new_with_xcmd(const char *buf, int size, const char *path,
2502     const char *xcmd, const char *tmpdir,
2503     const char *penc, int plang){
2504     ESTDOC *doc;
2505     const char *pv, *ext;
2506     char iname[URIBUFSIZ], oname[URIBUFSIZ], ebuf[URIBUFSIZ], cmd[URIBUFSIZ];
2507     char *rbuf, numbuf[NUMBUFSIZ];
2508     int fmt, rsiz;
2509     assert(buf && size >= 0 && path && xcmd && tmpdir);
2510     sprintf(ebuf, "ESTORIGFILE=%s", path);
2511     ext = NULL;
2512     if((pv = strrchr(path, ESTPATHCHR)) != NULL) path = pv;
2513     if((pv = strrchr(path, ESTEXTCHR)) != NULL) ext = pv;
2514     if(!ext) ext = "";
2515     sprintf(iname, "%s%cxcmd-in-%08d%s", tmpdir, ESTPATHCHR, getpid(), ext);
2516     sprintf(oname, "%s%cxcmd-out-%08d%cest", tmpdir, ESTPATHCHR, getpid(), ESTEXTCHR);
2517     fmt = FF_DRAFT;
2518     if(cbstrfwmatch(xcmd, "T@")){
2519     fmt = FF_TEXT;
2520     xcmd += 2;
2521     } else if(cbstrfwmatch(xcmd, "H@")){
2522     fmt = FF_HTML;
2523     xcmd += 2;
2524     } else if(cbstrfwmatch(xcmd, "M@")){
2525     fmt = FF_MIME;
2526     xcmd += 2;
2527     }
2528     sprintf(cmd, "%s %s %s", xcmd, iname, oname);
2529     if(!g_filtorig) cbwritefile(iname, buf, size);
2530     putenv(ebuf);
2531     system(cmd);
2532     if((rbuf = cbreadfile(oname, &rsiz)) != NULL){
2533     switch(fmt){
2534     case FF_TEXT:
2535     doc = est_doc_new_from_text(rbuf, rsiz, penc, plang);
2536     break;
2537     case FF_HTML:
2538     doc = est_doc_new_from_html(rbuf, rsiz, penc, plang);
2539     break;
2540     case FF_MIME:
2541     doc = est_doc_new_from_mime(rbuf, rsiz, penc, plang);
2542     break;
2543     default:
2544     doc = est_doc_new_from_draft_enc(rbuf, rsiz, penc);
2545     break;
2546     }
2547     free(rbuf);
2548     } else {
2549     doc = est_doc_new();
2550     }
2551     if(fmt != FF_DRAFT){
2552     sprintf(numbuf, "%d", size);
2553     est_doc_add_attr(doc, ESTDATTRSIZE, numbuf);
2554     est_doc_add_attr(doc, ESTDATTRTYPE, est_ext_type(ext));
2555     }
2556     unlink(oname);
2557     unlink(iname);
2558     return doc;
2559     }
2560    
2561    
2562     /* create a document object from draft data in another encoding */
2563     static ESTDOC *est_doc_new_from_draft_enc(const char *buf, int size, const char *enc){
2564     ESTDOC *doc;
2565     char *rbuf;
2566     assert(buf);
2567     if(enc && (rbuf = est_iconv(buf, size, enc, "UTF-8", NULL, NULL)) != NULL){
2568     doc = est_doc_new_from_draft(rbuf);
2569     free(rbuf);
2570     } else {
2571     doc = est_doc_new_from_draft(buf);
2572     }
2573     return doc;
2574     }
2575    
2576    
2577     /* create a document object from plain text */
2578     static ESTDOC *est_doc_new_from_text(const char *buf, int size, const char *penc, int plang){
2579     ESTDOC *doc;
2580     CBLIST *lines;
2581     CBDATUM *datum;
2582     const char *enc, *text, *line;
2583     char *nbuf, numbuf[NUMBUFSIZ];
2584     int i;
2585     assert(buf);
2586     doc = est_doc_new();
2587     enc = penc ? penc : est_enc_name(buf, size, plang);
2588     if(!strcmp(enc, "UTF-8")){
2589     nbuf = NULL;
2590     text = buf;
2591     } else {
2592     text = buf;
2593     nbuf = est_iconv(buf, size, enc, "UTF-8", NULL, NULL);
2594     if(nbuf) text = nbuf;
2595     }
2596     lines = cbsplit(text, -1, "\n");
2597     datum = cbdatumopen("", 0);
2598     for(i = 0; i < CB_LISTNUM(lines); i++){
2599     line = CB_LISTVAL(lines, i, NULL);
2600     while(*line == ' ' || *line == '\t' || *line == '\r'){
2601     line++;
2602     }
2603     if(line[0] == '\0'){
2604     est_doc_add_text(doc, CB_DATUMPTR(datum));
2605     cbdatumsetsize(datum, 0);
2606     } else {
2607     cbdatumcat(datum, " ", 1);
2608     cbdatumcat(datum, line, -1);
2609     }
2610     }
2611     est_doc_add_text(doc, CB_DATUMPTR(datum));
2612     cbdatumclose(datum);
2613     cblistclose(lines);
2614     est_doc_add_attr(doc, ESTDATTRTYPE, "text/plain");
2615     sprintf(numbuf, "%d", size);
2616     est_doc_add_attr(doc, ESTDATTRSIZE, numbuf);
2617     if(nbuf) free(nbuf);
2618     return doc;
2619     }
2620    
2621    
2622     /* create a document object from HTML */
2623     static ESTDOC *est_doc_new_from_html(const char *buf, int size, const char *penc, int plang){
2624     ESTDOC *doc;
2625     CBLIST *elems;
2626     CBMAP *attrs;
2627     CBDATUM *datum;
2628     const char *enc, *html, *elem, *next, *name, *content;
2629     char *nbuf, *nenc, *rbuf, *lbuf, numbuf[NUMBUFSIZ];
2630     int i, esiz;
2631     assert(buf);
2632     doc = est_doc_new();
2633     enc = est_enc_name(buf, size, plang);
2634     html = NULL;
2635     nbuf = NULL;
2636     if(!strcmp(enc, "UTF-16") || !strcmp(enc, "UTF-16BE") || !strcmp(enc, "UTF-16LE")){
2637     nbuf = est_iconv(buf, size, enc, "UTF-8", NULL, NULL);
2638     } else if(!strcmp(enc, "US-ASCII")){
2639     nbuf = NULL;
2640     } else {
2641     if((nenc = penc ? cbmemdup(penc, -1) : est_html_enc(buf)) != NULL){
2642     if(cbstricmp(nenc, "UTF-8")){
2643     nbuf = est_iconv(buf, size, nenc, "UTF-8", NULL, NULL);
2644     if(!nbuf) nbuf = est_iconv(buf, size, enc, "UTF-8", NULL, NULL);
2645     }
2646     free(nenc);
2647     } else {
2648     nbuf = est_iconv(buf, size, enc, "UTF-8", NULL, NULL);
2649     }
2650     }
2651     if(nbuf) html = nbuf;
2652     if(!html) html = buf;
2653     datum = cbdatumopen("", 0);
2654     elems = cbxmlbreak(html, TRUE);
2655     for(i = 0; i < CB_LISTNUM(elems); i++){
2656     elem = CB_LISTVAL2(elems, i, &esiz);
2657     if(!(next = cblistval(elems, i + 1, NULL))) next = "";
2658     if(elem[0] == '<'){
2659     if(cbstrfwimatch(elem, "<meta")){
2660     attrs = cbxmlattrs(elem);
2661     name = cbmapget(attrs, "name", -1, NULL);
2662     if(!name) name = cbmapget(attrs, "Name", -1, NULL);
2663     if(!name) name = cbmapget(attrs, "NAME", -1, NULL);
2664     if(!name) name = cbmapget(attrs, "http-equiv", -1, NULL);
2665     if(!name) name = cbmapget(attrs, "Http-equiv", -1, NULL);
2666     if(!name) name = cbmapget(attrs, "Http-Equiv", -1, NULL);
2667     if(!name) name = cbmapget(attrs, "HTTP-EQUIV", -1, NULL);
2668     content = cbmapget(attrs, "content", -1, NULL);
2669     if(!content) content = cbmapget(attrs, "Content", -1, NULL);
2670     if(!content) content = cbmapget(attrs, "CONTENT", -1, NULL);
2671     if(name && content){
2672     lbuf = cbmemdup(name, -1);
2673     cbstrtolower(lbuf);
2674     cbstrsqzspc(lbuf);
2675     if(!strcmp(lbuf, "author")){
2676     if(strchr(content, '&')){
2677     rbuf = est_html_raw_text(content);
2678     est_doc_add_attr(doc, ESTDATTRAUTHOR, rbuf);
2679     free(rbuf);
2680     } else {
2681     est_doc_add_attr(doc, ESTDATTRAUTHOR, content);
2682     }
2683     }
2684     if(name[0] != '@'){
2685     if(strchr(content, '&')){
2686     rbuf = est_html_raw_text(content);
2687     est_doc_add_attr(doc, lbuf, rbuf);
2688     free(rbuf);
2689     } else {
2690     est_doc_add_attr(doc, lbuf, content);
2691     }
2692     }
2693     free(lbuf);
2694     }
2695     cbmapclose(attrs);
2696     } else if(cbstrfwimatch(elem, "<title") && next[0] != '\0' && next[0] != '<'){
2697     if(strchr(next, '&')){
2698     rbuf = est_html_raw_text(next);
2699     est_doc_add_attr(doc, ESTDATTRTITLE, rbuf);
2700     est_doc_add_hidden_text(doc, rbuf);
2701     free(rbuf);
2702     } else {
2703     est_doc_add_attr(doc, ESTDATTRTITLE, next);
2704     est_doc_add_hidden_text(doc, next);
2705     }
2706     i++;
2707     } else if(cbstrfwimatch(elem, "<style") || cbstrfwimatch(elem, "<script")){
2708     i++;
2709     } else if(cbstrfwimatch(elem, "<h1") || cbstrfwimatch(elem, "<h2") ||
2710     cbstrfwimatch(elem, "<h3") || cbstrfwimatch(elem, "<h4") ||
2711     cbstrfwimatch(elem, "<h5") || cbstrfwimatch(elem, "<h6") ||
2712     cbstrfwimatch(elem, "<p>") || cbstrfwimatch(elem, "<p ") ||
2713     cbstrfwimatch(elem, "<div") || cbstrfwimatch(elem, "<hr") ||
2714     cbstrfwimatch(elem, "<ul") || cbstrfwimatch(elem, "<ol") ||
2715     cbstrfwimatch(elem, "<dl") || cbstrfwimatch(elem, "<li") ||
2716     cbstrfwimatch(elem, "<dt") || cbstrfwimatch(elem, "<dd") ||
2717     cbstrfwimatch(elem, "<th") || cbstrfwimatch(elem, "<td") ||
2718     cbstrfwimatch(elem, "<pre")){
2719     if(strchr(CB_DATUMPTR(datum), '&')){
2720     rbuf = est_html_raw_text(CB_DATUMPTR(datum));
2721     est_doc_add_text(doc, rbuf);
2722     free(rbuf);
2723     } else {
2724     est_doc_add_text(doc, CB_DATUMPTR(datum));
2725     }
2726     cbdatumsetsize(datum, 0);
2727     }
2728     } else {
2729     cbdatumcat(datum, " ", -1);
2730     cbdatumcat(datum, elem, esiz);
2731     }
2732     }
2733     cblistclose(elems);
2734     if(strchr(CB_DATUMPTR(datum), '&')){
2735     rbuf = est_html_raw_text(CB_DATUMPTR(datum));
2736     est_doc_add_text(doc, rbuf);
2737     free(rbuf);
2738     } else {
2739     est_doc_add_text(doc, CB_DATUMPTR(datum));
2740     }
2741     cbdatumclose(datum);
2742     if(nbuf) free(nbuf);
2743     est_doc_add_attr(doc, ESTDATTRTYPE, "text/html");
2744     sprintf(numbuf, "%d", size);
2745     est_doc_add_attr(doc, ESTDATTRSIZE, numbuf);
2746     return doc;
2747     }
2748    
2749    
2750     /* get the encoding of an HTML string */
2751     static char *est_html_enc(const char *str){
2752     CBLIST *elems;
2753     CBMAP *attrs;
2754     const char *elem, *equiv, *content;
2755     char *enc, *pv;
2756     int i;
2757     assert(str);
2758     elems = cbxmlbreak(str, TRUE);
2759     for(i = 0; i < CB_LISTNUM(elems); i++){
2760     elem = CB_LISTVAL(elems, i, NULL);
2761     if(elem[0] != '<' || !cbstrfwimatch(elem, "<meta")) continue;
2762     enc = NULL;
2763     attrs = cbxmlattrs(elem);
2764     equiv = cbmapget(attrs, "http-equiv", -1, NULL);
2765     if(!equiv) equiv = cbmapget(attrs, "HTTP-EQUIV", -1, NULL);
2766     if(!equiv) equiv = cbmapget(attrs, "Http-Equiv", -1, NULL);
2767     if(!equiv) equiv = cbmapget(attrs, "Http-equiv", -1, NULL);
2768     if(equiv && !cbstricmp(equiv, "Content-Type")){
2769     content = cbmapget(attrs, "content", -1, NULL);
2770     if(!content) content = cbmapget(attrs, "Content", -1, NULL);
2771     if(!content) content = cbmapget(attrs, "CONTENT", -1, NULL);
2772     if(content && ((pv = strstr(content, "charset")) != NULL ||
2773     (pv = strstr(content, "Charset")) != NULL ||
2774     (pv = strstr(content, "CHARSET")) != NULL)){
2775     enc = cbmemdup(pv + 8, -1);
2776     if((pv = strchr(enc, ';')) != NULL || (pv = strchr(enc, '\r')) != NULL ||
2777     (pv = strchr(enc, '\n')) != NULL || (pv = strchr(enc, ' ')) != NULL) *pv = '\0';
2778     }
2779     }
2780     cbmapclose(attrs);
2781     if(enc){
2782     cblistclose(elems);
2783     return enc;
2784     }
2785     }
2786     cblistclose(elems);
2787     return NULL;
2788     }
2789    
2790    
2791     /* unescape entity references of HTML */
2792     static char *est_html_raw_text(const char *html){
2793     static const char *pairs[] = {
2794     /* basic symbols */
2795     "&amp;", "&", "&lt;", "<", "&gt;", ">", "&quot;", "\"", "&apos;", "'",
2796     /* ISO-8859-1 */
2797     "&nbsp;", "\xc2\xa0", "&iexcl;", "\xc2\xa1", "&cent;", "\xc2\xa2",
2798     "&pound;", "\xc2\xa3", "&curren;", "\xc2\xa4", "&yen;", "\xc2\xa5",
2799     "&brvbar;", "\xc2\xa6", "&sect;", "\xc2\xa7", "&uml;", "\xc2\xa8",
2800     "&copy;", "\xc2\xa9", "&ordf;", "\xc2\xaa", "&laquo;", "\xc2\xab",
2801     "&not;", "\xc2\xac", "&shy;", "\xc2\xad", "&reg;", "\xc2\xae",
2802     "&macr;", "\xc2\xaf", "&deg;", "\xc2\xb0", "&plusmn;", "\xc2\xb1",
2803     "&sup2;", "\xc2\xb2", "&sup3;", "\xc2\xb3", "&acute;", "\xc2\xb4",
2804     "&micro;", "\xc2\xb5", "&para;", "\xc2\xb6", "&middot;", "\xc2\xb7",
2805     "&cedil;", "\xc2\xb8", "&sup1;", "\xc2\xb9", "&ordm;", "\xc2\xba",
2806     "&raquo;", "\xc2\xbb", "&frac14;", "\xc2\xbc", "&frac12;", "\xc2\xbd",
2807     "&frac34;", "\xc2\xbe", "&iquest;", "\xc2\xbf", "&Agrave;", "\xc3\x80",
2808     "&Aacute;", "\xc3\x81", "&Acirc;", "\xc3\x82", "&Atilde;", "\xc3\x83",
2809     "&Auml;", "\xc3\x84", "&Aring;", "\xc3\x85", "&AElig;", "\xc3\x86",
2810     "&Ccedil;", "\xc3\x87", "&Egrave;", "\xc3\x88", "&Eacute;", "\xc3\x89",
2811     "&Ecirc;", "\xc3\x8a", "&Euml;", "\xc3\x8b", "&Igrave;", "\xc3\x8c",
2812     "&Iacute;", "\xc3\x8d", "&Icirc;", "\xc3\x8e", "&Iuml;", "\xc3\x8f",
2813     "&ETH;", "\xc3\x90", "&Ntilde;", "\xc3\x91", "&Ograve;", "\xc3\x92",
2814     "&Oacute;", "\xc3\x93", "&Ocirc;", "\xc3\x94", "&Otilde;", "\xc3\x95",
2815     "&Ouml;", "\xc3\x96", "&times;", "\xc3\x97", "&Oslash;", "\xc3\x98",
2816     "&Ugrave;", "\xc3\x99", "&Uacute;", "\xc3\x9a", "&Ucirc;", "\xc3\x9b",
2817     "&Uuml;", "\xc3\x9c", "&Yacute;", "\xc3\x9d", "&THORN;", "\xc3\x9e",
2818     "&szlig;", "\xc3\x9f", "&agrave;", "\xc3\xa0", "&aacute;", "\xc3\xa1",
2819     "&acirc;", "\xc3\xa2", "&atilde;", "\xc3\xa3", "&auml;", "\xc3\xa4",
2820     "&aring;", "\xc3\xa5", "&aelig;", "\xc3\xa6", "&ccedil;", "\xc3\xa7",
2821     "&egrave;", "\xc3\xa8", "&eacute;", "\xc3\xa9", "&ecirc;", "\xc3\xaa",
2822     "&euml;", "\xc3\xab", "&igrave;", "\xc3\xac", "&iacute;", "\xc3\xad",
2823     "&icirc;", "\xc3\xae", "&iuml;", "\xc3\xaf", "&eth;", "\xc3\xb0",
2824     "&ntilde;", "\xc3\xb1", "&ograve;", "\xc3\xb2", "&oacute;", "\xc3\xb3",
2825     "&ocirc;", "\xc3\xb4", "&otilde;", "\xc3\xb5", "&ouml;", "\xc3\xb6",
2826     "&divide;", "\xc3\xb7", "&oslash;", "\xc3\xb8", "&ugrave;", "\xc3\xb9",
2827     "&uacute;", "\xc3\xba", "&ucirc;", "\xc3\xbb", "&uuml;", "\xc3\xbc",
2828     "&yacute;", "\xc3\xbd", "&thorn;", "\xc3\xbe", "&yuml;", "\xc3\xbf",
2829     /* ISO-10646 */
2830     "&fnof;", "\xc6\x92", "&Alpha;", "\xce\x91", "&Beta;", "\xce\x92",
2831     "&Gamma;", "\xce\x93", "&Delta;", "\xce\x94", "&Epsilon;", "\xce\x95",
2832     "&Zeta;", "\xce\x96", "&Eta;", "\xce\x97", "&Theta;", "\xce\x98",
2833     "&Iota;", "\xce\x99", "&Kappa;", "\xce\x9a", "&Lambda;", "\xce\x9b",
2834     "&Mu;", "\xce\x9c", "&Nu;", "\xce\x9d", "&Xi;", "\xce\x9e",
2835     "&Omicron;", "\xce\x9f", "&Pi;", "\xce\xa0", "&Rho;", "\xce\xa1",
2836     "&Sigma;", "\xce\xa3", "&Tau;", "\xce\xa4", "&Upsilon;", "\xce\xa5",
2837     "&Phi;", "\xce\xa6", "&Chi;", "\xce\xa7", "&Psi;", "\xce\xa8",
2838     "&Omega;", "\xce\xa9", "&alpha;", "\xce\xb1", "&beta;", "\xce\xb2",
2839     "&gamma;", "\xce\xb3", "&delta;", "\xce\xb4", "&epsilon;", "\xce\xb5",
2840     "&zeta;", "\xce\xb6", "&eta;", "\xce\xb7", "&theta;", "\xce\xb8",
2841     "&iota;", "\xce\xb9", "&kappa;", "\xce\xba", "&lambda;", "\xce\xbb",
2842     "&mu;", "\xce\xbc", "&nu;", "\xce\xbd", "&xi;", "\xce\xbe",
2843     "&omicron;", "\xce\xbf", "&pi;", "\xcf\x80", "&rho;", "\xcf\x81",
2844     "&sigmaf;", "\xcf\x82", "&sigma;", "\xcf\x83", "&tau;", "\xcf\x84",
2845     "&upsilon;", "\xcf\x85", "&phi;", "\xcf\x86", "&chi;", "\xcf\x87",
2846     "&psi;", "\xcf\x88", "&omega;", "\xcf\x89", "&thetasym;", "\xcf\x91",
2847     "&upsih;", "\xcf\x92", "&piv;", "\xcf\x96", "&bull;", "\xe2\x80\xa2",
2848     "&hellip;", "\xe2\x80\xa6", "&prime;", "\xe2\x80\xb2", "&Prime;", "\xe2\x80\xb3",
2849     "&oline;", "\xe2\x80\xbe", "&frasl;", "\xe2\x81\x84", "&weierp;", "\xe2\x84\x98",
2850     "&image;", "\xe2\x84\x91", "&real;", "\xe2\x84\x9c", "&trade;", "\xe2\x84\xa2",
2851     "&alefsym;", "\xe2\x84\xb5", "&larr;", "\xe2\x86\x90", "&uarr;", "\xe2\x86\x91",
2852     "&rarr;", "\xe2\x86\x92", "&darr;", "\xe2\x86\x93", "&harr;", "\xe2\x86\x94",
2853     "&crarr;", "\xe2\x86\xb5", "&lArr;", "\xe2\x87\x90", "&uArr;", "\xe2\x87\x91",
2854     "&rArr;", "\xe2\x87\x92", "&dArr;", "\xe2\x87\x93", "&hArr;", "\xe2\x87\x94",
2855     "&forall;", "\xe2\x88\x80", "&part;", "\xe2\x88\x82", "&exist;", "\xe2\x88\x83",
2856     "&empty;", "\xe2\x88\x85", "&nabla;", "\xe2\x88\x87", "&isin;", "\xe2\x88\x88",
2857     "&notin;", "\xe2\x88\x89", "&ni;", "\xe2\x88\x8b", "&prod;", "\xe2\x88\x8f",
2858     "&sum;", "\xe2\x88\x91", "&minus;", "\xe2\x88\x92", "&lowast;", "\xe2\x88\x97",
2859     "&radic;", "\xe2\x88\x9a", "&prop;", "\xe2\x88\x9d", "&infin;", "\xe2\x88\x9e",
2860     "&ang;", "\xe2\x88\xa0", "&and;", "\xe2\x88\xa7", "&or;", "\xe2\x88\xa8",
2861     "&cap;", "\xe2\x88\xa9", "&cup;", "\xe2\x88\xaa", "&int;", "\xe2\x88\xab",
2862     "&there4;", "\xe2\x88\xb4", "&sim;", "\xe2\x88\xbc", "&cong;", "\xe2\x89\x85",
2863     "&asymp;", "\xe2\x89\x88", "&ne;", "\xe2\x89\xa0", "&equiv;", "\xe2\x89\xa1",
2864     "&le;", "\xe2\x89\xa4", "&ge;", "\xe2\x89\xa5", "&sub;", "\xe2\x8a\x82",
2865     "&sup;", "\xe2\x8a\x83", "&nsub;", "\xe2\x8a\x84", "&sube;", "\xe2\x8a\x86",
2866     "&supe;", "\xe2\x8a\x87", "&oplus;", "\xe2\x8a\x95", "&otimes;", "\xe2\x8a\x97",
2867     "&perp;", "\xe2\x8a\xa5", "&sdot;", "\xe2\x8b\x85", "&lceil;", "\xe2\x8c\x88",
2868     "&rceil;", "\xe2\x8c\x89", "&lfloor;", "\xe2\x8c\x8a", "&rfloor;", "\xe2\x8c\x8b",
2869     "&lang;", "\xe2\x8c\xa9", "&rang;", "\xe2\x8c\xaa", "&loz;", "\xe2\x97\x8a",
2870     "&spades;", "\xe2\x99\xa0", "&clubs;", "\xe2\x99\xa3", "&hearts;", "\xe2\x99\xa5",
2871     "&diams;", "\xe2\x99\xa6", "&OElig;", "\xc5\x92", "&oelig;", "\xc5\x93",
2872     "&Scaron;", "\xc5\xa0", "&scaron;", "\xc5\xa1", "&Yuml;", "\xc5\xb8",
2873     "&circ;", "\xcb\x86", "&tilde;", "\xcb\x9c", "&ensp;", "\xe2\x80\x82",
2874     "&emsp;", "\xe2\x80\x83", "&thinsp;", "\xe2\x80\x89", "&zwnj;", "\xe2\x80\x8c",
2875     "&zwj;", "\xe2\x80\x8d", "&lrm;", "\xe2\x80\x8e", "&rlm;", "\xe2\x80\x8f",
2876     "&ndash;", "\xe2\x80\x93", "&mdash;", "\xe2\x80\x94", "&lsquo;", "\xe2\x80\x98",
2877     "&rsquo;", "\xe2\x80\x99", "&sbquo;", "\xe2\x80\x9a", "&ldquo;", "\xe2\x80\x9c",
2878     "&rdquo;", "\xe2\x80\x9d", "&bdquo;", "\xe2\x80\x9e", "&dagger;", "\xe2\x80\xa0",
2879     "&Dagger;", "\xe2\x80\xa1", "&permil;", "\xe2\x80\xb0", "&lsaquo;", "\xe2\x80\xb9",
2880     "&rsaquo;", "\xe2\x80\xba", "&euro;", "\xe2\x82\xac",
2881     NULL
2882     };
2883     char *raw, *wp, buf[2], *tmp;
2884     int i, j, hit, num, tsiz;
2885     assert(html);
2886     CB_MALLOC(raw, strlen(html) * 3 + 1);
2887     wp = raw;
2888     while(*html != '\0'){
2889     if(*html == '&'){
2890     if(*(html + 1) == '#'){
2891     if(*(html + 2) == 'x' || *(html + 2) == 'X'){
2892     num = strtol(html + 3, NULL, 16);
2893     } else {
2894     num = atoi(html + 2);
2895     }
2896     buf[0] = num / 256;
2897     buf[1] = num % 256;
2898     if((tmp = est_uconv_out(buf, 2, &tsiz)) != NULL){
2899     for(j = 0; j < tsiz; j++){
2900     *wp = ((unsigned char *)tmp)[j];
2901     wp++;
2902     }
2903     free(tmp);
2904     }
2905     while(*html != ';' && *html != ' ' && *html != '\n' && *html != '\0'){
2906     html++;
2907     }
2908     if(*html == ';') html++;
2909     } else {
2910     hit = FALSE;
2911     for(i = 0; pairs[i] != NULL; i += 2){
2912     if(cbstrfwmatch(html, pairs[i])){
2913     wp += sprintf(wp, "%s", pairs[i+1]);
2914     html += strlen(pairs[i]);
2915     hit = TRUE;
2916     break;
2917     }
2918     }
2919     if(!hit){
2920     *wp = *html;
2921     wp++;
2922     html++;
2923     }
2924     }
2925     } else {
2926     *wp = *html;
2927     wp++;
2928     html++;
2929     }
2930     }
2931     *wp = '\0';
2932     return raw;
2933     }
2934    
2935    
2936     /* create a document object from MIME */
2937     static ESTDOC *est_doc_new_from_mime(const char *buf, int size, const char *penc, int plang){
2938     ESTDOC *doc, *tdoc;
2939     CBMAP *attrs;
2940     const CBLIST *texts;
2941     CBLIST *parts, *lines;
2942     CBDATUM *datum;
2943     const char *key, *val, *bound, *part, *text, *line;
2944     char *body, *swap, numbuf[NUMBUFSIZ];
2945     int i, j, bsiz, psiz, ssiz, mht;
2946     assert(buf);
2947     doc = est_doc_new();
2948     attrs = cbmapopenex(MINIBNUM);
2949     body = cbmimebreak(buf, size, attrs, &bsiz);
2950     if((val = cbmapget(attrs, "subject", -1, NULL)) != NULL){
2951     est_doc_add_attr_mime(doc, ESTDATTRTITLE, val);
2952     if((val = est_doc_attr(doc, ESTDATTRTITLE)) != NULL) est_doc_add_hidden_text(doc, val);
2953     }
2954     if((val = cbmapget(attrs, "from", -1, NULL)) != NULL)
2955     est_doc_add_attr_mime(doc, ESTDATTRAUTHOR, val);
2956     if((val = cbmapget(attrs, "date", -1, NULL)) != NULL){
2957     est_doc_add_attr_mime(doc, ESTDATTRCDATE, val);
2958     est_doc_add_attr_mime(doc, ESTDATTRMDATE, val);
2959     }
2960     est_doc_add_attr(doc, ESTDATTRTYPE, "message/rfc822");
2961     sprintf(numbuf, "%d", size);
2962     est_doc_add_attr(doc, ESTDATTRSIZE, numbuf);
2963     cbmapiterinit(attrs);
2964     while((key = cbmapiternext(attrs, NULL)) != NULL){
2965     if((key[0] >= 'A' && key[0] <= 'Z') || key[0] == '@') continue;
2966     val = cbmapget(attrs, key, -1, NULL);
2967     est_doc_add_attr_mime(doc, key, val);
2968     }
2969     if((key = cbmapget(attrs, "TYPE", -1, NULL)) != NULL && cbstrfwimatch(key, "multipart/")){
2970     mht = cbstrfwimatch(key, "multipart/related");
2971     if((bound = cbmapget(attrs, "BOUNDARY", -1, NULL)) != NULL){
2972     parts = cbmimeparts(body, bsiz, bound);
2973     for(i = 0; i < CB_LISTNUM(parts) && i < 8; i++){
2974     part = CB_LISTVAL2(parts, i, &psiz);
2975     tdoc = est_doc_new_from_mime(part, psiz, penc, plang);
2976     if(mht){
2977     if((text = est_doc_attr(tdoc, ESTDATTRTITLE)) != NULL)
2978     est_doc_add_attr(doc, ESTDATTRTITLE, text);
2979     if((text = est_doc_attr(tdoc, ESTDATTRAUTHOR)) != NULL)
2980     est_doc_add_attr(doc, ESTDATTRAUTHOR, text);
2981     }
2982     texts = est_doc_texts(tdoc);
2983     for(j = 0; j < CB_LISTNUM(texts); j++){
2984     text = CB_LISTVAL(texts, j, NULL);
2985     est_doc_add_text(doc, text);
2986     }
2987     est_doc_delete(tdoc);
2988     }
2989     cblistclose(parts);
2990     }
2991     } else {
2992     if((key = cbmapget(attrs, "content-transfer-encoding", -1, NULL)) != NULL &&
2993     cbstrfwimatch(key, "base64")){
2994     swap = cbbasedecode(body, &ssiz);
2995     free(body);
2996     body = swap;
2997     bsiz = ssiz;
2998     } else if((key = cbmapget(attrs, "content-transfer-encoding", -1, NULL)) != NULL &&
2999     cbstrfwimatch(key, "quoted-printable")){
3000     swap = cbquotedecode(body, &ssiz);
3001     free(body);
3002     body = swap;
3003     bsiz = ssiz;
3004     }
3005     if(!(key = cbmapget(attrs, "TYPE", -1, NULL)) || cbstrfwimatch(key, "text/plain")){
3006     if(penc && (swap = est_iconv(body, bsiz, penc, "UTF-8", &ssiz, NULL)) != NULL){
3007     free(body);
3008     body = swap;
3009     bsiz = ssiz;
3010     } else if((key = cbmapget(attrs, "CHARSET", -1, NULL)) != NULL &&
3011     (swap = est_iconv(body, bsiz, key, "UTF-8", &ssiz, NULL)) != NULL){
3012     free(body);
3013     body = swap;
3014     bsiz = ssiz;
3015     }
3016     lines = cbsplit(body, bsiz, "\n");
3017     datum = cbdatumopen("", 0);
3018     for(i = 0; i < CB_LISTNUM(lines); i++){
3019     line = CB_LISTVAL(lines, i, NULL);
3020     while(*line == ' ' || *line == '>' || *line == '|' || *line == '\t' || *line == '\r'){
3021     line++;
3022     }
3023     if(line[0] == '\0'){
3024     est_doc_add_text(doc, CB_DATUMPTR(datum));
3025     cbdatumsetsize(datum, 0);
3026     } else {
3027     cbdatumcat(datum, " ", 1);
3028     cbdatumcat(datum, line, -1);
3029     }
3030     }
3031     est_doc_add_text(doc, CB_DATUMPTR(datum));
3032     cbdatumclose(datum);
3033     cblistclose(lines);
3034     } else if(cbstrfwimatch(key, "text/html")){
3035     tdoc = est_doc_new_from_html(body, bsiz, penc, plang);
3036     if((text = est_doc_attr(tdoc, ESTDATTRTITLE)) != NULL){
3037     if(!est_doc_attr(doc, ESTDATTRTITLE)) est_doc_add_attr(doc, ESTDATTRTITLE, text);
3038     est_doc_add_text(doc, text);
3039     }
3040     if((text = est_doc_attr(tdoc, ESTDATTRAUTHOR)) != NULL){
3041     if(!est_doc_attr(doc, ESTDATTRAUTHOR)) est_doc_add_attr(doc, ESTDATTRAUTHOR, text);
3042     est_doc_add_text(doc, text);
3043     }
3044     texts = est_doc_texts(tdoc);
3045     for(i = 0; i < CB_LISTNUM(texts); i++){
3046     text = CB_LISTVAL(texts, i, NULL);
3047     est_doc_add_text(doc, text);
3048     }
3049     est_doc_delete(tdoc);
3050     } else if(cbstrfwimatch(key, "message/rfc822")){
3051     tdoc = est_doc_new_from_mime(body, bsiz, penc, plang);
3052     if((text = est_doc_attr(tdoc, ESTDATTRTITLE)) != NULL){
3053     if(!est_doc_attr(doc, ESTDATTRTITLE)) est_doc_add_attr(doc, ESTDATTRTITLE, text);
3054     est_doc_add_text(doc, text);
3055     }
3056     if((text = est_doc_attr(tdoc, ESTDATTRAUTHOR)) != NULL){
3057     if(!est_doc_attr(doc, ESTDATTRAUTHOR)) est_doc_add_attr(doc, ESTDATTRAUTHOR, text);
3058     est_doc_add_text(doc, text);
3059     }
3060     texts = est_doc_texts(tdoc);
3061     for(i = 0; i < CB_LISTNUM(texts); i++){
3062     text = CB_LISTVAL(texts, i, NULL);
3063     est_doc_add_text(doc, text);
3064     }
3065     est_doc_delete(tdoc);
3066     } else if(cbstrfwimatch(key, "text/")){
3067     tdoc = est_doc_new_from_text(body, bsiz, penc, plang);
3068     texts = est_doc_texts(tdoc);
3069     for(i = 0; i < CB_LISTNUM(texts); i++){
3070     text = CB_LISTVAL(texts, i, NULL);
3071     est_doc_add_text(doc, text);
3072     }
3073     est_doc_delete(tdoc);
3074     }
3075     }
3076     free(body);
3077     cbmapclose(attrs);
3078     return doc;
3079     }
3080    
3081    
3082     /* set mime value as an attribute of a document */
3083     static void est_doc_add_attr_mime(ESTDOC *doc, const char *name, const char *value){
3084     char enc[64], *ebuf, *rbuf;
3085     assert(doc && name && value);
3086     ebuf = cbmimedecode(value, enc);
3087     if((rbuf = est_iconv(ebuf, -1, enc, "UTF-8", NULL, NULL)) != NULL){
3088     est_doc_add_attr(doc, name, rbuf);
3089     free(rbuf);
3090     }
3091     free(ebuf);
3092     }
3093    
3094    
3095     /* generate a document with random text */
3096     static ESTDOC *est_doc_new_from_chaos(int cnum, int snum, int mode){
3097     ESTDOC *doc;
3098     char *str;
3099     int i;
3100     doc = est_doc_new();
3101     snum *= pow(est_random_nd() + 0.5, 3.0);
3102     if(mode == RD_RAND){
3103     mode = est_random() * 100;
3104     if(mode < 20){
3105     mode = RD_ENG;
3106     est_doc_add_attr(doc, "mode", "english");
3107     } else if(mode < 40){
3108     mode = RD_LAT;
3109     est_doc_add_attr(doc, "mode", "latin");
3110     } else if(mode < 60){
3111     mode = RD_EURO;
3112     est_doc_add_attr(doc, "mode", "euromix");
3113     } else if(mode < 65){
3114     mode = RD_ORI;
3115     est_doc_add_attr(doc, "mode", "oriental");
3116     } else if(mode < 95){
3117     mode = RD_JPN;
3118     est_doc_add_attr(doc, "mode", "japanese");
3119     } else {
3120     mode = RD_CHAO;
3121     est_doc_add_attr(doc, "mode", "chaos");
3122     }
3123     }
3124     switch(mode){
3125     case RD_ENG: est_doc_add_attr(doc, "mode", "english"); break;
3126     case RD_LAT: est_doc_add_attr(doc, "mode", "latin"); break;
3127     case RD_ORI: est_doc_add_attr(doc, "mode", "oriental"); break;
3128     case RD_JPN: est_doc_add_attr(doc, "mode", "japanese"); break;
3129     case RD_EURO: est_doc_add_attr(doc, "mode", "euromix"); break;
3130     case RD_CHAO: est_doc_add_attr(doc, "mode", "chaos"); break;
3131     }
3132     for(i = 0; i <= snum; i++){
3133     str = est_random_str(cnum, mode);
3134     if(est_random() < 0.05){
3135     est_doc_add_hidden_text(doc, str);
3136     } else {
3137     est_doc_add_text(doc, str);
3138     }
3139     free(str);
3140     }
3141     return doc;
3142     }
3143    
3144    
3145     /* generate random string */
3146     static char *est_random_str(int cnum, int mode){
3147     const char echrs[] = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
3148     CBDATUM *buf;
3149     char wc[2], *str;
3150     int i, c, wlen, dec, mm, big, n;
3151     buf = cbdatumopen("", 0);
3152     cnum *= pow(est_random_nd() + 0.5, 3.0);
3153     wlen = est_random_nd() * 8 + 4;
3154     dec = (int)(est_random() * INT_MAX) % 10;
3155     big = (((int)(est_random() * INT_MAX) % 0x29)) * 0x100;
3156     for(i = 0; i < cnum; i++){
3157     switch(mode){
3158     case RD_ENG: case RD_LAT: case RD_EURO:
3159     mm = (int)(est_random() * INT_MAX) % 100;
3160     if((mode == RD_LAT || mode == RD_EURO) && mm < 5){
3161     c = 0x00a1 + (int)(pow(est_random_nd(), 2.0) * (0x00ff - 0x00a0));
3162     } else if(mode == RD_EURO && (mm < 30 || dec > 8)){
3163     if(dec % 2 == 0){
3164     c = 0x0391 + (int)(pow(est_random_nd(), 2.0) * (0x03d6 - 0x0391));
3165     } else {
3166     c = 0x0400 + (int)(pow(est_random_nd(), 2.0) * (0x045f - 0x0400));
3167     }
3168     } else if(mm < 95){
3169     if((n = est_random_nd() * (sizeof(echrs) - 1)) == (sizeof(echrs) - 1)) n = 0;
3170     c = echrs[n];
3171     } else {
3172     c = (int)(est_random() * ('@' - ' ')) + ' ';
3173     }
3174     if(--wlen < 1){
3175     c = ' ';
3176     wlen = pow(est_random_nd(), 3.0) * 8 + 4;
3177     dec = (int)(est_random() * INT_MAX) % 10;
3178     }
3179     break;
3180     case RD_ORI:
3181     c = big + est_random_nd() * 0x100;
3182     if(--wlen < 1){
3183     wlen = pow(est_random_nd(), 3.0) * 12 + 6;
3184     big = (((int)(est_random() * INT_MAX) % 0x29)) * 0x100;
3185     }
3186     break;
3187     case RD_JPN:
3188     if(dec < 4){
3189     c = 0x3041 + pow(est_random_nd(), 3.0) * (0x3094 - 0x3041);
3190     } else if(dec < 7){
3191     c = 0x30a1 + pow(est_random_nd(), 3.0) * (0x30fe - 0x30a1);
3192     } else if(dec < 9){
3193     c = 0x4e00 + pow(est_random_nd(), 3.0) * (0x9faf - 0x4e00);
3194     } else {
3195     if(est_random() < 0.7){
3196     c = 0x00a1 + (int)(pow(est_random_nd(), 2.0) * (0x00ff - 0x00a0));
3197     } else {
3198     c = 0x3041 + est_random() * (0xffef - 0x3041);
3199     }
3200     }
3201     if(--wlen < 1){
3202     wlen = pow(est_random_nd(), 3.0) * 12 + 6;
3203     dec = (int)(est_random() * INT_MAX) % 10;
3204     }
3205     break;
3206     default:
3207     if(est_random() < 0.2){
3208     c = 0x00a1 + (int)est_random() * (0x00ff - 0x00a0);
3209     } else {
3210     c = (int)(est_random() * 0x10000);
3211     }
3212     break;
3213     }
3214     if(c <= 0 || c >= 0x10000) c = 0x0020;
3215     wc[0] = c / 0x100;
3216     wc[1] = c % 0x100;
3217     cbdatumcat(buf, wc, 2);
3218     }
3219     str = est_iconv(CB_DATUMPTR(buf), CB_DATUMSIZE(buf), "UTF-16BE", "UTF-8", NULL, NULL);
3220     cbdatumclose(buf);
3221     return str;
3222     }
3223    
3224    
3225    
3226     /* END OF FILE */

  ViewVC Help
Powered by ViewVC 1.1.26