1 |
/************************************************************************************************* |
2 |
* The command line interface for the core API |
3 |
* Copyright (C) 2004-2005 Mikio Hirabayashi |
4 |
* This file is part of Hyper Estraier. |
5 |
* Hyper Estraier is free software; you can redistribute it and/or modify it under the terms of |
6 |
* the GNU Lesser General Public License as published by the Free Software Foundation; either |
7 |
* version 2.1 of the License or any later version. Hyper Estraier is distributed in the hope |
8 |
* that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of |
9 |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public |
10 |
* License for more details. |
11 |
* You should have received a copy of the GNU Lesser General Public License along with Hyper |
12 |
* Estraier; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, |
13 |
* Boston, MA 02111-1307 USA. |
14 |
*************************************************************************************************/ |
15 |
|
16 |
|
17 |
#include "estraier.h" |
18 |
#include "myconf.h" |
19 |
|
20 |
/* buffer sizes */
#define NUMBUFSIZ      32                /* buffer size for a formatted number */
#define URIBUFSIZ      8192              /* buffer size for a URI */

/* small hash maps */
#define MINIBNUM       31                /* bucket count of a small map */

/* search output */
#define SEARCHMAX      10                /* upper limit of documents shown */
#define SNIPWWIDTH     480               /* total width of a snippet */
#define SNIPHWIDTH     96                /* width taken from the head of the text */
#define SNIPAWIDTH     96                /* width kept around each highlighted word */

/* cache */
#define CACHEMAX       (512*1024*1024)   /* maximum cache size (512 MiB, in bytes) */

/* attribute names */
#define DATTRLPATH     "_lpath"          /* attribute holding the local path */
#define DATTRLFILE     "_lfile"          /* attribute holding the local file name */
#define DATTRSCORE     "#score"          /* pseudo-attribute holding the score */
#define DATTRKWORDS    "#kwords"         /* pseudo-attribute holding the keywords */

/* keyword database */
#define KWDBNAME       "kwords"          /* name of the keyword database */
#define KWDBBNUM       122869            /* bucket count of the keyword database */
#define KWDBDNUM       3                 /* division count of the keyword database */
#define KWORDNUM       32                /* number of keywords shown */

/* random test documents */
#define RDOCSNUM       6                 /* number of sections in a random document */
#define RDOCCNUM       256               /* number of characters in a section */
38 |
|
39 |
/* viewing modes: how much of each document is printed */
enum {
  VM_ID = 0,    /* the ID only */
  VM_URI,       /* the ID and the URI */
  VM_ATTR,      /* every attribute */
  VM_FULL,      /* every attribute plus the body text */
  VM_SNIP,      /* every attribute plus a snippet */
  VM_HMRD,      /* human readable form */
  VM_XML,       /* XML form */
  VM_DUMP       /* dump as draft files */
};
49 |
|
50 |
/* file formats of input documents */
enum {
  FF_AUTO = 0,  /* detect the format automatically */
  FF_DRAFT,     /* document draft */
  FF_TEXT,      /* plain text */
  FF_HTML,      /* HTML */
  FF_MIME,      /* MIME */
  FF_NONE       /* ignore the file */
};
58 |
|
59 |
/* flavors of generated test documents */
enum {
  RD_ENG = 0,   /* English */
  RD_LAT,       /* Latin */
  RD_EURO,      /* mixture of European languages */
  RD_ORI,       /* Oriental */
  RD_JPN,       /* Japanese */
  RD_CHAO,      /* chaos */
  RD_RAND       /* pick one of the above at random */
};
68 |
|
69 |
|
70 |
/* global variables */ |
71 |
const char *g_progname; /* program name */ |
72 |
int g_sigterm = FALSE; /* flag for termination signal */ |
73 |
int g_putopts = 0; /* options of registration */ |
74 |
int g_outopts = 0; /* options of deletion */ |
75 |
int g_optopts = 0; /* options of optimization */ |
76 |
const char *g_inputcode = "UTF-8"; /* input encoding */ |
77 |
int g_inputlang = ESTLANGEN; /* prefered language */ |
78 |
const char *g_pathcode = NULL; /* path encoding */ |
79 |
int g_pathfull = FALSE; /* whether to record full paths */ |
80 |
int g_oextmodes = 0; /* extra open modes */ |
81 |
int g_viewmode = VM_ID; /* viewing mode */ |
82 |
int g_filefmt = FF_AUTO; /* file format */ |
83 |
CBMAP *g_xcmdmap = NULL; /* map of suffixes and filter commands */ |
84 |
int g_filtorig = FALSE; /* whether to use filter for original files */ |
85 |
int g_stdate = FALSE; /* whether to adopt date by stat */ |
86 |
int g_chkmdate = FALSE; /* whether to check modification date */ |
87 |
double g_cachesize = -1; /* size of the cache */ |
88 |
int g_doforce = FALSE; /* whether to force purging or extracting */ |
89 |
int g_kwordnum = KWORDNUM; /* number of keywords */ |
90 |
int g_condopts = 0; /* options of the search condtion */ |
91 |
int g_rdmode = RD_RAND; /* mode of random documents */ |
92 |
|
93 |
|
94 |
/* function prototypes */ |
95 |
int main(int argc, char **argv); |
96 |
static void printferror(const char *format, ...); |
97 |
static void printfinfo(const char *format, ...); |
98 |
static void dbinform(const char *msg); |
99 |
static void setsignals(void); |
100 |
static void sigtermhandler(int num); |
101 |
static void usage(void); |
102 |
static int runput(int argc, char **argv); |
103 |
static int runout(int argc, char **argv); |
104 |
static int runget(int argc, char **argv); |
105 |
static int runlist(int argc, char **argv); |
106 |
static int runuriid(int argc, char **argv); |
107 |
static int runmeta(int argc, char **argv); |
108 |
static int runinform(int argc, char **argv); |
109 |
static int runoptimize(int argc, char **argv); |
110 |
static int runsearch(int argc, char **argv); |
111 |
static int rungather(int argc, char **argv); |
112 |
static int runpurge(int argc, char **argv); |
113 |
static int runextkeys(int argc, char **argv); |
114 |
static int rundraft(int argc, char **argv); |
115 |
static int runbreak(int argc, char **argv); |
116 |
static int runrandput(int argc, char **argv); |
117 |
static int runwicked(int argc, char **argv); |
118 |
static int runregression(int argc, char **argv); |
119 |
static int procput(const char *dbname, const char *filename); |
120 |
static int procout(const char *dbname, int id, const char *expr); |
121 |
static int procget(const char *dbname, int id, const char *expr, const char *attr); |
122 |
static int proclist(const char *dbname); |
123 |
static int procuriid(const char *dbname, const char *uri); |
124 |
static int procmeta(const char *dbname, const char *mname, const char *mvalue); |
125 |
static int procinform(const char *dbname); |
126 |
static int procoptimize(const char *dbname); |
127 |
static int procsearch(const char *dbname, const char *phrase, |
128 |
const CBLIST *attrs, const char *ord, int max, int sim); |
129 |
static int procgather(const char *dbname, const char *filename); |
130 |
static int procpurge(const char *dbname, const char *prefix); |
131 |
static int procextkeys(const char *dbname, const char *prefix, int ni); |
132 |
static int procdraft(const char *filename); |
133 |
static int procbreak(const char *filename, int wt); |
134 |
static int procrandput(const char *dbname, int dnum); |
135 |
static int procwicked(const char *dbname, int dnum); |
136 |
static int procregression(const char *dbname); |
137 |
static void xmlprintf(const char *format, ...); |
138 |
static int strtolang(const char *str); |
139 |
static char *fgetl(FILE *ifp); |
140 |
static int doputdoc(ESTDB *db, const char *path); |
141 |
static const char *pathtourl(const char *path); |
142 |
static const char *urltofile(const char *uri); |
143 |
static char *urltopath(const char *uri); |
144 |
static CBMAP *vectorizer(void *db, int id, void *kwdb); |
145 |
static ESTDOC *est_doc_new_with_xcmd(const char *buf, int size, const char *path, |
146 |
const char *xcmd, const char *tmpdir, |
147 |
const char *penc, int plang); |
148 |
static ESTDOC *est_doc_new_from_draft_enc(const char *buf, int size, const char *enc); |
149 |
static ESTDOC *est_doc_new_from_text(const char *buf, int size, const char *penc, int plang); |
150 |
static ESTDOC *est_doc_new_from_html(const char *buf, int size, const char *penc, int plang); |
151 |
static char *est_html_enc(const char *str); |
152 |
static char *est_html_raw_text(const char *html); |
153 |
static ESTDOC *est_doc_new_from_mime(const char *buf, int size, const char *penc, int plang); |
154 |
static void est_doc_add_attr_mime(ESTDOC *doc, const char *name, const char *value); |
155 |
static ESTDOC *est_doc_new_from_chaos(int cnum, int snum, int mode); |
156 |
static char *est_random_str(int cnum, int mode); |
157 |
|
158 |
|
159 |
/* main routine */ |
160 |
int main(int argc, char **argv){ |
161 |
const char *tmp; |
162 |
int rv; |
163 |
if((tmp = getenv("ESTDBGFD")) != NULL) dpdbgfd = atoi(tmp); |
164 |
cbstdiobin(); |
165 |
g_progname = argv[0]; |
166 |
g_sigterm = FALSE; |
167 |
if(argc < 2) usage(); |
168 |
rv = 0; |
169 |
if(!strcmp(argv[1], "put")){ |
170 |
setsignals(); |
171 |
rv = runput(argc, argv); |
172 |
} else if(!strcmp(argv[1], "out")){ |
173 |
setsignals(); |
174 |
rv = runout(argc, argv); |
175 |
} else if(!strcmp(argv[1], "get")){ |
176 |
rv = runget(argc, argv); |
177 |
} else if(!strcmp(argv[1], "list")){ |
178 |
rv = runlist(argc, argv); |
179 |
} else if(!strcmp(argv[1], "uriid")){ |
180 |
rv = runuriid(argc, argv); |
181 |
} else if(!strcmp(argv[1], "meta")){ |
182 |
setsignals(); |
183 |
rv = runmeta(argc, argv); |
184 |
} else if(!strcmp(argv[1], "inform")){ |
185 |
rv = runinform(argc, argv); |
186 |
} else if(!strcmp(argv[1], "optimize")){ |
187 |
setsignals(); |
188 |
rv = runoptimize(argc, argv); |
189 |
} else if(!strcmp(argv[1], "search")){ |
190 |
rv = runsearch(argc, argv); |
191 |
} else if(!strcmp(argv[1], "gather")){ |
192 |
setsignals(); |
193 |
rv = rungather(argc, argv); |
194 |
} else if(!strcmp(argv[1], "purge")){ |
195 |
setsignals(); |
196 |
rv = runpurge(argc, argv); |
197 |
} else if(!strcmp(argv[1], "extkeys")){ |
198 |
setsignals(); |
199 |
rv = runextkeys(argc, argv); |
200 |
} else if(!strcmp(argv[1], "draft")){ |
201 |
rv = rundraft(argc, argv); |
202 |
} else if(!strcmp(argv[1], "break")){ |
203 |
rv = runbreak(argc, argv); |
204 |
} else if(!strcmp(argv[1], "randput")){ |
205 |
setsignals(); |
206 |
rv = runrandput(argc, argv); |
207 |
} else if(!strcmp(argv[1], "wicked")){ |
208 |
setsignals(); |
209 |
rv = runwicked(argc, argv); |
210 |
} else if(!strcmp(argv[1], "regression")){ |
211 |
setsignals(); |
212 |
rv = runregression(argc, argv); |
213 |
} else if(!strcmp(argv[1], "version") || !strcmp(argv[1], "--version")){ |
214 |
printf("Hyper Estraier %s on %s\n", est_version, ESTSYSNAME); |
215 |
printf("Copyright (C) 2004-2005 Mikio Hirabayashi.\n"); |
216 |
rv = 0; |
217 |
} else { |
218 |
usage(); |
219 |
} |
220 |
return rv; |
221 |
} |
222 |
|
223 |
|
224 |
/* print formatted error string and flush the buffer */ |
225 |
static void printferror(const char *format, ...){ |
226 |
va_list ap; |
227 |
va_start(ap, format); |
228 |
fprintf(stderr, "%s: ERROR: ", g_progname); |
229 |
vfprintf(stderr, format, ap); |
230 |
fputc('\n', stderr); |
231 |
fflush(stderr); |
232 |
va_end(ap); |
233 |
} |
234 |
|
235 |
|
236 |
/* print formatted information string and flush the buffer */ |
237 |
static void printfinfo(const char *format, ...){ |
238 |
va_list ap; |
239 |
va_start(ap, format); |
240 |
printf("%s: INFO: ", g_progname); |
241 |
vprintf(format, ap); |
242 |
putchar('\n'); |
243 |
fflush(stdout); |
244 |
va_end(ap); |
245 |
} |
246 |
|
247 |
|
248 |
/* Database event callback: forward the message `msg' verbatim to printfinfo.
   The "%s" format keeps any percent signs in `msg' from being interpreted. */
static void dbinform(const char *msg){
  printfinfo("%s", msg);
}
252 |
|
253 |
|
254 |
/* set signal handlers */ |
255 |
static void setsignals(void){ |
256 |
signal(1, sigtermhandler); |
257 |
signal(2, sigtermhandler); |
258 |
signal(3, sigtermhandler); |
259 |
signal(13, sigtermhandler); |
260 |
signal(15, sigtermhandler); |
261 |
} |
262 |
|
263 |
|
264 |
/* handler of termination signal */ |
265 |
static void sigtermhandler(int num){ |
266 |
static int tries = 0; |
267 |
if(tries++ <= 4){ |
268 |
signal(num, sigtermhandler); |
269 |
} else { |
270 |
signal(num, SIG_DFL); |
271 |
} |
272 |
g_sigterm = TRUE; |
273 |
printfinfo("the termination signal %d catched", num); |
274 |
} |
275 |
|
276 |
|
277 |
/* print the usage and exit */ |
278 |
static void usage(void){ |
279 |
fprintf(stderr, "%s: command line utility for the core API of Hyper Estraier\n", g_progname); |
280 |
fprintf(stderr, "\n"); |
281 |
fprintf(stderr, "usage:\n"); |
282 |
fprintf(stderr, " %s put [-cl] db [file]\n", g_progname); |
283 |
fprintf(stderr, " %s out [-cl] db expr\n", g_progname); |
284 |
fprintf(stderr, " %s get db expr\n", g_progname); |
285 |
fprintf(stderr, " %s list db\n", g_progname); |
286 |
fprintf(stderr, " %s uriid db uri\n", g_progname); |
287 |
fprintf(stderr, " %s meta db [name [value]]\n", g_progname); |
288 |
fprintf(stderr, " %s inform db\n", g_progname); |
289 |
fprintf(stderr, " %s optimize [-onp] [-ond] db\n", g_progname); |
290 |
fprintf(stderr, " %s search [-ic enc] [-vu|-va|-vf|-vs|-vh|-vx|-dd] [-gs|-gf|-ga]" |
291 |
" [-ni] [-sf] [-hs] [-attr expr] [-ord expr] [-max num] [-sim id] db [phrase]\n", |
292 |
g_progname); |
293 |
fprintf(stderr, " %s gather [-cl] [-fe|-ft|-fh|-fm] [-fx sufs cmd] [-fz] [-fo]" |
294 |
" [-ic enc] [-il lang] [-pc enc] [-pf] [-apn] [-sd] [-cm] [-cs num] db [file|dir]\n", |
295 |
g_progname); |
296 |
fprintf(stderr, " %s purge [-cl] [-fc] db [prefix]\n", g_progname); |
297 |
fprintf(stderr, " %s extkeys [-fc] [-ni] [-kn num] db [prefix]\n", g_progname); |
298 |
fprintf(stderr, " %s draft [-ft|-fh|-fm] [-ic enc] [-il lang] [file]\n", g_progname); |
299 |
fprintf(stderr, " %s break [-ic enc] [-il lang] [-apn] [-wt] [file]\n", g_progname); |
300 |
fprintf(stderr, " %s randput [-ren|-rla|-reu|-ror|-rjp|-rch] [-cs num] db dnum\n", |
301 |
g_progname); |
302 |
fprintf(stderr, " %s wicked db dnum\n", g_progname); |
303 |
fprintf(stderr, " %s regression db\n", g_progname); |
304 |
fprintf(stderr, " %s version\n", g_progname); |
305 |
fprintf(stderr, "\n"); |
306 |
exit(1); |
307 |
} |
308 |
|
309 |
|
310 |
/* parse arguments of the put command */ |
311 |
static int runput(int argc, char **argv){ |
312 |
char *dbname, *filename; |
313 |
int i, rv; |
314 |
dbname = NULL; |
315 |
filename = NULL; |
316 |
for(i = 2; i < argc; i++){ |
317 |
if(!dbname && argv[i][0] == '-'){ |
318 |
if(!strcmp(argv[i], "-cl")){ |
319 |
g_putopts |= ESTPDCLEAN; |
320 |
} else { |
321 |
usage(); |
322 |
} |
323 |
} else if(!dbname){ |
324 |
dbname = argv[i]; |
325 |
} else if(!filename){ |
326 |
filename = argv[i]; |
327 |
} else { |
328 |
usage(); |
329 |
} |
330 |
} |
331 |
if(!dbname) usage(); |
332 |
rv = procput(dbname, filename); |
333 |
return rv; |
334 |
} |
335 |
|
336 |
|
337 |
/* parse arguments of the out command */ |
338 |
static int runout(int argc, char **argv){ |
339 |
char *dbname, *expr; |
340 |
int i, id, rv; |
341 |
dbname = NULL; |
342 |
expr = NULL; |
343 |
for(i = 2; i < argc; i++){ |
344 |
if(!dbname && argv[i][0] == '-'){ |
345 |
if(!strcmp(argv[i], "-cl")){ |
346 |
g_outopts |= ESTODCLEAN; |
347 |
} else { |
348 |
usage(); |
349 |
} |
350 |
} else if(!dbname){ |
351 |
dbname = argv[i]; |
352 |
} else if(!expr){ |
353 |
expr = argv[i]; |
354 |
} else { |
355 |
usage(); |
356 |
} |
357 |
} |
358 |
if(!dbname || !expr) usage(); |
359 |
if((id = atoi(expr)) > 0) expr = NULL; |
360 |
rv = procout(dbname, id, expr); |
361 |
return rv; |
362 |
} |
363 |
|
364 |
|
365 |
/* Parse the arguments of the get command: db expr [attr].
   No options are accepted.  `expr' is taken as a document ID only when the
   WHOLE string is a positive decimal number that fits in an int; anything else
   is passed on as a URI.  (A plain atoi() test would treat a URI such as
   "1st.html" as ID 1 and fetch the wrong document.)
   Any malformed command line ends in usage(), which exits.
   Returns the result of procget. */
static int runget(int argc, char **argv){
  char *dbname, *expr, *attr, *end;
  long num;
  int i, id;
  dbname = NULL;
  expr = NULL;
  attr = NULL;
  for(i = 2; i < argc; i++){
    if(!dbname && argv[i][0] == '-'){
      usage();
    } else if(!dbname){
      dbname = argv[i];
    } else if(!expr){
      expr = argv[i];
    } else if(!attr){
      attr = argv[i];
    } else {
      usage();
    }
  }
  if(!dbname || !expr) usage();
  id = 0;
  num = strtol(expr, &end, 10);
  /* accept only a fully consumed, positive value that survives the int cast */
  if(end != expr && *end == '\0' && num > 0 && (long)(int)num == num){
    id = (int)num;
    expr = NULL;
  }
  return procget(dbname, id, expr, attr);
}
390 |
|
391 |
|
392 |
/* Parse the arguments of the list command: db.
   Exactly one database name is expected and no options are accepted; anything
   else ends in usage(), which exits.
   Returns the result of proclist. */
static int runlist(int argc, char **argv){
  char *dbname = NULL;
  int i;
  for(i = 2; i < argc; i++){
    /* a leading option or a second operand is an error */
    if(dbname != NULL || argv[i][0] == '-') usage();
    dbname = argv[i];
  }
  if(dbname == NULL) usage();
  return proclist(dbname);
}
410 |
|
411 |
|
412 |
/* Parse the arguments of the uriid command: db uri.
   Both operands are required and no options are accepted; anything else ends
   in usage(), which exits.
   Returns the result of procuriid. */
static int runuriid(int argc, char **argv){
  char *dbname = NULL;
  char *uri = NULL;
  char *arg;
  int i;
  for(i = 2; i < argc; i++){
    arg = argv[i];
    if(dbname == NULL){
      if(arg[0] == '-') usage();
      dbname = arg;
    } else if(uri == NULL){
      uri = arg;
    } else {
      usage();
    }
  }
  if(dbname == NULL || uri == NULL) usage();
  return procuriid(dbname, uri);
}
433 |
|
434 |
|
435 |
/* Parse the arguments of the meta command: db [name [value]].
   No options are accepted.  `name' and `value' are optional and are passed to
   procmeta as NULL when absent.  Any malformed command line ends in usage(),
   which exits.
   Returns the result of procmeta. */
static int runmeta(int argc, char **argv){
  char *dbname, *mname, *mvalue;
  int i, rv;
  dbname = NULL;
  mname = NULL;
  mvalue = NULL;
  for(i = 2; i < argc; i++){
    if(!dbname && argv[i][0] == '-'){
      usage();
    } else if(!dbname){
      dbname = argv[i];
    } else if(!mname){
      mname = argv[i];
    } else if(!mvalue){
      mvalue = argv[i];
    } else {
      usage();
    }
  }
  if(!dbname) usage();
  rv = procmeta(dbname, mname, mvalue);
  return rv;
}
460 |
|
461 |
|
462 |
/* Parse the arguments of the inform command: db.
   Exactly one database name is expected and no options are accepted; anything
   else ends in usage(), which exits.
   Returns the result of procinform. */
static int runinform(int argc, char **argv){
  char *dbname = NULL;
  int i;
  for(i = 2; i < argc; i++){
    /* a leading option or a second operand is an error */
    if(dbname != NULL || argv[i][0] == '-') usage();
    dbname = argv[i];
  }
  if(dbname == NULL) usage();
  return procinform(dbname);
}
480 |
|
481 |
|
482 |
/* parse arguments of the optimize command */ |
483 |
static int runoptimize(int argc, char **argv){ |
484 |
char *dbname; |
485 |
int i, rv; |
486 |
dbname = NULL; |
487 |
for(i = 2; i < argc; i++){ |
488 |
if(!dbname && argv[i][0] == '-'){ |
489 |
if(!strcmp(argv[i], "-onp")){ |
490 |
g_optopts |= ESTOPTNOPURGE; |
491 |
} else if(!strcmp(argv[i], "-ond")){ |
492 |
g_optopts |= ESTOPTNODBOPT; |
493 |
} else { |
494 |
usage(); |
495 |
} |
496 |
} else if(!dbname){ |
497 |
dbname = argv[i]; |
498 |
} else { |
499 |
usage(); |
500 |
} |
501 |
} |
502 |
if(!dbname) usage(); |
503 |
rv = procoptimize(dbname); |
504 |
return rv; |
505 |
} |
506 |
|
507 |
|
508 |
/* parse arguments of the search command */ |
509 |
static int runsearch(int argc, char **argv){ |
510 |
CBDATUM *pbuf; |
511 |
CBLIST *attrs; |
512 |
char *dbname, *ord, *phrase, *tmp; |
513 |
int i, max, sim, rv; |
514 |
dbname = NULL; |
515 |
ord = NULL; |
516 |
max = SEARCHMAX; |
517 |
sim = -1; |
518 |
pbuf = cbdatumopen("", 0); |
519 |
cbglobalgc(pbuf, (void (*)(void *))cbdatumclose); |
520 |
attrs = cblistopen(); |
521 |
cbglobalgc(attrs, (void (*)(void *))cblistclose); |
522 |
for(i = 2; i < argc; i++){ |
523 |
if(!dbname && argv[i][0] == '-'){ |
524 |
if(!strcmp(argv[i], "-ic")){ |
525 |
if(++i >= argc) usage(); |
526 |
g_inputcode = argv[i]; |
527 |
} else if(!strcmp(argv[i], "-gs")){ |
528 |
g_condopts |= ESTCONDSURE; |
529 |
} else if(!strcmp(argv[i], "-gf")){ |
530 |
g_condopts |= ESTCONDFAST; |
531 |
} else if(!strcmp(argv[i], "-ga")){ |
532 |
g_condopts |= ESTCONDAGIT; |
533 |
} else if(!strcmp(argv[i], "-ni")){ |
534 |
g_condopts |= ESTCONDNOIDF; |
535 |
} else if(!strcmp(argv[i], "-sf")){ |
536 |
g_condopts |= ESTCONDSIMPLE; |
537 |
} else if(!strcmp(argv[i], "-hs")){ |
538 |
g_condopts |= ESTCONDSCFB; |
539 |
} else if(!strcmp(argv[i], "-vu")){ |
540 |
g_viewmode = VM_URI; |
541 |
} else if(!strcmp(argv[i], "-va")){ |
542 |
g_viewmode = VM_ATTR; |
543 |
} else if(!strcmp(argv[i], "-vf")){ |
544 |
g_viewmode = VM_FULL; |
545 |
} else if(!strcmp(argv[i], "-vs")){ |
546 |
g_viewmode = VM_SNIP; |
547 |
} else if(!strcmp(argv[i], "-vh")){ |
548 |
g_viewmode = VM_HMRD; |
549 |
} else if(!strcmp(argv[i], "-vx")){ |
550 |
g_viewmode = VM_XML; |
551 |
} else if(!strcmp(argv[i], "-dd")){ |
552 |
g_viewmode = VM_DUMP; |
553 |
} else if(!strcmp(argv[i], "-attr")){ |
554 |
if(++i >= argc) usage(); |
555 |
cblistpush(attrs, argv[i], -1); |
556 |
} else if(!strcmp(argv[i], "-ord")){ |
557 |
if(++i >= argc) usage(); |
558 |
ord = argv[i]; |
559 |
} else if(!strcmp(argv[i], "-max")){ |
560 |
if(++i >= argc) usage(); |
561 |
max = atoi(argv[i]); |
562 |
} else if(!strcmp(argv[i], "-sim")){ |
563 |
if(++i >= argc) usage(); |
564 |
sim = atoi(argv[i]); |
565 |
} else { |
566 |
usage(); |
567 |
} |
568 |
} else if(!dbname){ |
569 |
dbname = argv[i]; |
570 |
} else { |
571 |
if(cbdatumsize(pbuf) > 0) cbdatumcat(pbuf, " ", 1); |
572 |
cbdatumcat(pbuf, argv[i], -1); |
573 |
} |
574 |
} |
575 |
if(!dbname) usage(); |
576 |
if(!(phrase = est_iconv(cbdatumptr(pbuf), -1, g_inputcode, "UTF-8", NULL, NULL))){ |
577 |
printferror("%s: unsupported encoding\n", g_inputcode); |
578 |
return 1; |
579 |
} |
580 |
cbstrtrim(phrase); |
581 |
for(i = 0; i < cblistnum(attrs); i++){ |
582 |
if((tmp = est_iconv(cblistval(attrs, i, NULL), -1, g_inputcode, "UTF-8", NULL, NULL)) != NULL){ |
583 |
cblistover(attrs, i, tmp, -1); |
584 |
free(tmp); |
585 |
} |
586 |
} |
587 |
rv = procsearch(dbname, phrase, attrs, ord, max, sim); |
588 |
free(phrase); |
589 |
return rv; |
590 |
} |
591 |
|
592 |
|
593 |
/* parse arguments of the gather command */ |
594 |
static int rungather(int argc, char **argv){ |
595 |
CBLIST *list; |
596 |
const char *elem; |
597 |
char *dbname, *filename; |
598 |
int i, j, rv; |
599 |
g_xcmdmap = cbmapopenex(MINIBNUM); |
600 |
cbglobalgc(g_xcmdmap, (void (*)(void *))cbmapclose); |
601 |
dbname = NULL; |
602 |
filename = NULL; |
603 |
g_inputcode = NULL; |
604 |
for(i = 2; i < argc; i++){ |
605 |
if(!dbname && argv[i][0] == '-'){ |
606 |
if(!strcmp(argv[i], "-cl")){ |
607 |
g_putopts |= ESTPDCLEAN; |
608 |
} else if(!strcmp(argv[i], "-fe")){ |
609 |
g_filefmt = FF_DRAFT; |
610 |
} else if(!strcmp(argv[i], "-ft")){ |
611 |
g_filefmt = FF_TEXT; |
612 |
} else if(!strcmp(argv[i], "-fh")){ |
613 |
g_filefmt = FF_HTML; |
614 |
} else if(!strcmp(argv[i], "-fm")){ |
615 |
g_filefmt = FF_MIME; |
616 |
} else if(!strcmp(argv[i], "-fx")){ |
617 |
if((i += 2) >= argc) usage(); |
618 |
list = cbsplit(argv[i-1], -1, ","); |
619 |
for(j = 0; j < cblistnum(list); j++){ |
620 |
elem = cblistval(list, j, NULL); |
621 |
if(elem[0] != '\0') cbmapput(g_xcmdmap, elem, -1, argv[i], -1, FALSE); |
622 |
} |
623 |
cblistclose(list); |
624 |
} else if(!strcmp(argv[i], "-fz")){ |
625 |
g_filefmt = FF_NONE; |
626 |
} else if(!strcmp(argv[i], "-fo")){ |
627 |
g_filtorig = TRUE; |
628 |
} else if(!strcmp(argv[i], "-ic")){ |
629 |
if(++i >= argc) usage(); |
630 |
g_inputcode = argv[i]; |
631 |
} else if(!strcmp(argv[i], "-il")){ |
632 |
if(++i >= argc) usage(); |
633 |
g_inputlang = strtolang(argv[i]); |
634 |
} else if(!strcmp(argv[i], "-pc")){ |
635 |
if(++i >= argc) usage(); |
636 |
g_pathcode = argv[i]; |
637 |
} else if(!strcmp(argv[i], "-pf")){ |
638 |
g_pathfull = TRUE; |
639 |
} else if(!strcmp(argv[i], "-apn")){ |
640 |
g_oextmodes |= ESTDBPERFNG; |
641 |
} else if(!strcmp(argv[i], "-sd")){ |
642 |
g_stdate = TRUE; |
643 |
} else if(!strcmp(argv[i], "-cm")){ |
644 |
g_chkmdate = TRUE; |
645 |
} else if(!strcmp(argv[i], "-cs")){ |
646 |
if(++i >= argc) usage(); |
647 |
g_cachesize = strtod(argv[i], NULL) * 1024 * 1024; |
648 |
} else { |
649 |
usage(); |
650 |
} |
651 |
} else if(!dbname){ |
652 |
dbname = argv[i]; |
653 |
} else if(!filename){ |
654 |
filename = argv[i]; |
655 |
} else { |
656 |
usage(); |
657 |
} |
658 |
} |
659 |
if(!dbname || !filename) usage(); |
660 |
rv = procgather(dbname, filename); |
661 |
return rv; |
662 |
} |
663 |
|
664 |
|
665 |
/* parse arguments of the purge command */ |
666 |
static int runpurge(int argc, char **argv){ |
667 |
char *dbname, *prefix; |
668 |
int i, rv; |
669 |
dbname = NULL; |
670 |
prefix = NULL; |
671 |
for(i = 2; i < argc; i++){ |
672 |
if(!dbname && argv[i][0] == '-'){ |
673 |
if(!strcmp(argv[i], "-cl")){ |
674 |
g_outopts |= ESTODCLEAN; |
675 |
} else if(!strcmp(argv[i], "-fc")){ |
676 |
g_doforce = TRUE; |
677 |
} else { |
678 |
usage(); |
679 |
} |
680 |
} else if(!dbname){ |
681 |
dbname = argv[i]; |
682 |
} else if(!prefix){ |
683 |
prefix = argv[i]; |
684 |
} else { |
685 |
usage(); |
686 |
} |
687 |
} |
688 |
if(!dbname) usage(); |
689 |
rv = procpurge(dbname, prefix); |
690 |
return rv; |
691 |
} |
692 |
|
693 |
|
694 |
/* parse arguments of the extkeys command */ |
695 |
static int runextkeys(int argc, char **argv){ |
696 |
char *dbname, *prefix; |
697 |
int i, ni, rv; |
698 |
dbname = NULL; |
699 |
prefix = NULL; |
700 |
ni = FALSE; |
701 |
for(i = 2; i < argc; i++){ |
702 |
if(!dbname && argv[i][0] == '-'){ |
703 |
if(!strcmp(argv[i], "-fc")){ |
704 |
g_doforce = TRUE; |
705 |
} else if(!strcmp(argv[i], "-ni")){ |
706 |
ni = TRUE; |
707 |
} else if(!strcmp(argv[i], "-kn")){ |
708 |
if(++i >= argc) usage(); |
709 |
g_kwordnum = atoi(argv[i]); |
710 |
} else { |
711 |
usage(); |
712 |
} |
713 |
} else if(!dbname){ |
714 |
dbname = argv[i]; |
715 |
} else if(!prefix){ |
716 |
prefix = argv[i]; |
717 |
} else { |
718 |
usage(); |
719 |
} |
720 |
} |
721 |
if(!dbname || g_kwordnum < 1) usage(); |
722 |
rv = procextkeys(dbname, prefix, ni); |
723 |
return rv; |
724 |
} |
725 |
|
726 |
|
727 |
/* parse arguments of the draft command */ |
728 |
static int rundraft(int argc, char **argv){ |
729 |
char *filename; |
730 |
int i, rv; |
731 |
filename = NULL; |
732 |
g_filefmt = FF_DRAFT; |
733 |
g_inputcode = NULL; |
734 |
for(i = 2; i < argc; i++){ |
735 |
if(!filename && argv[i][0] == '-'){ |
736 |
if(!strcmp(argv[i], "-ft")){ |
737 |
g_filefmt = FF_TEXT; |
738 |
} else if(!strcmp(argv[i], "-fh")){ |
739 |
g_filefmt = FF_HTML; |
740 |
} else if(!strcmp(argv[i], "-fm")){ |
741 |
g_filefmt = FF_MIME; |
742 |
} else if(!strcmp(argv[i], "-ic")){ |
743 |
if(++i >= argc) usage(); |
744 |
g_inputcode = argv[i]; |
745 |
} else if(!strcmp(argv[i], "-il")){ |
746 |
if(++i >= argc) usage(); |
747 |
g_inputlang = strtolang(argv[i]); |
748 |
} else { |
749 |
usage(); |
750 |
} |
751 |
} else if(!filename){ |
752 |
filename = argv[i]; |
753 |
} else { |
754 |
usage(); |
755 |
} |
756 |
} |
757 |
rv = procdraft(filename); |
758 |
return rv; |
759 |
} |
760 |
|
761 |
|
762 |
/* parse arguments of the break command */ |
763 |
static int runbreak(int argc, char **argv){ |
764 |
char *filename; |
765 |
int i, wt, rv; |
766 |
filename = NULL; |
767 |
wt = FALSE; |
768 |
for(i = 2; i < argc; i++){ |
769 |
if(!filename && argv[i][0] == '-'){ |
770 |
if(!strcmp(argv[i], "-ic")){ |
771 |
if(++i >= argc) usage(); |
772 |
g_inputcode = argv[i]; |
773 |
} else if(!strcmp(argv[i], "-il")){ |
774 |
if(++i >= argc) usage(); |
775 |
g_inputlang = strtolang(argv[i]); |
776 |
} else if(!strcmp(argv[i], "-apn")){ |
777 |
g_oextmodes |= ESTDBPERFNG; |
778 |
} else if(!strcmp(argv[i], "-wt")){ |
779 |
wt = TRUE; |
780 |
} else { |
781 |
usage(); |
782 |
} |
783 |
} else if(!filename){ |
784 |
filename = argv[i]; |
785 |
} else { |
786 |
usage(); |
787 |
} |
788 |
} |
789 |
rv = procbreak(filename, wt); |
790 |
return rv; |
791 |
} |
792 |
|
793 |
|
794 |
/* parse arguments of the randput command */ |
795 |
static int runrandput(int argc, char **argv){ |
796 |
char *dbname, *dnstr; |
797 |
int i, dnum, rv; |
798 |
dbname = NULL; |
799 |
dnstr = NULL; |
800 |
for(i = 2; i < argc; i++){ |
801 |
if(!dbname && argv[i][0] == '-'){ |
802 |
if(!strcmp(argv[i], "-ren")){ |
803 |
g_rdmode = RD_ENG; |
804 |
} else if(!strcmp(argv[i], "-rla")){ |
805 |
g_rdmode = RD_LAT; |
806 |
} else if(!strcmp(argv[i], "-reu")){ |
807 |
g_rdmode = RD_EURO; |
808 |
} else if(!strcmp(argv[i], "-ror")){ |
809 |
g_rdmode = RD_ORI; |
810 |
} else if(!strcmp(argv[i], "-rjp")){ |
811 |
g_rdmode = RD_JPN; |
812 |
} else if(!strcmp(argv[i], "-rch")){ |
813 |
g_rdmode = RD_CHAO; |
814 |
} else if(!strcmp(argv[i], "-cs")){ |
815 |
if(++i >= argc) usage(); |
816 |
g_cachesize = strtod(argv[i], NULL) * 1024 * 1024; |
817 |
} else { |
818 |
usage(); |
819 |
} |
820 |
} else if(!dbname){ |
821 |
dbname = argv[i]; |
822 |
} else if(!dnstr){ |
823 |
dnstr = argv[i]; |
824 |
} else { |
825 |
usage(); |
826 |
} |
827 |
} |
828 |
if(!dbname || !dnstr) usage(); |
829 |
if((dnum = atoi(dnstr)) < 1) usage(); |
830 |
rv = procrandput(dbname, dnum); |
831 |
return rv; |
832 |
} |
833 |
|
834 |
|
835 |
/* Parse the arguments of the wicked command: db dnum.
   Both operands are required and no options are accepted.  `dnum' must convert
   to an integer of at least 1.  Any malformed command line ends in usage(),
   which exits.
   Returns the result of procwicked. */
static int runwicked(int argc, char **argv){
  char *dbname = NULL;
  char *dnstr = NULL;
  char *arg;
  int i, dnum;
  for(i = 2; i < argc; i++){
    arg = argv[i];
    if(dbname == NULL){
      if(arg[0] == '-') usage();
      dbname = arg;
    } else if(dnstr == NULL){
      dnstr = arg;
    } else {
      usage();
    }
  }
  if(dbname == NULL || dnstr == NULL) usage();
  dnum = atoi(dnstr);
  if(dnum < 1) usage();
  return procwicked(dbname, dnum);
}
857 |
|
858 |
|
859 |
/* Parse the arguments of the regression command: db.
   Exactly one database name is expected and no options are accepted; anything
   else ends in usage(), which exits.
   Returns the result of procregression. */
static int runregression(int argc, char **argv){
  char *dbname = NULL;
  int i;
  for(i = 2; i < argc; i++){
    /* a leading option or a second operand is an error */
    if(dbname != NULL || argv[i][0] == '-') usage();
    dbname = argv[i];
  }
  if(dbname == NULL) usage();
  return procregression(dbname);
}
877 |
|
878 |
|
879 |
/* perform the put command */ |
880 |
static int procput(const char *dbname, const char *filename){ |
881 |
ESTDB *db; |
882 |
ESTDOC *doc; |
883 |
const char *uri; |
884 |
char *draft; |
885 |
int ecode; |
886 |
if(!(draft = cbreadfile(filename, NULL))){ |
887 |
printferror("%s: could not open", filename ? filename : "(stdin)"); |
888 |
return 1; |
889 |
} |
890 |
if(!(db = est_db_open(dbname, ESTDBWRITER | ESTDBCREAT, &ecode))){ |
891 |
printferror("%s: %s", dbname, est_err_msg(ecode)); |
892 |
free(draft); |
893 |
return 1; |
894 |
} |
895 |
est_db_set_informer(db, dbinform); |
896 |
doc = est_doc_new_from_draft(draft); |
897 |
if(!est_db_put_doc(db, doc, g_putopts)){ |
898 |
printferror("%s: %s", dbname, est_err_msg(est_db_error(db))); |
899 |
est_doc_delete(doc); |
900 |
est_db_close(db, &ecode); |
901 |
free(draft); |
902 |
return 1; |
903 |
} |
904 |
if(!(uri = est_doc_attr(doc, ESTDATTRURI))) uri = ""; |
905 |
printfinfo("%d (%s): registered", est_doc_id(doc), uri); |
906 |
est_doc_delete(doc); |
907 |
if(!est_db_close(db, &ecode)){ |
908 |
printferror("%s: %s", dbname, est_err_msg(ecode)); |
909 |
free(draft); |
910 |
return 1; |
911 |
} |
912 |
free(draft); |
913 |
return 0; |
914 |
} |
915 |
|
916 |
|
917 |
/* perform the out command */ |
918 |
static int procout(const char *dbname, int id, const char *expr){ |
919 |
ESTDB *db; |
920 |
int ecode; |
921 |
if(!(db = est_db_open(dbname, ESTDBWRITER, &ecode))){ |
922 |
printferror("%s: %s", dbname, est_err_msg(ecode)); |
923 |
return 1; |
924 |
} |
925 |
est_db_set_informer(db, dbinform); |
926 |
if(expr && (id = est_db_uri_to_id(db, expr)) < 1){ |
927 |
printferror("%s: %s", dbname, est_err_msg(est_db_error(db))); |
928 |
est_db_close(db, &ecode); |
929 |
return 1; |
930 |
} |
931 |
if(!est_db_out_doc(db, id, g_outopts)){ |
932 |
printferror("%s: %s", dbname, est_err_msg(est_db_error(db))); |
933 |
est_db_close(db, &ecode); |
934 |
return 1; |
935 |
} |
936 |
printfinfo("%d: deleted", id); |
937 |
if(!est_db_close(db, &ecode)){ |
938 |
printferror("%s: %s", dbname, est_err_msg(ecode)); |
939 |
return 1; |
940 |
} |
941 |
return 0; |
942 |
} |
943 |
|
944 |
|
945 |
/* perform the get command */ |
946 |
static int procget(const char *dbname, int id, const char *expr, const char *attr){ |
947 |
ESTDB *db; |
948 |
ESTDOC *doc; |
949 |
char *draft; |
950 |
int ecode; |
951 |
if(!(db = est_db_open(dbname, ESTDBREADER | ESTDBLCKNB, &ecode))){ |
952 |
printferror("%s: %s", dbname, est_err_msg(ecode)); |
953 |
return 1; |
954 |
} |
955 |
if(expr && (id = est_db_uri_to_id(db, expr)) < 1){ |
956 |
printferror("%s: %s", dbname, est_err_msg(est_db_error(db))); |
957 |
est_db_close(db, &ecode); |
958 |
return 1; |
959 |
} |
960 |
if(attr){ |
961 |
if(!(draft = est_db_get_doc_attr(db, id, attr))){ |
962 |
printferror("%s: %s", dbname, est_err_msg(est_db_error(db))); |
963 |
est_db_close(db, &ecode); |
964 |
return 1; |
965 |
} |
966 |
printf("%s\n", draft); |
967 |
free(draft); |
968 |
} else { |
969 |
if(!(doc = est_db_get_doc(db, id, 0))){ |
970 |
printferror("%s: %s", dbname, est_err_msg(est_db_error(db))); |
971 |
est_db_close(db, &ecode); |
972 |
return 1; |
973 |
} |
974 |
draft = est_doc_dump_draft(doc); |
975 |
printf("%s", draft); |
976 |
free(draft); |
977 |
est_doc_delete(doc); |
978 |
} |
979 |
if(!est_db_close(db, &ecode)){ |
980 |
printferror("%s: %s", dbname, est_err_msg(ecode)); |
981 |
return 1; |
982 |
} |
983 |
return 0; |
984 |
} |
985 |
|
986 |
|
987 |
/* perform the list command */ |
988 |
static int proclist(const char *dbname){ |
989 |
ESTDB *db; |
990 |
ESTDOC *doc; |
991 |
const char *vbuf; |
992 |
int ecode, id; |
993 |
if(!(db = est_db_open(dbname, ESTDBREADER | ESTDBLCKNB, &ecode))){ |
994 |
printferror("%s: %s", dbname, est_err_msg(ecode)); |
995 |
return 1; |
996 |
} |
997 |
if(!est_db_iter_init(db)){ |
998 |
printferror("%s: %s", dbname, est_err_msg(est_db_error(db))); |
999 |
est_db_close(db, &ecode); |
1000 |
return 1; |
1001 |
} |
1002 |
while((id = est_db_iter_next(db)) > 0){ |
1003 |
if((doc = est_db_get_doc(db, id, ESTGDNOTEXT)) != NULL){ |
1004 |
if(!(vbuf = est_doc_attr(doc, ESTDATTRURI))) vbuf = ""; |
1005 |
printf("%d\t%s\n", id, vbuf); |
1006 |
est_doc_delete(doc); |
1007 |
} |
1008 |
} |
1009 |
if(!est_db_close(db, &ecode)){ |
1010 |
printferror("%s: %s", dbname, est_err_msg(ecode)); |
1011 |
return 1; |
1012 |
} |
1013 |
return 0; |
1014 |
} |
1015 |
|
1016 |
|
1017 |
/* perform the uriid command */ |
1018 |
static int procuriid(const char *dbname, const char *uri){ |
1019 |
ESTDB *db; |
1020 |
int ecode, id; |
1021 |
if(!(db = est_db_open(dbname, ESTDBREADER | ESTDBLCKNB, &ecode))){ |
1022 |
printferror("%s: %s", dbname, est_err_msg(ecode)); |
1023 |
return 1; |
1024 |
} |
1025 |
if((id = est_db_uri_to_id(db, uri)) == -1){ |
1026 |
printferror("%s: %s", dbname, est_err_msg(est_db_error(db))); |
1027 |
est_db_close(db, &ecode); |
1028 |
return 1; |
1029 |
} |
1030 |
printf("%d\n", id); |
1031 |
if(!est_db_close(db, &ecode)){ |
1032 |
printferror("%s: %s", dbname, est_err_msg(ecode)); |
1033 |
return 1; |
1034 |
} |
1035 |
return 0; |
1036 |
} |
1037 |
|
1038 |
|
1039 |
/* perform the meta command */ |
1040 |
static int procmeta(const char *dbname, const char *mname, const char *mvalue){ |
1041 |
ESTDB *db; |
1042 |
CBLIST *names; |
1043 |
char *vbuf; |
1044 |
int i, ecode; |
1045 |
if(!(db = est_db_open(dbname, mvalue ? (ESTDBWRITER | ESTDBCREAT) : (ESTDBREADER | ESTDBLCKNB), |
1046 |
&ecode))){ |
1047 |
printferror("%s: %s", dbname, est_err_msg(ecode)); |
1048 |
return 1; |
1049 |
} |
1050 |
if(mname){ |
1051 |
if(mvalue){ |
1052 |
est_db_add_meta(db, mname, mvalue[0] != '\0' ? mvalue : NULL); |
1053 |
} else { |
1054 |
if((vbuf = est_db_meta(db, mname)) != NULL){ |
1055 |
printf("%s\n", vbuf); |
1056 |
free(vbuf); |
1057 |
} |
1058 |
} |
1059 |
} else { |
1060 |
names = est_db_meta_names(db); |
1061 |
for(i = 0; i < cblistnum(names); i++){ |
1062 |
printf("%s\n", cblistval(names, i, NULL)); |
1063 |
} |
1064 |
cblistclose(names); |
1065 |
} |
1066 |
if(!est_db_close(db, &ecode)){ |
1067 |
printferror("%s: %s", dbname, est_err_msg(ecode)); |
1068 |
return 1; |
1069 |
} |
1070 |
return 0; |
1071 |
} |
1072 |
|
1073 |
|
1074 |
/* perform the inform command */ |
1075 |
static int procinform(const char *dbname){ |
1076 |
ESTDB *db; |
1077 |
int ecode; |
1078 |
if(!(db = est_db_open(dbname, ESTDBREADER | ESTDBLCKNB, &ecode))){ |
1079 |
printferror("%s: %s", dbname, est_err_msg(ecode)); |
1080 |
return 1; |
1081 |
} |
1082 |
printf("number of documents: %d\n", est_db_doc_num(db)); |
1083 |
printf("number of words: %d\n", est_db_word_num(db)); |
1084 |
printf("file size: %.0f\n", est_db_size(db)); |
1085 |
if(!est_db_close(db, &ecode)){ |
1086 |
printferror("%s: %s", dbname, est_err_msg(ecode)); |
1087 |
return 1; |
1088 |
} |
1089 |
return 0; |
1090 |
} |
1091 |
|
1092 |
|
1093 |
/* perform the optimize command */ |
1094 |
static int procoptimize(const char *dbname){ |
1095 |
ESTDB *db; |
1096 |
char path[URIBUFSIZ]; |
1097 |
int ecode; |
1098 |
time_t curtime; |
1099 |
curtime = time(NULL); |
1100 |
if(!(db = est_db_open(dbname, ESTDBWRITER, &ecode))){ |
1101 |
printferror("%s: %s", dbname, est_err_msg(ecode)); |
1102 |
return 1; |
1103 |
} |
1104 |
est_db_set_informer(db, dbinform); |
1105 |
sprintf(path, "%s%c%s", dbname, ESTPATHCHR, KWDBNAME); |
1106 |
unlink(path); |
1107 |
if(!est_db_optimize(db, g_optopts)){ |
1108 |
printferror("%s: %s", dbname, est_err_msg(est_db_error(db))); |
1109 |
est_db_close(db, &ecode); |
1110 |
return 1; |
1111 |
} |
1112 |
if(!est_db_close(db, &ecode)){ |
1113 |
printferror("%s: %s", dbname, est_err_msg(ecode)); |
1114 |
return 1; |
1115 |
} |
1116 |
curtime = time(NULL) - curtime; |
1117 |
printfinfo("finished successfully: elapsed time: %dh %dm %ds", |
1118 |
(int)(curtime / 3600), (int)((curtime / 60) % 60), (int)(curtime % 60)); |
1119 |
return 0; |
1120 |
} |
1121 |
|
1122 |
|
1123 |
/* perform the search command */ |
1124 |
static int procsearch(const char *dbname, const char *phrase, |
1125 |
const CBLIST *attrs, const char *ord, int max, int sim){ |
1126 |
ESTDB *db; |
1127 |
ESTCOND *cond; |
1128 |
ESTDOC *doc; |
1129 |
CURIA *kwdb; |
1130 |
CBDATUM *pbuf; |
1131 |
CBMAP *svmap, *hints, *kwords; |
1132 |
CBLIST *names, *words, *lines; |
1133 |
const char *kbuf, *vbuf, *line; |
1134 |
char *draft, path[URIBUFSIZ], numbuf[NUMBUFSIZ], *word, *pv; |
1135 |
int i, j, ecode, ksiz, vsiz, *res, rnum, id, sc, fin, cnt; |
1136 |
double curtime; |
1137 |
if(!(db = est_db_open(dbname, ESTDBREADER | ESTDBLCKNB, &ecode))){ |
1138 |
printferror("%s: %s", dbname, est_err_msg(ecode)); |
1139 |
return 1; |
1140 |
} |
1141 |
sprintf(path, "%s%c%s", dbname, ESTPATHCHR, KWDBNAME); |
1142 |
if((kwdb = cropen(path, CR_OREADER, -1, -1)) != NULL) |
1143 |
est_db_set_vectorizer(db, vectorizer, kwdb); |
1144 |
cond = est_cond_new(); |
1145 |
if(sim > 0){ |
1146 |
svmap = kwdb ? vectorizer(db, sim, kwdb) : NULL; |
1147 |
if(!svmap && (doc = est_db_get_doc(db, sim, 0)) != NULL){ |
1148 |
svmap = est_db_etch_doc((g_condopts & ESTCONDNOIDF) ? NULL : db, doc, KWORDNUM); |
1149 |
est_doc_delete(doc); |
1150 |
} |
1151 |
if(svmap){ |
1152 |
pbuf = cbdatumopen(ESTOPSIMILAR, -1); |
1153 |
cbmapiterinit(svmap); |
1154 |
while((kbuf = cbmapiternext(svmap, &ksiz)) != NULL){ |
1155 |
vbuf = cbmapget(svmap, kbuf, ksiz, &vsiz); |
1156 |
cbdatumcat(pbuf, " WITH ", -1); |
1157 |
cbdatumcat(pbuf, vbuf, vsiz); |
1158 |
cbdatumcat(pbuf, " ", 1); |
1159 |
cbdatumcat(pbuf, kbuf, ksiz); |
1160 |
} |
1161 |
est_cond_set_phrase(cond, cbdatumptr(pbuf)); |
1162 |
cbdatumclose(pbuf); |
1163 |
cbmapclose(svmap); |
1164 |
} |
1165 |
} else { |
1166 |
while(*phrase > '\0' && *phrase <= ' '){ |
1167 |
phrase++; |
1168 |
} |
1169 |
if(phrase[0] != '\0' || cblistnum(attrs) < 1) est_cond_set_phrase(cond, phrase); |
1170 |
} |
1171 |
for(i = 0; i < cblistnum(attrs); i++){ |
1172 |
est_cond_add_attr(cond, cblistval(attrs, i, NULL)); |
1173 |
} |
1174 |
if(ord) est_cond_set_order(cond, ord); |
1175 |
if(max >= 0) est_cond_set_max(cond, max); |
1176 |
est_cond_set_options(cond, g_condopts); |
1177 |
hints = cbmapopenex(MINIBNUM); |
1178 |
curtime = est_gettimeofday(); |
1179 |
res = est_db_search(db, cond, &rnum, hints); |
1180 |
curtime = est_gettimeofday() - curtime; |
1181 |
if(g_viewmode == VM_XML){ |
1182 |
xmlprintf("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"); |
1183 |
xmlprintf("<estresult version=\"%@\">\n", est_version); |
1184 |
xmlprintf("<meta>\n"); |
1185 |
xmlprintf("<hit number=\"%@\"/>\n", cbmapget(hints, "", 0, NULL)); |
1186 |
cbmapiterinit(hints); |
1187 |
while((kbuf = cbmapiternext(hints, NULL)) != NULL){ |
1188 |
if(kbuf[0] == '\0') continue; |
1189 |
vbuf = cbmapget(hints, kbuf, -1, NULL); |
1190 |
xmlprintf("<hit key=\"%@\" number=\"%@\"/>\n", kbuf, vbuf); |
1191 |
} |
1192 |
xmlprintf("<time time=\"%.3f\"/>\n", curtime / 1000.0); |
1193 |
xmlprintf("<total documents=\"%d\" words=\"%d\"/>\n", |
1194 |
est_db_doc_num(db), est_db_word_num(db)); |
1195 |
xmlprintf("</meta>\n"); |
1196 |
} else { |
1197 |
printf("%s\n", est_border_str()); |
1198 |
printf("VERSION\t%s\n", _EST_PROTVER); |
1199 |
printf("NODE\tlocal\n"); |
1200 |
printf("HIT\t%s\n", cbmapget(hints, "", 0, NULL)); |
1201 |
cbmapiterinit(hints); |
1202 |
cnt = 1; |
1203 |
while((kbuf = cbmapiternext(hints, NULL)) != NULL){ |
1204 |
if(kbuf[0] == '\0') continue; |
1205 |
vbuf = cbmapget(hints, kbuf, -1, NULL); |
1206 |
printf("HINT#%d\t%s\t%s\n", cnt, kbuf, vbuf); |
1207 |
cnt++; |
1208 |
} |
1209 |
printf("TIME\t%.3f\n", curtime / 1000.0); |
1210 |
printf("DOCNUM\t%d\n", est_db_doc_num(db)); |
1211 |
printf("WORDNUM\t%d\n", est_db_word_num(db)); |
1212 |
switch(g_viewmode){ |
1213 |
case VM_ID: |
1214 |
printf("VIEW\tID\n"); |
1215 |
break; |
1216 |
case VM_URI: |
1217 |
printf("VIEW\tURI\n"); |
1218 |
break; |
1219 |
case VM_ATTR: |
1220 |
printf("VIEW\tATTRIBUTE\n"); |
1221 |
break; |
1222 |
case VM_FULL: |
1223 |
printf("VIEW\tFULL\n"); |
1224 |
break; |
1225 |
case VM_SNIP: |
1226 |
printf("VIEW\tSNIPPET\n"); |
1227 |
break; |
1228 |
case VM_HMRD: |
1229 |
printf("VIEW\tHUMAN\n"); |
1230 |
break; |
1231 |
} |
1232 |
printf("\n"); |
1233 |
if(g_viewmode == VM_ID || g_viewmode == VM_URI || |
1234 |
g_viewmode == VM_HMRD || g_viewmode == VM_DUMP) printf("%s\n", est_border_str()); |
1235 |
} |
1236 |
for(i = 0; i < rnum ; i++){ |
1237 |
id = res[i]; |
1238 |
sc = est_cond_score(cond, i); |
1239 |
switch(g_viewmode){ |
1240 |
case VM_URI: |
1241 |
if((doc = est_db_get_doc(db, id, ESTGDNOTEXT)) != NULL){ |
1242 |
if(!(vbuf = est_doc_attr(doc, ESTDATTRURI))) vbuf = ""; |
1243 |
printf("%d\t%s\n", id, vbuf); |
1244 |
est_doc_delete(doc); |
1245 |
} |
1246 |
break; |
1247 |
case VM_ATTR: |
1248 |
if((doc = est_db_get_doc(db, id, ESTGDNOTEXT)) != NULL){ |
1249 |
if(sc >= 0){ |
1250 |
sprintf(numbuf, "%d", sc); |
1251 |
est_doc_add_attr(doc, DATTRSCORE, numbuf); |
1252 |
} |
1253 |
printf("%s\n", est_border_str()); |
1254 |
names = est_doc_attr_names(doc); |
1255 |
for(j = 0; j < cblistnum(names); j++){ |
1256 |
kbuf = cblistval(names, j, NULL); |
1257 |
vbuf = est_doc_attr(doc, kbuf); |
1258 |
printf("%s=%s\n", kbuf, vbuf); |
1259 |
} |
1260 |
cblistclose(names); |
1261 |
est_doc_delete(doc); |
1262 |
} |
1263 |
printf("\n"); |
1264 |
break; |
1265 |
case VM_FULL: |
1266 |
if((doc = est_db_get_doc(db, id, 0)) != NULL){ |
1267 |
if(sc >= 0){ |
1268 |
sprintf(numbuf, "%d", sc); |
1269 |
est_doc_add_attr(doc, DATTRSCORE, numbuf); |
1270 |
} |
1271 |
printf("%s\n", est_border_str()); |
1272 |
draft = est_doc_dump_draft(doc); |
1273 |
printf("%s", draft); |
1274 |
free(draft); |
1275 |
est_doc_delete(doc); |
1276 |
} |
1277 |
break; |
1278 |
case VM_SNIP: |
1279 |
if((doc = est_db_get_doc(db, id, 0)) != NULL){ |
1280 |
if(sc >= 0){ |
1281 |
sprintf(numbuf, "%d", sc); |
1282 |
est_doc_add_attr(doc, DATTRSCORE, numbuf); |
1283 |
} |
1284 |
printf("%s\n", est_border_str()); |
1285 |
names = est_doc_attr_names(doc); |
1286 |
for(j = 0; j < cblistnum(names); j++){ |
1287 |
kbuf = cblistval(names, j, NULL); |
1288 |
vbuf = est_doc_attr(doc, kbuf); |
1289 |
printf("%s=%s\n", kbuf, vbuf); |
1290 |
} |
1291 |
cblistclose(names); |
1292 |
kwords = kwdb ? vectorizer(db, id, kwdb) : NULL; |
1293 |
if(!kwords) kwords = est_db_etch_doc(db, doc, KWORDNUM); |
1294 |
if(cbmaprnum(kwords) > 0){ |
1295 |
printf("%s=", DATTRKWORDS); |
1296 |
cbmapiterinit(kwords); |
1297 |
for(j = 0; (kbuf = cbmapiternext(kwords, NULL)) != NULL; j++){ |
1298 |
if(j > 0) printf(" "); |
1299 |
printf("%s %s", kbuf, cbmapget(kwords, kbuf, -1, NULL)); |
1300 |
} |
1301 |
printf("\n"); |
1302 |
} |
1303 |
cbmapclose(kwords); |
1304 |
printf("\n"); |
1305 |
words = cbmapkeys(hints); |
1306 |
draft = est_doc_make_snippet(doc, words, SNIPWWIDTH, SNIPHWIDTH, SNIPAWIDTH); |
1307 |
printf("%s", draft); |
1308 |
free(draft); |
1309 |
cblistclose(words); |
1310 |
est_doc_delete(doc); |
1311 |
} |
1312 |
break; |
1313 |
case VM_HMRD: |
1314 |
if((doc = est_db_get_doc(db, id, 0)) != NULL){ |
1315 |
if(sc >= 0){ |
1316 |
sprintf(numbuf, "%d", sc); |
1317 |
est_doc_add_attr(doc, DATTRSCORE, numbuf); |
1318 |
} |
1319 |
printf("\n"); |
1320 |
if((vbuf = est_doc_attr(doc, ESTDATTRURI)) != NULL) printf("URI: %s\n", vbuf); |
1321 |
if((vbuf = est_doc_attr(doc, ESTDATTRTITLE)) != NULL) printf("Title: %s\n", vbuf); |
1322 |
printf(" "); |
1323 |
words = cbmapkeys(hints); |
1324 |
draft = est_doc_make_snippet(doc, words, SNIPWWIDTH, SNIPHWIDTH, SNIPAWIDTH); |
1325 |
lines = cbsplit(draft, -1, "\n"); |
1326 |
fin = TRUE; |
1327 |
for(j = 0; j < cblistnum(lines); j++){ |
1328 |
line = cblistval(lines, j, NULL); |
1329 |
if(line[0] != '\0'){ |
1330 |
word = cbmemdup(line, -1); |
1331 |
if((pv = strchr(word, '\t')) != NULL) *pv = '\0'; |
1332 |
printf("%s", word); |
1333 |
free(word); |
1334 |
fin = TRUE; |
1335 |
} else if(fin){ |
1336 |
printf(" ... "); |
1337 |
fin = FALSE; |
1338 |
} |
1339 |
} |
1340 |
cblistclose(lines); |
1341 |
free(draft); |
1342 |
cblistclose(words); |
1343 |
printf("\n\n"); |
1344 |
est_doc_delete(doc); |
1345 |
} |
1346 |
break; |
1347 |
case VM_XML: |
1348 |
if((doc = est_db_get_doc(db, id, 0)) != NULL){ |
1349 |
if(sc >= 0){ |
1350 |
sprintf(numbuf, "%d", sc); |
1351 |
est_doc_add_attr(doc, DATTRSCORE, numbuf); |
1352 |
} |
1353 |
if(!(vbuf = est_doc_attr(doc, ESTDATTRURI))) vbuf = ""; |
1354 |
xmlprintf("<document id=\"%d\" uri=\"%@\">\n", id, vbuf); |
1355 |
names = est_doc_attr_names(doc); |
1356 |
for(j = 0; j < cblistnum(names); j++){ |
1357 |
kbuf = cblistval(names, j, NULL); |
1358 |
if(!strcmp(kbuf, ESTDATTRID) || !strcmp(kbuf, ESTDATTRURI)) continue; |
1359 |
vbuf = est_doc_attr(doc, kbuf); |
1360 |
xmlprintf("<attribute name=\"%@\" value=\"%@\"/>\n", kbuf, vbuf); |
1361 |
} |
1362 |
cblistclose(names); |
1363 |
kwords = kwdb ? vectorizer(db, id, kwdb) : NULL; |
1364 |
if(!kwords) kwords = est_db_etch_doc(db, doc, KWORDNUM); |
1365 |
if(cbmaprnum(kwords) > 0){ |
1366 |
xmlprintf("<vector>"); |
1367 |
cbmapiterinit(kwords); |
1368 |
for(j = 0; (kbuf = cbmapiternext(kwords, NULL)) != NULL; j++){ |
1369 |
xmlprintf("<element key=\"%@\" number=\"%@\"/>", |
1370 |
kbuf, cbmapget(kwords, kbuf, -1, NULL)); |
1371 |
} |
1372 |
xmlprintf("</vector>\n"); |
1373 |
} |
1374 |
cbmapclose(kwords); |
1375 |
words = cbmapkeys(hints); |
1376 |
draft = est_doc_make_snippet(doc, words, SNIPWWIDTH, SNIPHWIDTH, SNIPAWIDTH); |
1377 |
lines = cbsplit(draft, -1, "\n"); |
1378 |
fin = TRUE; |
1379 |
xmlprintf("<snippet>"); |
1380 |
for(j = 0; j < cblistnum(lines); j++){ |
1381 |
line = cblistval(lines, j, NULL); |
1382 |
if(line[0] != '\0'){ |
1383 |
word = cbmemdup(line, -1); |
1384 |
if((pv = strchr(word, '\t')) != NULL){ |
1385 |
*pv = '\0'; |
1386 |
pv++; |
1387 |
xmlprintf("<key normal=\"%@\">%@</key>", pv, word); |
1388 |
} else { |
1389 |
xmlprintf("%@", word); |
1390 |
} |
1391 |
free(word); |
1392 |
fin = TRUE; |
1393 |
} else if(fin){ |
1394 |
xmlprintf("<delimiter/>"); |
1395 |
fin = FALSE; |
1396 |
} |
1397 |
} |
1398 |
xmlprintf("</snippet>\n"); |
1399 |
cblistclose(lines); |
1400 |
free(draft); |
1401 |
cblistclose(words); |
1402 |
xmlprintf("</document>\n"); |
1403 |
est_doc_delete(doc); |
1404 |
} |
1405 |
break; |
1406 |
case VM_DUMP: |
1407 |
if((doc = est_db_get_doc(db, id, 0)) != NULL){ |
1408 |
if(sc >= 0){ |
1409 |
sprintf(numbuf, "%d", sc); |
1410 |
est_doc_add_attr(doc, DATTRSCORE, numbuf); |
1411 |
} |
1412 |
if(!(vbuf = est_doc_attr(doc, ESTDATTRURI))) vbuf = ""; |
1413 |
sprintf(path, "%08d%cest", id, ESTEXTCHR); |
1414 |
printf("%s\t%s\n", path, vbuf); |
1415 |
draft = est_doc_dump_draft(doc); |
1416 |
if(!(cbwritefile(path, draft, -1))) printferror("%s: could not open", path); |
1417 |
free(draft); |
1418 |
est_doc_delete(doc); |
1419 |
} |
1420 |
break; |
1421 |
default: |
1422 |
printf("%d\n", id); |
1423 |
break; |
1424 |
} |
1425 |
} |
1426 |
if(g_viewmode == VM_XML){ |
1427 |
xmlprintf("</estresult>\n"); |
1428 |
} else { |
1429 |
printf("%s:END\n", est_border_str()); |
1430 |
} |
1431 |
free(res); |
1432 |
cbmapclose(hints); |
1433 |
est_cond_delete(cond); |
1434 |
if(kwdb) crclose(kwdb); |
1435 |
if(!est_db_close(db, &ecode)){ |
1436 |
printferror("%s: %s", dbname, est_err_msg(ecode)); |
1437 |
return 1; |
1438 |
} |
1439 |
return 0; |
1440 |
} |
1441 |
|
1442 |
|
1443 |
/* perform the gather command */ |
1444 |
static int procgather(const char *dbname, const char *filename){ |
1445 |
ESTDB *db; |
1446 |
CBLIST *list, *clist; |
1447 |
FILE *ifp; |
1448 |
const char *tmp; |
1449 |
char *line, *path; |
1450 |
int i, err, ecode; |
1451 |
time_t curtime; |
1452 |
struct stat sbuf; |
1453 |
curtime = time(NULL); |
1454 |
err = FALSE; |
1455 |
if(stat(filename, &sbuf) != -1 && S_ISDIR(sbuf.st_mode)){ |
1456 |
printfinfo("reading list from the directory: %s", filename); |
1457 |
if((db = est_db_open(dbname, ESTDBWRITER | ESTDBCREAT | g_oextmodes, &ecode)) != NULL){ |
1458 |
est_db_set_informer(db, dbinform); |
1459 |
if(g_cachesize > 0){ |
1460 |
if(g_cachesize > CACHEMAX) g_cachesize = CACHEMAX; |
1461 |
est_db_set_cache_size(db, g_cachesize, -1, -1); |
1462 |
} |
1463 |
list = cblistopen(); |
1464 |
cblistunshift(list, filename, -1); |
1465 |
while((line = cblistshift(list, NULL)) != NULL){ |
1466 |
if(stat(line, &sbuf) != -1 && S_ISDIR(sbuf.st_mode) && (clist = cbdirlist(line)) != NULL){ |
1467 |
cblistsort(clist); |
1468 |
for(i = cblistnum(clist) - 1; i >= 0; i--){ |
1469 |
tmp = cblistval(clist, i, NULL); |
1470 |
if(!strcmp(tmp, ESTCDIRSTR) || !strcmp(tmp, ESTPDIRSTR)) continue; |
1471 |
path = cbsprintf("%s%c%s", line, ESTPATHCHR, tmp); |
1472 |
cblistunshift(list, path, -1); |
1473 |
free(path); |
1474 |
} |
1475 |
cblistclose(clist); |
1476 |
} else { |
1477 |
if(!doputdoc(db, line)){ |
1478 |
printferror("%s: %s", line, est_err_msg(est_db_error(db))); |
1479 |
err = TRUE; |
1480 |
} |
1481 |
} |
1482 |
free(line); |
1483 |
if(err || g_sigterm) break; |
1484 |
} |
1485 |
cblistclose(list); |
1486 |
if(!est_db_close(db, &ecode)){ |
1487 |
printferror("%s: %s", dbname, est_err_msg(ecode)); |
1488 |
err = TRUE; |
1489 |
} |
1490 |
} else { |
1491 |
printferror("%s: %s", dbname, est_err_msg(ecode)); |
1492 |
err = TRUE; |
1493 |
} |
1494 |
} else { |
1495 |
if(!strcmp(filename, "-")){ |
1496 |
ifp = stdin; |
1497 |
printfinfo("reading list from the standard input", filename); |
1498 |
} else if((ifp = fopen(filename, "rb")) != NULL){ |
1499 |
printfinfo("reading list from the file: %s", filename); |
1500 |
} else { |
1501 |
printferror("%s: could not open", filename); |
1502 |
return 1; |
1503 |
} |
1504 |
if((db = est_db_open(dbname, ESTDBWRITER | ESTDBCREAT | g_oextmodes, &ecode)) != NULL){ |
1505 |
est_db_set_informer(db, dbinform); |
1506 |
if(g_cachesize > 0){ |
1507 |
if(g_cachesize > CACHEMAX) g_cachesize = CACHEMAX; |
1508 |
est_db_set_cache_size(db, g_cachesize, -1, -1); |
1509 |
} |
1510 |
while((line = fgetl(ifp)) != NULL){ |
1511 |
if(!doputdoc(db, line)){ |
1512 |
printferror("%s: %s", line, est_err_msg(est_db_error(db))); |
1513 |
err = TRUE; |
1514 |
} |
1515 |
free(line); |
1516 |
if(err || g_sigterm) break; |
1517 |
} |
1518 |
if(!est_db_close(db, &ecode)){ |
1519 |
printferror("%s: %s", dbname, est_err_msg(ecode)); |
1520 |
err = TRUE; |
1521 |
} |
1522 |
} else { |
1523 |
printferror("%s: %s", dbname, est_err_msg(ecode)); |
1524 |
err = TRUE; |
1525 |
} |
1526 |
if(ifp != stdin) fclose(ifp); |
1527 |
} |
1528 |
curtime = time(NULL) - curtime; |
1529 |
if(!err) printfinfo("finished successfully: elapsed time: %dh %dm %ds", |
1530 |
(int)(curtime / 3600), (int)((curtime / 60) % 60), (int)(curtime % 60)); |
1531 |
return err ? 1 : 0; |
1532 |
} |
1533 |
|
1534 |
|
1535 |
/* perform the purge command */ |
1536 |
static int procpurge(const char *dbname, const char *prefix){ |
1537 |
ESTDB *db; |
1538 |
ESTCOND *cond; |
1539 |
ESTDOC *doc; |
1540 |
const char *luri; |
1541 |
char *attr, *path; |
1542 |
int i, ecode, err, *res, rnum; |
1543 |
time_t curtime; |
1544 |
struct stat sbuf; |
1545 |
curtime = time(NULL); |
1546 |
if(!(db = est_db_open(dbname, ESTDBWRITER, &ecode))){ |
1547 |
printferror("%s: %s", dbname, est_err_msg(ecode)); |
1548 |
return 1; |
1549 |
} |
1550 |
est_db_set_informer(db, dbinform); |
1551 |
cond = est_cond_new(); |
1552 |
attr = cbsprintf("%s STRBW %s", DATTRLPATH, prefix ? prefix : ""); |
1553 |
est_cond_add_attr(cond, attr); |
1554 |
res = est_db_search(db, cond, &rnum, NULL); |
1555 |
err = FALSE; |
1556 |
for(i = 0; i < rnum; i++){ |
1557 |
if(!(doc = est_db_get_doc(db, res[i], ESTGDNOTEXT))) continue; |
1558 |
if((luri = est_doc_attr(doc, DATTRLPATH)) != NULL){ |
1559 |
if(g_doforce){ |
1560 |
if(est_db_out_doc(db, res[i], g_outopts)){ |
1561 |
printfinfo("%d (%s): deleted", res[i], luri); |
1562 |
} else { |
1563 |
printferror("%s: %s", dbname, est_err_msg(est_db_error(db))); |
1564 |
err = TRUE; |
1565 |
} |
1566 |
} else if((path = urltopath(luri)) != NULL){ |
1567 |
if(stat(path, &sbuf) != -1){ |
1568 |
printfinfo("%s: passed", luri); |
1569 |
} else { |
1570 |
if(est_db_out_doc(db, res[i], g_outopts)){ |
1571 |
printfinfo("%d (%s): deleted", res[i], luri); |
1572 |
} else { |
1573 |
printferror("%s: %s", dbname, est_err_msg(est_db_error(db))); |
1574 |
err = TRUE; |
1575 |
} |
1576 |
} |
1577 |
} else { |
1578 |
printfinfo("%s: ignored", luri); |
1579 |
} |
1580 |
} else { |
1581 |
printfinfo("(%d): ignored", res[i]); |
1582 |
} |
1583 |
est_doc_delete(doc); |
1584 |
if(err || g_sigterm) break; |
1585 |
} |
1586 |
free(res); |
1587 |
est_cond_delete(cond); |
1588 |
free(attr); |
1589 |
if(!est_db_close(db, &ecode)){ |
1590 |
printferror("%s: %s", dbname, est_err_msg(ecode)); |
1591 |
return 1; |
1592 |
} |
1593 |
curtime = time(NULL) - curtime; |
1594 |
if(!err) printfinfo("finished successfully: elapsed time: %dh %dm %ds", |
1595 |
(int)(curtime / 3600), (int)((curtime / 60) % 60), (int)(curtime % 60)); |
1596 |
return err ? 1 : 0; |
1597 |
} |
1598 |
|
1599 |
|
1600 |
/* perform the extkeys command */ |
1601 |
static int procextkeys(const char *dbname, const char *prefix, int ni){ |
1602 |
ESTDB *db; |
1603 |
ESTCOND *cond; |
1604 |
ESTDOC *doc; |
1605 |
CURIA *kwdb; |
1606 |
CBMAP *kwords; |
1607 |
const char *uri; |
1608 |
char path[URIBUFSIZ], *attr, *mbuf; |
1609 |
int i, ecode, err, *res, rnum, msiz; |
1610 |
time_t curtime; |
1611 |
curtime = time(NULL); |
1612 |
if(!(db = est_db_open(dbname, ESTDBWRITER, &ecode))){ |
1613 |
printferror("%s: %s", dbname, est_err_msg(ecode)); |
1614 |
return 1; |
1615 |
} |
1616 |
est_db_set_informer(db, dbinform); |
1617 |
if(!ni && (!prefix || prefix[0] == '\0')) est_db_fill_key_cache(db); |
1618 |
sprintf(path, "%s%c%s", dbname, ESTPATHCHR, KWDBNAME); |
1619 |
if(!(kwdb = cropen(path, CR_OWRITER | CR_OCREAT, KWDBBNUM, KWDBDNUM))){ |
1620 |
printferror("%s: the keyword database has some errors", dbname); |
1621 |
est_db_close(db, &ecode); |
1622 |
return 1; |
1623 |
} |
1624 |
crsetalign(kwdb, -4); |
1625 |
cond = est_cond_new(); |
1626 |
attr = cbsprintf("%s STRBW %s", DATTRLPATH, prefix ? prefix : ""); |
1627 |
est_cond_add_attr(cond, attr); |
1628 |
res = est_db_search(db, cond, &rnum, NULL); |
1629 |
err = FALSE; |
1630 |
for(i = 0; i < rnum; i++){ |
1631 |
if(!g_doforce && crvsiz(kwdb, (char *)&(res[i]), sizeof(int)) > 0){ |
1632 |
printfinfo("%d: passed", res[i]); |
1633 |
continue; |
1634 |
} |
1635 |
if(!(doc = est_db_get_doc(db, res[i], 0))) continue; |
1636 |
if(!(uri = est_doc_attr(doc, ESTDATTRURI))) uri = ""; |
1637 |
kwords = est_db_etch_doc(ni ? NULL : db, doc, g_kwordnum); |
1638 |
mbuf = cbmapdump(kwords, &msiz); |
1639 |
fflush(stdout); |
1640 |
if(crput(kwdb, (char *)&(res[i]), sizeof(int), mbuf, msiz, CR_DOVER)){ |
1641 |
printfinfo("%d (%s): extracted", res[i], uri); |
1642 |
} else { |
1643 |
printferror("%s: the keyword database has some errors", dbname); |
1644 |
err = TRUE; |
1645 |
} |
1646 |
free(mbuf); |
1647 |
cbmapclose(kwords); |
1648 |
est_doc_delete(doc); |
1649 |
if(err || g_sigterm) break; |
1650 |
} |
1651 |
free(res); |
1652 |
est_cond_delete(cond); |
1653 |
free(attr); |
1654 |
if(!crclose(kwdb)){ |
1655 |
printferror("%s: the keyword database has some errors", dbname); |
1656 |
err = TRUE; |
1657 |
} |
1658 |
if(!est_db_close(db, &ecode)){ |
1659 |
printferror("%s: %s", dbname, est_err_msg(ecode)); |
1660 |
return 1; |
1661 |
} |
1662 |
curtime = time(NULL) - curtime; |
1663 |
if(!err) printfinfo("finished successfully: elapsed time: %dh %dm %ds", |
1664 |
(int)(curtime / 3600), (int)((curtime / 60) % 60), (int)(curtime % 60)); |
1665 |
return err ? 1 : 0; |
1666 |
} |
1667 |
|
1668 |
|
1669 |
/* perform the draft command */ |
1670 |
static int procdraft(const char *filename){ |
1671 |
ESTDOC *doc; |
1672 |
char *buf, *draft; |
1673 |
int size; |
1674 |
if(!(buf = cbreadfile(filename, &size))){ |
1675 |
printferror("%s: could not open", filename ? filename : "(stdin)"); |
1676 |
return 1; |
1677 |
} |
1678 |
switch(g_filefmt){ |
1679 |
case FF_TEXT: |
1680 |
doc = est_doc_new_from_text(buf, size, g_inputcode, g_inputlang); |
1681 |
break; |
1682 |
case FF_HTML: |
1683 |
doc = est_doc_new_from_html(buf, size, g_inputcode, g_inputlang); |
1684 |
break; |
1685 |
case FF_MIME: |
1686 |
doc = est_doc_new_from_mime(buf, size, g_inputcode, g_inputlang); |
1687 |
break; |
1688 |
default: |
1689 |
doc = est_doc_new_from_draft_enc(buf, size, g_inputcode); |
1690 |
break; |
1691 |
} |
1692 |
draft = est_doc_dump_draft(doc); |
1693 |
printf("%s", draft); |
1694 |
free(draft); |
1695 |
est_doc_delete(doc); |
1696 |
free(buf); |
1697 |
return 0; |
1698 |
} |
1699 |
|
1700 |
|
1701 |
/* perform the break command */ |
1702 |
static int procbreak(const char *filename, int wt){ |
1703 |
CBLIST *words; |
1704 |
char *str, *phrase; |
1705 |
int i; |
1706 |
if(filename && filename[0] == '@'){ |
1707 |
str = cbmemdup(filename + 1, -1); |
1708 |
} else if(!(str = cbreadfile(filename, NULL))){ |
1709 |
printferror("%s: could not open", filename ? filename : "(stdin)"); |
1710 |
return 1; |
1711 |
} |
1712 |
if(!(phrase = est_iconv(str, -1, g_inputcode, "UTF-8", NULL, NULL))){ |
1713 |
printferror("%s: unsupported encoding\n", g_inputcode); |
1714 |
free(str); |
1715 |
return 1; |
1716 |
} |
1717 |
g_inputcode = NULL; |
1718 |
words = cblistopen(); |
1719 |
if(g_oextmodes & ESTDBPERFNG){ |
1720 |
est_break_text_perfng(phrase, words, TRUE, wt); |
1721 |
} else { |
1722 |
est_break_text(phrase, words, TRUE, wt); |
1723 |
} |
1724 |
for(i = 0; i < cblistnum(words); i++){ |
1725 |
printf("%s\n", cblistval(words, i, NULL)); |
1726 |
} |
1727 |
cblistclose(words); |
1728 |
free(phrase); |
1729 |
free(str); |
1730 |
return 0; |
1731 |
} |
1732 |
|
1733 |
|
1734 |
/* perform the randput command */ |
1735 |
static int procrandput(const char *dbname, int dnum){ |
1736 |
ESTDB *db; |
1737 |
ESTDOC *doc; |
1738 |
const char *mode; |
1739 |
char uri[URIBUFSIZ]; |
1740 |
int i, ecode, err; |
1741 |
time_t curtime; |
1742 |
curtime = time(NULL); |
1743 |
if(!(db = est_db_open(dbname, ESTDBWRITER | ESTDBCREAT | ESTDBTRUNC, &ecode))){ |
1744 |
printferror("%s: %s", dbname, est_err_msg(ecode)); |
1745 |
return 1; |
1746 |
} |
1747 |
est_db_set_informer(db, dbinform); |
1748 |
if(g_cachesize > 0){ |
1749 |
if(g_cachesize > CACHEMAX) g_cachesize = CACHEMAX; |
1750 |
est_db_set_cache_size(db, g_cachesize, -1, -1); |
1751 |
} |
1752 |
err = FALSE; |
1753 |
for(i = 0; i < dnum; i++){ |
1754 |
doc = est_doc_new_from_chaos(RDOCCNUM, RDOCSNUM, g_rdmode); |
1755 |
sprintf(uri, "file:///tmp/randput-%08d-%05d.est", i + 1, getpid()); |
1756 |
est_doc_add_attr(doc, ESTDATTRURI, uri); |
1757 |
if(est_db_put_doc(db, doc, 0)){ |
1758 |
if(!(mode = est_doc_attr(doc, "mode"))) mode = "unknown"; |
1759 |
printfinfo("%d (%s) (%s): registered", est_doc_id(doc), uri, mode); |
1760 |
} else { |
1761 |
printferror("%s: %s", dbname, est_err_msg(est_db_error(db))); |
1762 |
err = TRUE; |
1763 |
} |
1764 |
est_doc_delete(doc); |
1765 |
if(err || g_sigterm) break; |
1766 |
} |
1767 |
if(!est_db_close(db, &ecode)){ |
1768 |
printferror("%s: %s", dbname, est_err_msg(ecode)); |
1769 |
return 1; |
1770 |
} |
1771 |
curtime = time(NULL) - curtime; |
1772 |
if(!err) printfinfo("finished successfully: elapsed time: %dh %dm %ds", |
1773 |
(int)(curtime / 3600), (int)((curtime / 60) % 60), (int)(curtime % 60)); |
1774 |
return err ? 1 : 0; |
1775 |
} |
1776 |
|
1777 |
|
1778 |
/* perform the wicked command */ |
1779 |
static int procwicked(const char *dbname, int dnum){ |
1780 |
ESTDB *db; |
1781 |
ESTDOC *doc; |
1782 |
ESTCOND *cond; |
1783 |
CBLIST *words; |
1784 |
char uri[URIBUFSIZ], *oper, *value, *first, *second, *phrase; |
1785 |
int i, j, ecode, err, *res, rnum; |
1786 |
double rnd; |
1787 |
time_t curtime; |
1788 |
curtime = time(NULL); |
1789 |
if(!(db = est_db_open(dbname, ESTDBWRITER | ESTDBCREAT | ESTDBTRUNC, &ecode))){ |
1790 |
printferror("%s: %s", dbname, est_err_msg(ecode)); |
1791 |
return 1; |
1792 |
} |
1793 |
est_db_set_informer(db, dbinform); |
1794 |
est_db_set_cache_size(db, 1024 * 1024 * 128, 1024, 256); |
1795 |
est_db_set_special_cache(db, ESTDATTRURI, 128); |
1796 |
err = FALSE; |
1797 |
for(i = 0; i < dnum; i++){ |
1798 |
rnd = est_random(); |
1799 |
if((int)(rnd * INT_MAX) % dnum < 5){ |
1800 |
rnd = est_random(); |
1801 |
if(rnd < 0.3){ |
1802 |
if(!est_db_close(db, &ecode)){ |
1803 |
printferror("%s: %s", dbname, est_err_msg(ecode)); |
1804 |
return 1; |
1805 |
} |
1806 |
if(!(db = est_db_open(dbname, ESTDBWRITER, &ecode))){ |
1807 |
printferror("%s: %s", dbname, est_err_msg(ecode)); |
1808 |
return 1; |
1809 |
} |
1810 |
est_db_set_informer(db, dbinform); |
1811 |
est_db_set_cache_size(db, 1024 * 1024 * 128, 1024, 256); |
1812 |
est_db_set_special_cache(db, ESTDATTRURI, i / 10 + 1); |
1813 |
} else if(rnd < 0.5){ |
1814 |
if(!est_db_optimize(db, (int)(est_random() * INT_MAX) % 2 == 0) ? ESTOPTNOPURGE : 0) |
1815 |
err = TRUE; |
1816 |
} else if(rnd < 0.8){ |
1817 |
if(!est_db_flush(db, 1024)) err = TRUE; |
1818 |
} else { |
1819 |
if(!est_db_sync(db)) err = TRUE; |
1820 |
} |
1821 |
} else if(rnd < 0.05){ |
1822 |
if(est_db_out_doc(db, (int)(est_random() * INT_MAX) % (i + 1) + 1, |
1823 |
((int)(est_random() * INT_MAX) % 2 == 0) ? ESTODCLEAN : 0)){ |
1824 |
printfinfo("[%d:%d]: out", i + 1, est_db_doc_num(db)); |
1825 |
} else if(est_db_error(db) != ESTENOITEM){ |
1826 |
err = TRUE; |
1827 |
} |
1828 |
} else if(rnd < 0.1){ |
1829 |
if((value = est_db_get_doc_attr(db, (int)(est_random() * INT_MAX) % (i + 1) + 1, |
1830 |
ESTDATTRURI)) != NULL){ |
1831 |
printfinfo("[%d:%d]: attr: %s", i + 1, est_db_doc_num(db), value); |
1832 |
free(value); |
1833 |
} |
1834 |
} else if(rnd < 0.25){ |
1835 |
rnd = est_random(); |
1836 |
if(rnd < 0.5){ |
1837 |
oper = " OR "; |
1838 |
} else if(rnd < 0.7){ |
1839 |
oper = " AND "; |
1840 |
} else if(rnd < 0.8){ |
1841 |
oper = " NOTAND "; |
1842 |
} else if(rnd < 0.9){ |
1843 |
oper = " "; |
1844 |
} else { |
1845 |
oper = ""; |
1846 |
} |
1847 |
first = est_random_str(5, (int)(est_random() * INT_MAX) % RD_RAND); |
1848 |
second = est_random_str(2, (int)(est_random() * INT_MAX) % RD_RAND); |
1849 |
phrase = cbsprintf("%s%s%s", first, oper, second); |
1850 |
cond = est_cond_new(); |
1851 |
est_cond_set_phrase(cond, phrase); |
1852 |
if(est_random() < 0.25) est_cond_add_attr(cond, "@uri STREW 0.est"); |
1853 |
if(est_random() < 0.25) est_cond_set_order(cond, "@uri STRD"); |
1854 |
if(est_random() < 0.05) est_cond_set_options(cond, ESTCONDSURE | ESTCONDSCFB); |
1855 |
if(est_random() < 0.05) est_cond_set_options(cond, ESTCONDAGIT | ESTCONDNOIDF); |
1856 |
res = est_db_search(db, cond, &rnum, NULL); |
1857 |
printfinfo("[%d:%d]: search: %d hits", i + 1, est_db_doc_num(db), rnum); |
1858 |
if(est_random() < 0.05){ |
1859 |
for(j = 0; j < rnum && j < 100; j++){ |
1860 |
if((doc = est_db_get_doc(db, res[j], 0)) != NULL){ |
1861 |
if(i % 10 == 0){ |
1862 |
free(est_doc_cat_texts(doc)); |
1863 |
free(est_doc_dump_draft(doc)); |
1864 |
words = cblistopen(); |
1865 |
cblistpush(words, "vw", -1); |
1866 |
cblistpush(words, "xy", -1); |
1867 |
cblistpush(words, "z", -1); |
1868 |
free(est_doc_make_snippet(doc, words, 100, 10, 10)); |
1869 |
cblistclose(words); |
1870 |
} |
1871 |
est_doc_delete(doc); |
1872 |
} else if(est_db_error(db) != ESTENOITEM){ |
1873 |
err = TRUE; |
1874 |
} |
1875 |
} |
1876 |
} |
1877 |
free(res); |
1878 |
est_cond_delete(cond); |
1879 |
free(phrase); |
1880 |
free(first); |
1881 |
free(second); |
1882 |
} else { |
1883 |
doc = est_doc_new_from_chaos(100, 3, est_random() < 0.5 ? RD_EURO : RD_RAND); |
1884 |
if(est_random() < 0.2){ |
1885 |
sprintf(uri, "file:///tmp/wicked-%08d-%05d.est", |
1886 |
(int)(est_random() * INT_MAX) % (i + 1) + 1, getpid()); |
1887 |
} else { |
1888 |
sprintf(uri, "file:///tmp/wicked-%08d-%05d.est", i + 1, getpid()); |
1889 |
} |
1890 |
est_doc_add_attr(doc, ESTDATTRURI, uri); |
1891 |
if(!est_db_put_doc(db, doc, est_random() < 0.5 ? ESTPDCLEAN : 0)) err = TRUE; |
1892 |
est_doc_delete(doc); |
1893 |
} |
1894 |
if(err || g_sigterm) break; |
1895 |
} |
1896 |
if(err) printferror("%s: %s", dbname, est_err_msg(est_db_error(db))); |
1897 |
if(!est_db_close(db, &ecode)){ |
1898 |
printferror("%s: %s", dbname, est_err_msg(ecode)); |
1899 |
return 1; |
1900 |
} |
1901 |
curtime = time(NULL) - curtime; |
1902 |
if(!err) printfinfo("finished successfully: elapsed time: %dh %dm %ds", |
1903 |
(int)(curtime / 3600), (int)((curtime / 60) % 60), (int)(curtime % 60)); |
1904 |
return err ? 1 : 0; |
1905 |
} |
1906 |
|
1907 |
|
1908 |
/* perform the regression command */ |
1909 |
static int procregression(const char *dbname){ |
1910 |
ESTDB *db; |
1911 |
ESTDOC *doc; |
1912 |
ESTCOND *cond; |
1913 |
int i, ecode, err, *res, rnum; |
1914 |
time_t curtime; |
1915 |
curtime = time(NULL); |
1916 |
printfinfo("# opening the database"); |
1917 |
if(!(db = est_db_open(dbname, ESTDBWRITER | ESTDBCREAT | ESTDBTRUNC, &ecode))){ |
1918 |
printferror("%s: %s", dbname, est_err_msg(ecode)); |
1919 |
return 1; |
1920 |
} |
1921 |
est_db_set_informer(db, dbinform); |
1922 |
err = FALSE; |
1923 |
if(!err){ |
1924 |
printfinfo("# checking registration of small documents"); |
1925 |
doc = est_doc_new(); |
1926 |
est_doc_add_attr(doc, ESTDATTRURI, "file:///small/one"); |
1927 |
est_doc_add_text(doc, "One!"); |
1928 |
est_doc_add_hidden_text(doc, "(Check it out, come on!)"); |
1929 |
if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE; |
1930 |
est_doc_delete(doc); |
1931 |
doc = est_doc_new(); |
1932 |
est_doc_add_attr(doc, ESTDATTRURI, "file:///small/two"); |
1933 |
est_doc_add_text(doc, "Two!!"); |
1934 |
est_doc_add_hidden_text(doc, "(Check it out, come on!)"); |
1935 |
if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE; |
1936 |
est_doc_delete(doc); |
1937 |
doc = est_doc_new(); |
1938 |
est_doc_add_attr(doc, ESTDATTRURI, "file:///small/three"); |
1939 |
est_doc_add_text(doc, "Three!!!"); |
1940 |
est_doc_add_hidden_text(doc, "(Check it out, come on!)"); |
1941 |
if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE; |
1942 |
est_doc_delete(doc); |
1943 |
doc = est_doc_new(); |
1944 |
est_doc_add_attr(doc, ESTDATTRURI, "file:///empty"); |
1945 |
if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE; |
1946 |
est_doc_delete(doc); |
1947 |
} |
1948 |
if(!err){ |
1949 |
printfinfo("# checking registration of an english document"); |
1950 |
doc = est_doc_new(); |
1951 |
est_doc_add_attr(doc, ESTDATTRURI, "file:///english"); |
1952 |
est_doc_add_attr(doc, ESTDATTRTITLE, "Hyper Estraier"); |
1953 |
est_doc_add_text(doc, "% This is a displayed sentence. ;-)"); |
1954 |
est_doc_add_text(doc, "Hyper Estraier is a full-text search system for communities."); |
1955 |
est_doc_add_text(doc, "A little suffering is good for the soul."); |
1956 |
est_doc_add_text(doc, "They have been at a great feast of languages, and stolen the scraps."); |
1957 |
est_doc_add_hidden_text(doc, "(Give it up, Yo! Give it up, Yo!)"); |
1958 |
est_doc_add_hidden_text(doc, "% This is a hidden sentence. :-<"); |
1959 |
est_doc_add_hidden_text(doc, "(Check it out, come on!)"); |
1960 |
est_doc_add_hidden_text(doc, ""); |
1961 |
if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE; |
1962 |
est_doc_delete(doc); |
1963 |
} |
1964 |
if(!err){ |
1965 |
printfinfo("# checking registration of a japanese document"); |
1966 |
doc = est_doc_new(); |
1967 |
est_doc_add_attr(doc, ESTDATTRURI, "file:///japanese"); |
1968 |
est_doc_add_attr(doc, ESTDATTRTITLE, "\xe5\xb9\xb3\xe6\x9e\x97\xe5\xb9\xb9\xe9\x9b\x84"); |
1969 |
est_doc_add_text(doc, "\xe6\x9c\xac\xe6\x97\xa5\xe3\x81\xaf\xe6\x99\xb4\xe5\xa4\xa9\xe3" |
1970 |
"\x81\xaa\xe3\x82\x8a\xe3\x80\x82"); |
1971 |
est_doc_add_text(doc, "\xe6\x9c\x95\xe3\x81\xaf\xe5\x9b\xbd\xe5\xae\xb6\xe7\xac\xac\xe4" |
1972 |
"\xb8\x80\xe3\x81\xae\xe4\xb8\x8b\xe5\x83\x95\xe3\x81\xa7\xe3\x81" |
1973 |
"\x82\xe3\x82\x8b\xe3\x80\x82"); |
1974 |
est_doc_add_hidden_text(doc, "(Check it out, come on!)"); |
1975 |
if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE; |
1976 |
est_doc_delete(doc); |
1977 |
} |
1978 |
if(!err){ |
1979 |
printfinfo("# checking duplication of documents"); |
1980 |
doc = est_doc_new(); |
1981 |
est_doc_add_attr(doc, ESTDATTRURI, "file:///duplication"); |
1982 |
est_doc_add_text(doc, "Gamble, you gatta chance to make a Rumble!"); |
1983 |
est_doc_add_hidden_text(doc, "(Check it out, come on!)"); |
1984 |
if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE; |
1985 |
est_doc_delete(doc); |
1986 |
doc = est_doc_new(); |
1987 |
est_doc_add_attr(doc, ESTDATTRURI, "file:///duplication"); |
1988 |
est_doc_add_text(doc, "bring back hey, one more time!"); |
1989 |
est_doc_add_hidden_text(doc, "(Check it out, come on!)"); |
1990 |
if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE; |
1991 |
est_doc_delete(doc); |
1992 |
if(est_db_doc_num(db) != 7){ |
1993 |
printferror("%s: the number of documents is invalid", dbname); |
1994 |
err = TRUE; |
1995 |
} |
1996 |
} |
1997 |
if(!err){ |
1998 |
printfinfo("# checking search for unfixed documents"); |
1999 |
cond = est_cond_new(); |
2000 |
est_cond_set_phrase(cond, "check"); |
2001 |
res = est_db_search(db, cond, &rnum, NULL); |
2002 |
if(rnum != 6){ |
2003 |
printferror("%s: the number of result is invalid", dbname); |
2004 |
err = TRUE; |
2005 |
} |
2006 |
free(res); |
2007 |
est_cond_delete(cond); |
2008 |
} |
2009 |
if(!err){ |
2010 |
printfinfo("# checking partial flushing of the index"); |
2011 |
if(!est_db_flush(db, 32)) err = TRUE; |
2012 |
} |
2013 |
if(!err){ |
2014 |
printfinfo("# checking deletion with cleaning of a document"); |
2015 |
if(!est_db_out_doc(db, 1, ESTODCLEAN)) err = TRUE; |
2016 |
} |
2017 |
if(!err){ |
2018 |
printfinfo("# checking synchronization"); |
2019 |
if(!est_db_sync(db)) err = TRUE; |
2020 |
} |
2021 |
if(!err){ |
2022 |
printfinfo("# checking deletion without cleaning of a document"); |
2023 |
if(!est_db_out_doc(db, 2, 0)) err = TRUE; |
2024 |
} |
2025 |
if(!err){ |
2026 |
printfinfo("# checking word search"); |
2027 |
cond = est_cond_new(); |
2028 |
est_cond_set_phrase(cond, "check it AND on"); |
2029 |
res = est_db_search(db, cond, &rnum, NULL); |
2030 |
if(rnum != 5){ |
2031 |
printferror("%s: the number of result is invalid", dbname); |
2032 |
err = TRUE; |
2033 |
} |
2034 |
free(res); |
2035 |
est_cond_set_phrase(cond, "RUMBLE OR \xe3\x80\x82"); |
2036 |
res = est_db_search(db, cond, &rnum, NULL); |
2037 |
if(rnum != 1){ |
2038 |
printferror("%s: the number of result is invalid", dbname); |
2039 |
err = TRUE; |
2040 |
} |
2041 |
free(res); |
2042 |
est_cond_delete(cond); |
2043 |
} |
2044 |
if(!err){ |
2045 |
printfinfo("# checking attribute search"); |
2046 |
cond = est_cond_new(); |
2047 |
est_cond_add_attr(cond, "@uri !ISTRINC SMaLl"); |
2048 |
res = est_db_search(db, cond, &rnum, NULL); |
2049 |
if(rnum != est_db_doc_num(db) - 1){ |
2050 |
printferror("%s: the number of result is invalid", dbname); |
2051 |
err = TRUE; |
2052 |
} |
2053 |
free(res); |
2054 |
est_cond_delete(cond); |
2055 |
cond = est_cond_new(); |
2056 |
est_cond_add_attr(cond, "@uri STRBW file://"); |
2057 |
est_cond_add_attr(cond, "@title STRINC \xe5\xb9\xb3"); |
2058 |
res = est_db_search(db, cond, &rnum, NULL); |
2059 |
if(rnum != 1){ |
2060 |
printferror("%s: the number of result is invalid", dbname); |
2061 |
err = TRUE; |
2062 |
} |
2063 |
free(res); |
2064 |
est_cond_delete(cond); |
2065 |
} |
2066 |
if(!err){ |
2067 |
printfinfo("# checking combined search"); |
2068 |
cond = est_cond_new(); |
2069 |
est_cond_set_phrase(cond, "\xe5\x9b\xbd\xe5\xae\xb6\xe7\xac\xac\xe4\xb8\x80"); |
2070 |
est_cond_add_attr(cond, "@uri"); |
2071 |
est_cond_set_order(cond, "@title"); |
2072 |
res = est_db_search(db, cond, &rnum, NULL); |
2073 |
if(rnum != 1){ |
2074 |
printferror("%s: the number of result is invalid", dbname); |
2075 |
err = TRUE; |
2076 |
} |
2077 |
free(res); |
2078 |
est_cond_delete(cond); |
2079 |
cond = est_cond_new(); |
2080 |
est_cond_set_phrase(cond, "one | \xe3\x80\x82 | check & check it ! hogehoge"); |
2081 |
est_cond_add_attr(cond, "@uri STRBW file://"); |
2082 |
est_cond_set_order(cond, "@title STRD"); |
2083 |
est_cond_set_options(cond, ESTCONDSURE | ESTCONDNOIDF | ESTCONDSIMPLE); |
2084 |
res = est_db_search(db, cond, &rnum, NULL); |
2085 |
if(rnum != 4){ |
2086 |
printferror("%s: the number of result is invalid", dbname); |
2087 |
err = TRUE; |
2088 |
} |
2089 |
free(res); |
2090 |
est_cond_delete(cond); |
2091 |
} |
2092 |
if(!err){ |
2093 |
printfinfo("# checking optimization"); |
2094 |
if(!est_db_optimize(db, 0)) err = TRUE; |
2095 |
cond = est_cond_new(); |
2096 |
est_cond_set_phrase(cond, "check"); |
2097 |
res = est_db_search(db, cond, &rnum, NULL); |
2098 |
if(rnum != 4){ |
2099 |
printferror("%s: the number of result is invalid", dbname); |
2100 |
err = TRUE; |
2101 |
} |
2102 |
free(res); |
2103 |
est_cond_delete(cond); |
2104 |
} |
2105 |
if(!err){ |
2106 |
printfinfo("# checking traversal access"); |
2107 |
cond = est_cond_new(); |
2108 |
est_cond_set_phrase(cond, "[UVSET]"); |
2109 |
res = est_db_search(db, cond, &rnum, NULL); |
2110 |
for(i = 0; i < rnum; i++){ |
2111 |
if(!(doc = est_db_get_doc(db, res[i], 0))){ |
2112 |
printferror("%s: a document cannot be retrieved", dbname); |
2113 |
err = TRUE; |
2114 |
break; |
2115 |
} |
2116 |
est_doc_delete(doc); |
2117 |
} |
2118 |
free(res); |
2119 |
est_cond_delete(cond); |
2120 |
} |
2121 |
if(err) printferror("%s: %s", dbname, est_err_msg(est_db_error(db))); |
2122 |
printfinfo("# closing the database"); |
2123 |
if(!est_db_close(db, &ecode)){ |
2124 |
printferror("%s: %s", dbname, est_err_msg(ecode)); |
2125 |
return 1; |
2126 |
} |
2127 |
curtime = time(NULL) - curtime; |
2128 |
if(!err) printfinfo("# finished successfully: elapsed time: %dh %dm %ds", |
2129 |
(int)(curtime / 3600), (int)((curtime / 60) % 60), (int)(curtime % 60)); |
2130 |
return err ? 1 : 0; |
2131 |
} |
2132 |
|
2133 |
|
2134 |
/* output escaped string
   `format' specifies a printf-like format string and is written to the standard output.
   The conversions `s', `d', `o', `u', `x', `X', `c', `e', `E', `f', `g' and `G' are passed to
   printf together with their flag characters (digits, space, `.', `+' and `-').  `%@' prints
   a string with `&', `<', `>' and `"' replaced by XML entities and with control characters
   0x01-0x08 and 0x0e-0x1f dropped.  `%?' prints a string URL-encoded, keeping alphanumerics
   and `_', `-', `.'.  `%%' prints a percent sign.  A NULL string is printed as "(null)".
   Any other conversion character is skipped without consuming an argument. */
static void xmlprintf(const char *format, ...){
  va_list ap;
  char *tmp, cbuf[32];
  unsigned char c;
  int cblen;
  va_start(ap, format);
  while(*format != '\0'){
    if(*format == '%'){
      cbuf[0] = '%';
      cblen = 1;
      format++;
      /* two bytes stay reserved for the conversion character and the terminator */
      while(*format != '\0' && strchr("0123456789 .+-", *format) &&
            cblen < (int)sizeof(cbuf) - 2){
        cbuf[cblen++] = *format;
        format++;
      }
      /* a directive cut off by the end of the format prints nothing; stopping here also
         keeps the scan from stepping past the terminator */
      if(*format == '\0') break;
      cbuf[cblen++] = *format;
      cbuf[cblen] = '\0';
      switch(*format){
      case 's':
        tmp = va_arg(ap, char *);
        if(!tmp) tmp = "(null)";
        printf(cbuf, tmp);
        break;
      case 'd':
        printf(cbuf, va_arg(ap, int));
        break;
      case 'o': case 'u': case 'x': case 'X': case 'c':
        printf(cbuf, va_arg(ap, unsigned int));
        break;
      case 'e': case 'E': case 'f': case 'g': case 'G':
        printf(cbuf, va_arg(ap, double));
        break;
      case '@':
        tmp = va_arg(ap, char *);
        if(!tmp) tmp = "(null)";
        while(*tmp){
          c = *(unsigned char *)tmp;
          switch(c){
          case '&': printf("&amp;"); break;
          case '<': printf("&lt;"); break;
          case '>': printf("&gt;"); break;
          case '"': printf("&quot;"); break;
          default:
            /* tab, line feed, 0x0b, 0x0c, carriage return and bytes >= 0x20 pass through */
            if(!(c <= 0x08 || (c >= 0x0e && c <= 0x1f))) putchar(c);
            break;
          }
          tmp++;
        }
        break;
      case '?':
        tmp = va_arg(ap, char *);
        if(!tmp) tmp = "(null)";
        while(*tmp){
          c = *(unsigned char *)tmp;
          if((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') ||
             (c >= '0' && c <= '9') || (c != '\0' && strchr("_-.", c))){
            putchar(c);
          } else {
            printf("%%%02X", c);
          }
          tmp++;
        }
        break;
      case '%':
        putchar('%');
        break;
      default:
        break;
      }
    } else {
      putchar(*format);
    }
    format++;
  }
  va_end(ap);
}
2208 |
|
2209 |
|
2210 |
/* get the language value */ |
2211 |
static int strtolang(const char *str){ |
2212 |
if(!cbstricmp(str, "en")) return ESTLANGEN; |
2213 |
if(!cbstricmp(str, "ja")) return ESTLANGJA; |
2214 |
if(!cbstricmp(str, "zh")) return ESTLANGZH; |
2215 |
if(!cbstricmp(str, "ko")) return ESTLANGKO; |
2216 |
return ESTLANGMISC; |
2217 |
} |
2218 |
|
2219 |
|
2220 |
/* read a line
   `ifp' specifies the input stream.
   Reading stops after a line feed, after a NUL byte, or at the end of the stream.  Carriage
   returns are dropped and the line feed is not stored.
   The return value is the line in a region allocated with `cbrealloc', to be released with
   `free', or NULL if the end of the stream was reached before any byte was read. */
static char *fgetl(FILE *ifp){
  char *buf;
  int c, len, blen;
  buf = NULL;
  len = 0;
  blen = 0;
  while((c = fgetc(ifp)) != EOF){
    /* allocate on the first byte and double the capacity only when it is exhausted,
       instead of reallocating for every byte read */
    if(!buf || blen <= len){
      blen = buf ? blen * 2 : 1024;
      buf = cbrealloc(buf, blen + 1);
    }
    if(c == '\n') c = '\0';
    if(c != '\r') buf[len++] = c;
    if(c == '\0') break;
  }
  if(!buf) return NULL;
  /* the region holds `blen + 1' bytes and `len' never exceeds `blen' */
  buf[len] = '\0';
  return buf;
}
2238 |
|
2239 |
|
2240 |
/* register a document */ |
2241 |
static int doputdoc(ESTDB *db, const char *path){ |
2242 |
ESTDOC *doc, *edoc; |
2243 |
const char *uri, *vbuf, *xcmd; |
2244 |
char *dbuf, *tbuf; |
2245 |
int err, fmt, id, dsiz; |
2246 |
time_t emdate, fmdate; |
2247 |
struct stat sbuf; |
2248 |
xcmd = NULL; |
2249 |
if(cbmaprnum(g_xcmdmap) > 0){ |
2250 |
cbmapiterinit(g_xcmdmap); |
2251 |
while((vbuf = cbmapiternext(g_xcmdmap, NULL)) != NULL){ |
2252 |
if(cbstrbwimatch(path, vbuf)){ |
2253 |
xcmd = cbmapget(g_xcmdmap, vbuf, -1, NULL); |
2254 |
break; |
2255 |
} |
2256 |
} |
2257 |
} |
2258 |
fmt = g_filefmt; |
2259 |
if(g_filefmt == FF_NONE && !xcmd) return TRUE; |
2260 |
if(g_filefmt == FF_AUTO){ |
2261 |
if(cbstrbwimatch(path, ESTEXTSTR "est")){ |
2262 |
fmt = FF_DRAFT; |
2263 |
} else if(cbstrbwimatch(path, ESTEXTSTR "txt") || cbstrbwimatch(path, ESTEXTSTR "text") || |
2264 |
cbstrbwimatch(path, ESTEXTSTR "asc")){ |
2265 |
fmt = FF_TEXT; |
2266 |
} else if(cbstrbwimatch(path, ESTEXTSTR "html") || cbstrbwimatch(path, ESTEXTSTR "htm") || |
2267 |
cbstrbwimatch(path, ESTEXTSTR "xhtml") || cbstrbwimatch(path, ESTEXTSTR "xht")){ |
2268 |
fmt = FF_HTML; |
2269 |
} else if(cbstrbwimatch(path, ESTEXTSTR "eml") || cbstrbwimatch(path, ESTEXTSTR "mime") || |
2270 |
cbstrbwimatch(path, ESTEXTSTR "mht") || cbstrbwimatch(path, ESTEXTSTR "mhtml")){ |
2271 |
fmt = FF_MIME; |
2272 |
} else if(!xcmd){ |
2273 |
return TRUE; |
2274 |
} |
2275 |
} |
2276 |
if(stat(path, &sbuf) == -1 || !S_ISREG(sbuf.st_mode) || !(uri = pathtourl(path))){ |
2277 |
printferror("%s: could not open", path); |
2278 |
return TRUE; |
2279 |
} |
2280 |
emdate = -1; |
2281 |
if(g_chkmdate && (id = est_db_uri_to_id(db, uri)) > 0 && |
2282 |
(edoc = est_db_get_doc(db, id, ESTGDNOTEXT)) != NULL){ |
2283 |
if((vbuf = est_doc_attr(edoc, ESTDATTRMDATE)) != NULL) emdate = cbstrmktime(vbuf); |
2284 |
est_doc_delete(edoc); |
2285 |
} |
2286 |
if(g_stdate && emdate >= 0 && emdate >= sbuf.st_mtime){ |
2287 |
printfinfo("%s: passed", path); |
2288 |
return TRUE; |
2289 |
} |
2290 |
if(g_filtorig){ |
2291 |
dbuf = cbmemdup("", 0); |
2292 |
dsiz = 0; |
2293 |
} else { |
2294 |
if(!(dbuf = cbreadfile(path, &dsiz))){ |
2295 |
printferror("%s: could not open", path); |
2296 |
return TRUE; |
2297 |
} |
2298 |
} |
2299 |
if(xcmd){ |
2300 |
doc = est_doc_new_with_xcmd(dbuf, dsiz, path, xcmd, est_db_name(db), |
2301 |
g_inputcode, g_inputlang); |
2302 |
} else { |
2303 |
switch(fmt){ |
2304 |
case FF_TEXT: |
2305 |
doc = est_doc_new_from_text(dbuf, dsiz, g_inputcode, g_inputlang); |
2306 |
break; |
2307 |
case FF_HTML: |
2308 |
doc = est_doc_new_from_html(dbuf, dsiz, g_inputcode, g_inputlang); |
2309 |
break; |
2310 |
case FF_MIME: |
2311 |
doc = est_doc_new_from_mime(dbuf, dsiz, g_inputcode, g_inputlang); |
2312 |
break; |
2313 |
default: |
2314 |
doc = est_doc_new_from_draft_enc(dbuf, dsiz, g_inputcode); |
2315 |
break; |
2316 |
} |
2317 |
} |
2318 |
if(!est_doc_attr(doc, ESTDATTRURI)) est_doc_add_attr(doc, ESTDATTRURI, uri); |
2319 |
est_doc_add_attr(doc, DATTRLPATH, uri); |
2320 |
est_doc_add_attr(doc, DATTRLFILE, urltofile(uri)); |
2321 |
uri = est_doc_attr(doc, ESTDATTRURI); |
2322 |
if(g_stdate){ |
2323 |
tbuf = cbdatestrwww(sbuf.st_ctime, 0); |
2324 |
est_doc_add_attr(doc, ESTDATTRCDATE, tbuf); |
2325 |
free(tbuf); |
2326 |
tbuf = cbdatestrwww(sbuf.st_mtime, 0); |
2327 |
est_doc_add_attr(doc, ESTDATTRMDATE, tbuf); |
2328 |
free(tbuf); |
2329 |
} |
2330 |
if(g_chkmdate && emdate == -1 && (id = est_db_uri_to_id(db, uri)) > 0 && |
2331 |
(edoc = est_db_get_doc(db, id, ESTGDNOTEXT)) != NULL){ |
2332 |
if((vbuf = est_doc_attr(edoc, ESTDATTRMDATE)) != NULL) emdate = cbstrmktime(vbuf); |
2333 |
est_doc_delete(edoc); |
2334 |
} |
2335 |
fmdate = -1; |
2336 |
if(g_chkmdate && (vbuf = est_doc_attr(doc, ESTDATTRMDATE)) != NULL) fmdate = cbstrmktime(vbuf); |
2337 |
err = FALSE; |
2338 |
if(emdate >= 0 && emdate >= fmdate){ |
2339 |
printfinfo("%s: passed", path); |
2340 |
} else if(est_db_put_doc(db, doc, g_putopts)){ |
2341 |
printfinfo("%d (%s): registered", est_doc_id(doc), uri); |
2342 |
} else { |
2343 |
printferror("%s: %s", est_db_name(db), est_err_msg(est_db_error(db))); |
2344 |
err = TRUE; |
2345 |
} |
2346 |
est_doc_delete(doc); |
2347 |
free(dbuf); |
2348 |
return err ? FALSE : TRUE; |
2349 |
} |
2350 |
|
2351 |
|
2352 |
/* get the URL of a path */ |
2353 |
static const char *pathtourl(const char *path){ |
2354 |
static char pbuf[URIBUFSIZ]; |
2355 |
const char *elem; |
2356 |
char *wp, *ebuf; |
2357 |
CBLIST *list; |
2358 |
int i, esiz; |
2359 |
if(strlen(path) >= URIBUFSIZ / 4) return NULL; |
2360 |
if(g_pathcode){ |
2361 |
wp = est_realpath(path); |
2362 |
if(!(ebuf = est_iconv(wp, -1, g_pathcode, "UTF-8", &esiz, NULL))){ |
2363 |
esiz = strlen(wp); |
2364 |
ebuf = cbmemdup(wp, esiz); |
2365 |
} |
2366 |
list = cbsplit(ebuf, esiz, ESTPATHSTR); |
2367 |
free(ebuf); |
2368 |
free(wp); |
2369 |
for(i = 0; i < cblistnum(list); i++){ |
2370 |
elem = cblistval(list, i, &esiz); |
2371 |
if((ebuf = est_iconv(elem, esiz, "UTF-8", g_pathcode, &esiz, NULL)) != NULL){ |
2372 |
cblistover(list, i, ebuf, esiz); |
2373 |
free(ebuf); |
2374 |
} |
2375 |
} |
2376 |
} else { |
2377 |
wp = est_realpath(path); |
2378 |
list = cbsplit(wp, -1, ESTPATHSTR); |
2379 |
free(wp); |
2380 |
} |
2381 |
wp = pbuf; |
2382 |
wp += sprintf(wp, "file://"); |
2383 |
for(i = 0; i < cblistnum(list); i++){ |
2384 |
elem = cblistval(list, i, NULL); |
2385 |
if(elem[0] == '\0') continue; |
2386 |
if(i < 1 && ((elem[0] >= 'A' && elem[0] <= 'Z') || (elem[0] >= 'a' && elem[0] <= 'z')) && |
2387 |
elem[1] == ':'){ |
2388 |
wp += sprintf(wp, "%c|", elem[0]); |
2389 |
continue; |
2390 |
} |
2391 |
ebuf = cburlencode(elem, -1); |
2392 |
wp += sprintf(wp, "/%s", ebuf); |
2393 |
free(ebuf); |
2394 |
} |
2395 |
*wp = '\0'; |
2396 |
cblistclose(list); |
2397 |
return pbuf; |
2398 |
} |
2399 |
|
2400 |
|
2401 |
/* get the file name of a URL */ |
2402 |
static const char *urltofile(const char *uri){ |
2403 |
static char pbuf[URIBUFSIZ]; |
2404 |
const char *rp; |
2405 |
char *dbuf, *ebuf; |
2406 |
int dsiz; |
2407 |
if(g_pathfull){ |
2408 |
if((rp = strstr(uri, "//")) != NULL){ |
2409 |
rp += 2; |
2410 |
if(((rp[0] >= 'A' && rp[0] <= 'Z') || (rp[0] >= 'a' && rp[0] <= 'z')) && |
2411 |
rp[1] == '|' && rp[2] == '/') rp += 2; |
2412 |
} else { |
2413 |
rp = uri; |
2414 |
} |
2415 |
} else if((rp = strrchr(uri, '/')) != NULL){ |
2416 |
rp++; |
2417 |
} else { |
2418 |
rp = uri; |
2419 |
} |
2420 |
dbuf = cburldecode(rp, &dsiz); |
2421 |
if((ebuf = est_iconv(dbuf, dsiz, g_pathcode ? g_pathcode : "ISO-8859-1", "UTF-8", NULL, NULL)) |
2422 |
!= NULL){ |
2423 |
sprintf(pbuf, "%s", ebuf); |
2424 |
free(ebuf); |
2425 |
} else { |
2426 |
sprintf(pbuf, "%s", rp); |
2427 |
} |
2428 |
free(dbuf); |
2429 |
return pbuf; |
2430 |
} |
2431 |
|
2432 |
|
2433 |
/* geth the local path of a URL */ |
2434 |
static char *urltopath(const char *uri){ |
2435 |
static char pbuf[URIBUFSIZ]; |
2436 |
const char *elem; |
2437 |
char *wp, *dbuf; |
2438 |
CBLIST *list; |
2439 |
int i; |
2440 |
if(!cbstrfwimatch(uri, "file://")) return NULL; |
2441 |
if(!(uri = strchr(uri + 7, '/'))) return NULL; |
2442 |
list = cbsplit(uri, -1, "/"); |
2443 |
wp = pbuf; |
2444 |
for(i = 0; i < cblistnum(list); i++){ |
2445 |
elem = cblistval(list, i, NULL); |
2446 |
if(elem[0] == '\0') continue; |
2447 |
if(i < 1 && ((elem[0] >= 'A' && elem[0] <= 'Z') || (elem[0] >= 'a' && elem[0] <= 'z')) && |
2448 |
elem[1] == '|'){ |
2449 |
wp += sprintf(wp, "%c:", elem[0]); |
2450 |
continue; |
2451 |
} |
2452 |
dbuf = cburldecode(elem, NULL); |
2453 |
wp += sprintf(wp, "%c%s", ESTPATHCHR, dbuf); |
2454 |
free(dbuf); |
2455 |
} |
2456 |
*wp = '\0'; |
2457 |
cblistclose(list); |
2458 |
return pbuf; |
2459 |
} |
2460 |
|
2461 |
|
2462 |
/* create a vector of keywords */ |
2463 |
static CBMAP *vectorizer(void *db, int id, void *kwdb){ |
2464 |
CBMAP *kwords; |
2465 |
char *mbuf; |
2466 |
int msiz; |
2467 |
if(!(mbuf = crget((CURIA *)kwdb, (char *)&id, sizeof(int), 0, -1, &msiz))) return NULL; |
2468 |
kwords = cbmapload(mbuf, msiz); |
2469 |
free(mbuf); |
2470 |
return kwords; |
2471 |
} |
2472 |
|
2473 |
|
2474 |
/* create a document object with an outer command */ |
2475 |
static ESTDOC *est_doc_new_with_xcmd(const char *buf, int size, const char *path, |
2476 |
const char *xcmd, const char *tmpdir, |
2477 |
const char *penc, int plang){ |
2478 |
ESTDOC *doc; |
2479 |
const char *pv, *ext; |
2480 |
char iname[URIBUFSIZ], oname[URIBUFSIZ], ebuf[URIBUFSIZ], cmd[URIBUFSIZ]; |
2481 |
char *rbuf, numbuf[NUMBUFSIZ]; |
2482 |
int fmt, rsiz; |
2483 |
assert(buf && size >= 0 && path && xcmd && tmpdir); |
2484 |
sprintf(ebuf, "ESTORIGFILE=%s", path); |
2485 |
ext = NULL; |
2486 |
if((pv = strrchr(path, ESTPATHCHR)) != NULL) path = pv; |
2487 |
if((pv = strrchr(path, ESTEXTCHR)) != NULL) ext = pv; |
2488 |
if(!ext) ext = ""; |
2489 |
sprintf(iname, "%s%cxcmd-in-%08d%s", tmpdir, ESTPATHCHR, getpid(), ext); |
2490 |
sprintf(oname, "%s%cxcmd-out-%08d%cest", tmpdir, ESTPATHCHR, getpid(), ESTEXTCHR); |
2491 |
fmt = FF_DRAFT; |
2492 |
if(cbstrfwmatch(xcmd, "T@")){ |
2493 |
fmt = FF_TEXT; |
2494 |
xcmd += 2; |
2495 |
} else if(cbstrfwmatch(xcmd, "H@")){ |
2496 |
fmt = FF_HTML; |
2497 |
xcmd += 2; |
2498 |
} else if(cbstrfwmatch(xcmd, "M@")){ |
2499 |
fmt = FF_MIME; |
2500 |
xcmd += 2; |
2501 |
} |
2502 |
sprintf(cmd, "%s %s %s", xcmd, iname, oname); |
2503 |
if(!g_filtorig) cbwritefile(iname, buf, size); |
2504 |
putenv(ebuf); |
2505 |
system(cmd); |
2506 |
if((rbuf = cbreadfile(oname, &rsiz)) != NULL){ |
2507 |
switch(fmt){ |
2508 |
case FF_TEXT: |
2509 |
doc = est_doc_new_from_text(rbuf, rsiz, penc, plang); |
2510 |
break; |
2511 |
case FF_HTML: |
2512 |
doc = est_doc_new_from_html(rbuf, rsiz, penc, plang); |
2513 |
break; |
2514 |
case FF_MIME: |
2515 |
doc = est_doc_new_from_mime(rbuf, rsiz, penc, plang); |
2516 |
break; |
2517 |
default: |
2518 |
doc = est_doc_new_from_draft_enc(rbuf, rsiz, penc); |
2519 |
break; |
2520 |
} |
2521 |
free(rbuf); |
2522 |
} else { |
2523 |
doc = est_doc_new(); |
2524 |
} |
2525 |
if(fmt != FF_DRAFT){ |
2526 |
sprintf(numbuf, "%d", size); |
2527 |
est_doc_add_attr(doc, ESTDATTRSIZE, numbuf); |
2528 |
est_doc_add_attr(doc, ESTDATTRTYPE, est_ext_type(ext)); |
2529 |
} |
2530 |
unlink(oname); |
2531 |
unlink(iname); |
2532 |
return doc; |
2533 |
} |
2534 |
|
2535 |
|
2536 |
/* create a document object from draft data in another encoding */ |
2537 |
static ESTDOC *est_doc_new_from_draft_enc(const char *buf, int size, const char *enc){ |
2538 |
ESTDOC *doc; |
2539 |
char *rbuf; |
2540 |
assert(buf); |
2541 |
if(enc && (rbuf = est_iconv(buf, size, enc, "UTF-8", NULL, NULL)) != NULL){ |
2542 |
doc = est_doc_new_from_draft(rbuf); |
2543 |
free(rbuf); |
2544 |
} else { |
2545 |
doc = est_doc_new_from_draft(buf); |
2546 |
} |
2547 |
return doc; |
2548 |
} |
2549 |
|
2550 |
|
2551 |
/* create a document object from plain text */ |
2552 |
static ESTDOC *est_doc_new_from_text(const char *buf, int size, const char *penc, int plang){ |
2553 |
ESTDOC *doc; |
2554 |
CBLIST *lines; |
2555 |
CBDATUM *datum; |
2556 |
const char *enc, *text, *line; |
2557 |
char *nbuf, numbuf[NUMBUFSIZ]; |
2558 |
int i; |
2559 |
assert(buf); |
2560 |
doc = est_doc_new(); |
2561 |
enc = penc ? penc : est_enc_name(buf, size, plang); |
2562 |
if(!strcmp(enc, "UTF-8")){ |
2563 |
nbuf = NULL; |
2564 |
text = buf; |
2565 |
} else { |
2566 |
text = buf; |
2567 |
nbuf = est_iconv(buf, size, enc, "UTF-8", NULL, NULL); |
2568 |
if(nbuf) text = nbuf; |
2569 |
} |
2570 |
lines = cbsplit(text, -1, "\n"); |
2571 |
datum = cbdatumopen("", 0); |
2572 |
for(i = 0; i < CB_LISTNUM(lines); i++){ |
2573 |
line = CB_LISTVAL(lines, i, NULL); |
2574 |
while(*line == ' ' || *line == '\t' || *line == '\r'){ |
2575 |
line++; |
2576 |
} |
2577 |
if(line[0] == '\0'){ |
2578 |
est_doc_add_text(doc, CB_DATUMPTR(datum)); |
2579 |
cbdatumsetsize(datum, 0); |
2580 |
} else { |
2581 |
cbdatumcat(datum, " ", 1); |
2582 |
cbdatumcat(datum, line, -1); |
2583 |
} |
2584 |
} |
2585 |
est_doc_add_text(doc, CB_DATUMPTR(datum)); |
2586 |
cbdatumclose(datum); |
2587 |
cblistclose(lines); |
2588 |
est_doc_add_attr(doc, ESTDATTRTYPE, "text/plain"); |
2589 |
sprintf(numbuf, "%d", size); |
2590 |
est_doc_add_attr(doc, ESTDATTRSIZE, numbuf); |
2591 |
if(nbuf) free(nbuf); |
2592 |
return doc; |
2593 |
} |
2594 |
|
2595 |
|
2596 |
/* create a document object from HTML */ |
2597 |
static ESTDOC *est_doc_new_from_html(const char *buf, int size, const char *penc, int plang){ |
2598 |
ESTDOC *doc; |
2599 |
CBLIST *elems; |
2600 |
CBMAP *attrs; |
2601 |
CBDATUM *datum; |
2602 |
const char *enc, *html, *elem, *next, *name, *content; |
2603 |
char *nbuf, *nenc, *rbuf, *lbuf, numbuf[NUMBUFSIZ]; |
2604 |
int i, esiz; |
2605 |
assert(buf); |
2606 |
doc = est_doc_new(); |
2607 |
enc = est_enc_name(buf, size, plang); |
2608 |
html = NULL; |
2609 |
nbuf = NULL; |
2610 |
if(!strcmp(enc, "UTF-16") || !strcmp(enc, "UTF-16BE") || !strcmp(enc, "UTF-16LE")){ |
2611 |
nbuf = est_iconv(buf, size, enc, "UTF-8", NULL, NULL); |
2612 |
} else if(!strcmp(enc, "US-ASCII")){ |
2613 |
nbuf = NULL; |
2614 |
} else { |
2615 |
if((nenc = penc ? cbmemdup(penc, -1) : est_html_enc(buf)) != NULL){ |
2616 |
if(cbstricmp(nenc, "UTF-8")){ |
2617 |
nbuf = est_iconv(buf, size, nenc, "UTF-8", NULL, NULL); |
2618 |
if(!nbuf) nbuf = est_iconv(buf, size, enc, "UTF-8", NULL, NULL); |
2619 |
} |
2620 |
free(nenc); |
2621 |
} else { |
2622 |
nbuf = est_iconv(buf, size, enc, "UTF-8", NULL, NULL); |
2623 |
} |
2624 |
} |
2625 |
if(nbuf) html = nbuf; |
2626 |
if(!html) html = buf; |
2627 |
datum = cbdatumopen("", 0); |
2628 |
elems = cbxmlbreak(html, TRUE); |
2629 |
for(i = 0; i < CB_LISTNUM(elems); i++){ |
2630 |
elem = CB_LISTVAL2(elems, i, &esiz); |
2631 |
if(!(next = cblistval(elems, i + 1, NULL))) next = ""; |
2632 |
if(elem[0] == '<'){ |
2633 |
if(cbstrfwimatch(elem, "<meta")){ |
2634 |
attrs = cbxmlattrs(elem); |
2635 |
name = cbmapget(attrs, "name", -1, NULL); |
2636 |
if(!name) name = cbmapget(attrs, "Name", -1, NULL); |
2637 |
if(!name) name = cbmapget(attrs, "NAME", -1, NULL); |
2638 |
if(!name) name = cbmapget(attrs, "http-equiv", -1, NULL); |
2639 |
if(!name) name = cbmapget(attrs, "Http-equiv", -1, NULL); |
2640 |
if(!name) name = cbmapget(attrs, "Http-Equiv", -1, NULL); |
2641 |
if(!name) name = cbmapget(attrs, "HTTP-EQUIV", -1, NULL); |
2642 |
content = cbmapget(attrs, "content", -1, NULL); |
2643 |
if(!content) content = cbmapget(attrs, "Content", -1, NULL); |
2644 |
if(!content) content = cbmapget(attrs, "CONTENT", -1, NULL); |
2645 |
if(name && content){ |
2646 |
lbuf = cbmemdup(name, -1); |
2647 |
cbstrtolower(lbuf); |
2648 |
cbstrsqzspc(lbuf); |
2649 |
if(!strcmp(lbuf, "author")){ |
2650 |
if(strchr(content, '&')){ |
2651 |
rbuf = est_html_raw_text(content); |
2652 |
est_doc_add_attr(doc, ESTDATTRAUTHOR, rbuf); |
2653 |
free(rbuf); |
2654 |
} else { |
2655 |
est_doc_add_attr(doc, ESTDATTRAUTHOR, content); |
2656 |
} |
2657 |
} |
2658 |
if(name[0] != '@'){ |
2659 |
if(strchr(content, '&')){ |
2660 |
rbuf = est_html_raw_text(content); |
2661 |
est_doc_add_attr(doc, lbuf, rbuf); |
2662 |
free(rbuf); |
2663 |
} else { |
2664 |
est_doc_add_attr(doc, lbuf, content); |
2665 |
} |
2666 |
} |
2667 |
free(lbuf); |
2668 |
} |
2669 |
cbmapclose(attrs); |
2670 |
} else if(cbstrfwimatch(elem, "<title") && next[0] != '\0' && next[0] != '<'){ |
2671 |
if(strchr(next, '&')){ |
2672 |
rbuf = est_html_raw_text(next); |
2673 |
est_doc_add_attr(doc, ESTDATTRTITLE, rbuf); |
2674 |
est_doc_add_hidden_text(doc, rbuf); |
2675 |
free(rbuf); |
2676 |
} else { |
2677 |
est_doc_add_attr(doc, ESTDATTRTITLE, next); |
2678 |
est_doc_add_hidden_text(doc, next); |
2679 |
} |
2680 |
i++; |
2681 |
} else if(cbstrfwimatch(elem, "<style") || cbstrfwimatch(elem, "<script")){ |
2682 |
i++; |
2683 |
} else if(cbstrfwimatch(elem, "<h1") || cbstrfwimatch(elem, "<h2") || |
2684 |
cbstrfwimatch(elem, "<h3") || cbstrfwimatch(elem, "<h4") || |
2685 |
cbstrfwimatch(elem, "<h5") || cbstrfwimatch(elem, "<h6") || |
2686 |
cbstrfwimatch(elem, "<p>") || cbstrfwimatch(elem, "<p ") || |
2687 |
cbstrfwimatch(elem, "<div") || cbstrfwimatch(elem, "<hr") || |
2688 |
cbstrfwimatch(elem, "<ul") || cbstrfwimatch(elem, "<ol") || |
2689 |
cbstrfwimatch(elem, "<dl") || cbstrfwimatch(elem, "<li") || |
2690 |
cbstrfwimatch(elem, "<dt") || cbstrfwimatch(elem, "<dd") || |
2691 |
cbstrfwimatch(elem, "<th") || cbstrfwimatch(elem, "<td") || |
2692 |
cbstrfwimatch(elem, "<pre")){ |
2693 |
if(strchr(CB_DATUMPTR(datum), '&')){ |
2694 |
rbuf = est_html_raw_text(CB_DATUMPTR(datum)); |
2695 |
est_doc_add_text(doc, rbuf); |
2696 |
free(rbuf); |
2697 |
} else { |
2698 |
est_doc_add_text(doc, CB_DATUMPTR(datum)); |
2699 |
} |
2700 |
cbdatumsetsize(datum, 0); |
2701 |
} |
2702 |
} else { |
2703 |
cbdatumcat(datum, " ", -1); |
2704 |
cbdatumcat(datum, elem, esiz); |
2705 |
} |
2706 |
} |
2707 |
cblistclose(elems); |
2708 |
if(strchr(CB_DATUMPTR(datum), '&')){ |
2709 |
rbuf = est_html_raw_text(CB_DATUMPTR(datum)); |
2710 |
est_doc_add_text(doc, rbuf); |
2711 |
free(rbuf); |
2712 |
} else { |
2713 |
est_doc_add_text(doc, CB_DATUMPTR(datum)); |
2714 |
} |
2715 |
cbdatumclose(datum); |
2716 |
if(nbuf) free(nbuf); |
2717 |
est_doc_add_attr(doc, ESTDATTRTYPE, "text/html"); |
2718 |
sprintf(numbuf, "%d", size); |
2719 |
est_doc_add_attr(doc, ESTDATTRSIZE, numbuf); |
2720 |
return doc; |
2721 |
} |
2722 |
|
2723 |
|
2724 |
/* get the encoding of an HTML string */ |
2725 |
static char *est_html_enc(const char *str){ |
2726 |
CBLIST *elems; |
2727 |
CBMAP *attrs; |
2728 |
const char *elem, *equiv, *content; |
2729 |
char *enc, *pv; |
2730 |
int i; |
2731 |
assert(str); |
2732 |
elems = cbxmlbreak(str, TRUE); |
2733 |
for(i = 0; i < CB_LISTNUM(elems); i++){ |
2734 |
elem = CB_LISTVAL(elems, i, NULL); |
2735 |
if(elem[0] != '<' || !cbstrfwimatch(elem, "<meta")) continue; |
2736 |
enc = NULL; |
2737 |
attrs = cbxmlattrs(elem); |
2738 |
equiv = cbmapget(attrs, "http-equiv", -1, NULL); |
2739 |
if(!equiv) equiv = cbmapget(attrs, "HTTP-EQUIV", -1, NULL); |
2740 |
if(!equiv) equiv = cbmapget(attrs, "Http-Equiv", -1, NULL); |
2741 |
if(!equiv) equiv = cbmapget(attrs, "Http-equiv", -1, NULL); |
2742 |
if(equiv && !cbstricmp(equiv, "Content-Type")){ |
2743 |
content = cbmapget(attrs, "content", -1, NULL); |
2744 |
if(!content) content = cbmapget(attrs, "Content", -1, NULL); |
2745 |
if(!content) content = cbmapget(attrs, "CONTENT", -1, NULL); |
2746 |
if(content && ((pv = strstr(content, "charset")) != NULL || |
2747 |
(pv = strstr(content, "Charset")) != NULL || |
2748 |
(pv = strstr(content, "CHARSET")) != NULL)){ |
2749 |
enc = cbmemdup(pv + 8, -1); |
2750 |
if((pv = strchr(enc, ';')) != NULL || (pv = strchr(enc, '\r')) != NULL || |
2751 |
(pv = strchr(enc, '\n')) != NULL || (pv = strchr(enc, ' ')) != NULL) *pv = '\0'; |
2752 |
} |
2753 |
} |
2754 |
cbmapclose(attrs); |
2755 |
if(enc){ |
2756 |
cblistclose(elems); |
2757 |
return enc; |
2758 |
} |
2759 |
} |
2760 |
cblistclose(elems); |
2761 |
return NULL; |
2762 |
} |
2763 |
|
2764 |
|
2765 |
/* unescape entity references of HTML */ |
2766 |
static char *est_html_raw_text(const char *html){ |
2767 |
static const char *pairs[] = { |
2768 |
/* basic symbols */ |
2769 |
"&", "&", "<", "<", ">", ">", """, "\"", "'", "'", |
2770 |
/* ISO-8859-1 */ |
2771 |
" ", "\xc2\xa0", "¡", "\xc2\xa1", "¢", "\xc2\xa2", |
2772 |
"£", "\xc2\xa3", "¤", "\xc2\xa4", "¥", "\xc2\xa5", |
2773 |
"¦", "\xc2\xa6", "§", "\xc2\xa7", "¨", "\xc2\xa8", |
2774 |
"©", "\xc2\xa9", "ª", "\xc2\xaa", "«", "\xc2\xab", |
2775 |
"¬", "\xc2\xac", "­", "\xc2\xad", "®", "\xc2\xae", |
2776 |
"¯", "\xc2\xaf", "°", "\xc2\xb0", "±", "\xc2\xb1", |
2777 |
"²", "\xc2\xb2", "³", "\xc2\xb3", "´", "\xc2\xb4", |
2778 |
"µ", "\xc2\xb5", "¶", "\xc2\xb6", "·", "\xc2\xb7", |
2779 |
"¸", "\xc2\xb8", "¹", "\xc2\xb9", "º", "\xc2\xba", |
2780 |
"»", "\xc2\xbb", "¼", "\xc2\xbc", "½", "\xc2\xbd", |
2781 |
"¾", "\xc2\xbe", "¿", "\xc2\xbf", "À", "\xc3\x80", |
2782 |
"Á", "\xc3\x81", "Â", "\xc3\x82", "Ã", "\xc3\x83", |
2783 |
"Ä", "\xc3\x84", "Å", "\xc3\x85", "Æ", "\xc3\x86", |
2784 |
"Ç", "\xc3\x87", "È", "\xc3\x88", "É", "\xc3\x89", |
2785 |
"Ê", "\xc3\x8a", "Ë", "\xc3\x8b", "Ì", "\xc3\x8c", |
2786 |
"Í", "\xc3\x8d", "Î", "\xc3\x8e", "Ï", "\xc3\x8f", |
2787 |
"Ð", "\xc3\x90", "Ñ", "\xc3\x91", "Ò", "\xc3\x92", |
2788 |
"Ó", "\xc3\x93", "Ô", "\xc3\x94", "Õ", "\xc3\x95", |
2789 |
"Ö", "\xc3\x96", "×", "\xc3\x97", "Ø", "\xc3\x98", |
2790 |
"Ù", "\xc3\x99", "Ú", "\xc3\x9a", "Û", "\xc3\x9b", |
2791 |
"Ü", "\xc3\x9c", "Ý", "\xc3\x9d", "Þ", "\xc3\x9e", |
2792 |
"ß", "\xc3\x9f", "à", "\xc3\xa0", "á", "\xc3\xa1", |
2793 |
"â", "\xc3\xa2", "ã", "\xc3\xa3", "ä", "\xc3\xa4", |
2794 |
"å", "\xc3\xa5", "æ", "\xc3\xa6", "ç", "\xc3\xa7", |
2795 |
"è", "\xc3\xa8", "é", "\xc3\xa9", "ê", "\xc3\xaa", |
2796 |
"ë", "\xc3\xab", "ì", "\xc3\xac", "í", "\xc3\xad", |
2797 |
"î", "\xc3\xae", "ï", "\xc3\xaf", "ð", "\xc3\xb0", |
2798 |
"ñ", "\xc3\xb1", "ò", "\xc3\xb2", "ó", "\xc3\xb3", |
2799 |
"ô", "\xc3\xb4", "õ", "\xc3\xb5", "ö", "\xc3\xb6", |
2800 |
"÷", "\xc3\xb7", "ø", "\xc3\xb8", "ù", "\xc3\xb9", |
2801 |
"ú", "\xc3\xba", "û", "\xc3\xbb", "ü", "\xc3\xbc", |
2802 |
"ý", "\xc3\xbd", "þ", "\xc3\xbe", "ÿ", "\xc3\xbf", |
2803 |
/* ISO-10646 */ |
2804 |
"ƒ", "\xc6\x92", "Α", "\xce\x91", "Β", "\xce\x92", |
2805 |
"Γ", "\xce\x93", "Δ", "\xce\x94", "Ε", "\xce\x95", |
2806 |
"Ζ", "\xce\x96", "Η", "\xce\x97", "Θ", "\xce\x98", |
2807 |
"Ι", "\xce\x99", "Κ", "\xce\x9a", "Λ", "\xce\x9b", |
2808 |
"Μ", "\xce\x9c", "Ν", "\xce\x9d", "Ξ", "\xce\x9e", |
2809 |
"Ο", "\xce\x9f", "Π", "\xce\xa0", "Ρ", "\xce\xa1", |
2810 |
"Σ", "\xce\xa3", "Τ", "\xce\xa4", "Υ", "\xce\xa5", |
2811 |
"Φ", "\xce\xa6", "Χ", "\xce\xa7", "Ψ", "\xce\xa8", |
2812 |
"Ω", "\xce\xa9", "α", "\xce\xb1", "β", "\xce\xb2", |
2813 |
"γ", "\xce\xb3", "δ", "\xce\xb4", "ε", "\xce\xb5", |
2814 |
"ζ", "\xce\xb6", "η", "\xce\xb7", "θ", "\xce\xb8", |
2815 |
"ι", "\xce\xb9", "κ", "\xce\xba", "λ", "\xce\xbb", |
2816 |
"μ", "\xce\xbc", "ν", "\xce\xbd", "ξ", "\xce\xbe", |
2817 |
"ο", "\xce\xbf", "π", "\xcf\x80", "ρ", "\xcf\x81", |
2818 |
"ς", "\xcf\x82", "σ", "\xcf\x83", "τ", "\xcf\x84", |
2819 |
"υ", "\xcf\x85", "φ", "\xcf\x86", "χ", "\xcf\x87", |
2820 |
"ψ", "\xcf\x88", "ω", "\xcf\x89", "ϑ", "\xcf\x91", |
2821 |
"ϒ", "\xcf\x92", "ϖ", "\xcf\x96", "•", "\xe2\x80\xa2", |
2822 |
"…", "\xe2\x80\xa6", "′", "\xe2\x80\xb2", "″", "\xe2\x80\xb3", |
2823 |
"‾", "\xe2\x80\xbe", "⁄", "\xe2\x81\x84", "℘", "\xe2\x84\x98", |
2824 |
"ℑ", "\xe2\x84\x91", "ℜ", "\xe2\x84\x9c", "™", "\xe2\x84\xa2", |
2825 |
"ℵ", "\xe2\x84\xb5", "←", "\xe2\x86\x90", "↑", "\xe2\x86\x91", |
2826 |
"→", "\xe2\x86\x92", "↓", "\xe2\x86\x93", "↔", "\xe2\x86\x94", |
2827 |
"↵", "\xe2\x86\xb5", "⇐", "\xe2\x87\x90", "⇑", "\xe2\x87\x91", |
2828 |
"⇒", "\xe2\x87\x92", "⇓", "\xe2\x87\x93", "⇔", "\xe2\x87\x94", |
2829 |
"∀", "\xe2\x88\x80", "∂", "\xe2\x88\x82", "∃", "\xe2\x88\x83", |
2830 |
"∅", "\xe2\x88\x85", "∇", "\xe2\x88\x87", "∈", "\xe2\x88\x88", |
2831 |
"∉", "\xe2\x88\x89", "∋", "\xe2\x88\x8b", "∏", "\xe2\x88\x8f", |
2832 |
"∑", "\xe2\x88\x91", "−", "\xe2\x88\x92", "∗", "\xe2\x88\x97", |
2833 |
"√", "\xe2\x88\x9a", "∝", "\xe2\x88\x9d", "∞", "\xe2\x88\x9e", |
2834 |
"∠", "\xe2\x88\xa0", "∧", "\xe2\x88\xa7", "∨", "\xe2\x88\xa8", |
2835 |
"∩", "\xe2\x88\xa9", "∪", "\xe2\x88\xaa", "∫", "\xe2\x88\xab", |
2836 |
"∴", "\xe2\x88\xb4", "∼", "\xe2\x88\xbc", "≅", "\xe2\x89\x85", |
2837 |
"≈", "\xe2\x89\x88", "≠", "\xe2\x89\xa0", "≡", "\xe2\x89\xa1", |
2838 |
"≤", "\xe2\x89\xa4", "≥", "\xe2\x89\xa5", "⊂", "\xe2\x8a\x82", |
2839 |
"⊃", "\xe2\x8a\x83", "⊄", "\xe2\x8a\x84", "⊆", "\xe2\x8a\x86", |
2840 |
"⊇", "\xe2\x8a\x87", "⊕", "\xe2\x8a\x95", "⊗", "\xe2\x8a\x97", |
2841 |
"⊥", "\xe2\x8a\xa5", "⋅", "\xe2\x8b\x85", "⌈", "\xe2\x8c\x88", |
2842 |
"⌉", "\xe2\x8c\x89", "⌊", "\xe2\x8c\x8a", "⌋", "\xe2\x8c\x8b", |
2843 |
"⟨", "\xe2\x8c\xa9", "⟩", "\xe2\x8c\xaa", "◊", "\xe2\x97\x8a", |
2844 |
"♠", "\xe2\x99\xa0", "♣", "\xe2\x99\xa3", "♥", "\xe2\x99\xa5", |
2845 |
"♦", "\xe2\x99\xa6", "Œ", "\xc5\x92", "œ", "\xc5\x93", |
2846 |
"Š", "\xc5\xa0", "š", "\xc5\xa1", "Ÿ", "\xc5\xb8", |
2847 |
"ˆ", "\xcb\x86", "˜", "\xcb\x9c", " ", "\xe2\x80\x82", |
2848 |
" ", "\xe2\x80\x83", " ", "\xe2\x80\x89", "‌", "\xe2\x80\x8c", |
2849 |
"‍", "\xe2\x80\x8d", "‎", "\xe2\x80\x8e", "‏", "\xe2\x80\x8f", |
2850 |
"–", "\xe2\x80\x93", "—", "\xe2\x80\x94", "‘", "\xe2\x80\x98", |
2851 |
"’", "\xe2\x80\x99", "‚", "\xe2\x80\x9a", "“", "\xe2\x80\x9c", |
2852 |
"”", "\xe2\x80\x9d", "„", "\xe2\x80\x9e", "†", "\xe2\x80\xa0", |
2853 |
"‡", "\xe2\x80\xa1", "‰", "\xe2\x80\xb0", "‹", "\xe2\x80\xb9", |
2854 |
"›", "\xe2\x80\xba", "€", "\xe2\x82\xac", |
2855 |
NULL |
2856 |
}; |
2857 |
char *raw, *wp, buf[2], *tmp; |
2858 |
int i, j, hit, num, tsiz; |
2859 |
assert(html); |
2860 |
CB_MALLOC(raw, strlen(html) * 3 + 1); |
2861 |
wp = raw; |
2862 |
while(*html != '\0'){ |
2863 |
if(*html == '&'){ |
2864 |
if(*(html + 1) == '#'){ |
2865 |
if(*(html + 2) == 'x' || *(html + 2) == 'X'){ |
2866 |
num = strtol(html + 3, NULL, 16); |
2867 |
} else { |
2868 |
num = atoi(html + 2); |
2869 |
} |
2870 |
buf[0] = num / 256; |
2871 |
buf[1] = num % 256; |
2872 |
if((tmp = est_uconv_out(buf, 2, &tsiz)) != NULL){ |
2873 |
for(j = 0; j < tsiz; j++){ |
2874 |
*wp = ((unsigned char *)tmp)[j]; |
2875 |
wp++; |
2876 |
} |
2877 |
free(tmp); |
2878 |
} |
2879 |
while(*html != ';' && *html != ' ' && *html != '\n' && *html != '\0'){ |
2880 |
html++; |
2881 |
} |
2882 |
if(*html == ';') html++; |
2883 |
} else { |
2884 |
hit = FALSE; |
2885 |
for(i = 0; pairs[i] != NULL; i += 2){ |
2886 |
if(cbstrfwmatch(html, pairs[i])){ |
2887 |
wp += sprintf(wp, "%s", pairs[i+1]); |
2888 |
html += strlen(pairs[i]); |
2889 |
hit = TRUE; |
2890 |
break; |
2891 |
} |
2892 |
} |
2893 |
if(!hit){ |
2894 |
*wp = *html; |
2895 |
wp++; |
2896 |
html++; |
2897 |
} |
2898 |
} |
2899 |
} else { |
2900 |
*wp = *html; |
2901 |
wp++; |
2902 |
html++; |
2903 |
} |
2904 |
} |
2905 |
*wp = '\0'; |
2906 |
return raw; |
2907 |
} |
2908 |
|
2909 |
|
2910 |
/* create a document object from MIME */ |
2911 |
static ESTDOC *est_doc_new_from_mime(const char *buf, int size, const char *penc, int plang){ |
2912 |
ESTDOC *doc, *tdoc; |
2913 |
CBMAP *attrs; |
2914 |
const CBLIST *texts; |
2915 |
CBLIST *parts, *lines; |
2916 |
CBDATUM *datum; |
2917 |
const char *key, *val, *bound, *part, *text, *line; |
2918 |
char *body, *swap, numbuf[NUMBUFSIZ]; |
2919 |
int i, j, bsiz, psiz, ssiz, mht; |
2920 |
assert(buf); |
2921 |
doc = est_doc_new(); |
2922 |
attrs = cbmapopenex(MINIBNUM); |
2923 |
body = cbmimebreak(buf, size, attrs, &bsiz); |
2924 |
if((val = cbmapget(attrs, "subject", -1, NULL)) != NULL){ |
2925 |
est_doc_add_attr_mime(doc, ESTDATTRTITLE, val); |
2926 |
if((val = est_doc_attr(doc, ESTDATTRTITLE)) != NULL) est_doc_add_hidden_text(doc, val); |
2927 |
} |
2928 |
if((val = cbmapget(attrs, "from", -1, NULL)) != NULL) |
2929 |
est_doc_add_attr_mime(doc, ESTDATTRAUTHOR, val); |
2930 |
if((val = cbmapget(attrs, "date", -1, NULL)) != NULL){ |
2931 |
est_doc_add_attr_mime(doc, ESTDATTRCDATE, val); |
2932 |
est_doc_add_attr_mime(doc, ESTDATTRMDATE, val); |
2933 |
} |
2934 |
est_doc_add_attr(doc, ESTDATTRTYPE, "message/rfc822"); |
2935 |
sprintf(numbuf, "%d", size); |
2936 |
est_doc_add_attr(doc, ESTDATTRSIZE, numbuf); |
2937 |
cbmapiterinit(attrs); |
2938 |
while((key = cbmapiternext(attrs, NULL)) != NULL){ |
2939 |
if((key[0] >= 'A' && key[0] <= 'Z') || key[0] == '@') continue; |
2940 |
val = cbmapget(attrs, key, -1, NULL); |
2941 |
est_doc_add_attr_mime(doc, key, val); |
2942 |
} |
2943 |
if((key = cbmapget(attrs, "TYPE", -1, NULL)) != NULL && cbstrfwimatch(key, "multipart/")){ |
2944 |
mht = cbstrfwimatch(key, "multipart/related"); |
2945 |
if((bound = cbmapget(attrs, "BOUNDARY", -1, NULL)) != NULL){ |
2946 |
parts = cbmimeparts(body, bsiz, bound); |
2947 |
for(i = 0; i < CB_LISTNUM(parts) && i < 8; i++){ |
2948 |
part = CB_LISTVAL2(parts, i, &psiz); |
2949 |
tdoc = est_doc_new_from_mime(part, psiz, penc, plang); |
2950 |
if(mht){ |
2951 |
if((text = est_doc_attr(tdoc, ESTDATTRTITLE)) != NULL) |
2952 |
est_doc_add_attr(doc, ESTDATTRTITLE, text); |
2953 |
if((text = est_doc_attr(tdoc, ESTDATTRAUTHOR)) != NULL) |
2954 |
est_doc_add_attr(doc, ESTDATTRAUTHOR, text); |
2955 |
} |
2956 |
texts = est_doc_texts(tdoc); |
2957 |
for(j = 0; j < CB_LISTNUM(texts); j++){ |
2958 |
text = CB_LISTVAL(texts, j, NULL); |
2959 |
est_doc_add_text(doc, text); |
2960 |
} |
2961 |
est_doc_delete(tdoc); |
2962 |
} |
2963 |
cblistclose(parts); |
2964 |
} |
2965 |
} else { |
2966 |
if((key = cbmapget(attrs, "content-transfer-encoding", -1, NULL)) != NULL && |
2967 |
cbstrfwimatch(key, "base64")){ |
2968 |
swap = cbbasedecode(body, &ssiz); |
2969 |
free(body); |
2970 |
body = swap; |
2971 |
bsiz = ssiz; |
2972 |
} else if((key = cbmapget(attrs, "content-transfer-encoding", -1, NULL)) != NULL && |
2973 |
cbstrfwimatch(key, "quoted-printable")){ |
2974 |
swap = cbquotedecode(body, &ssiz); |
2975 |
free(body); |
2976 |
body = swap; |
2977 |
bsiz = ssiz; |
2978 |
} |
2979 |
if(!(key = cbmapget(attrs, "TYPE", -1, NULL)) || cbstrfwimatch(key, "text/plain")){ |
2980 |
if(penc && (swap = est_iconv(body, bsiz, penc, "UTF-8", &ssiz, NULL)) != NULL){ |
2981 |
free(body); |
2982 |
body = swap; |
2983 |
bsiz = ssiz; |
2984 |
} else if((key = cbmapget(attrs, "CHARSET", -1, NULL)) != NULL && |
2985 |
(swap = est_iconv(body, bsiz, key, "UTF-8", &ssiz, NULL)) != NULL){ |
2986 |
free(body); |
2987 |
body = swap; |
2988 |
bsiz = ssiz; |
2989 |
} |
2990 |
lines = cbsplit(body, bsiz, "\n"); |
2991 |
datum = cbdatumopen("", 0); |
2992 |
for(i = 0; i < CB_LISTNUM(lines); i++){ |
2993 |
line = CB_LISTVAL(lines, i, NULL); |
2994 |
while(*line == ' ' || *line == '>' || *line == '|' || *line == '\t' || *line == '\r'){ |
2995 |
line++; |
2996 |
} |
2997 |
if(line[0] == '\0'){ |
2998 |
est_doc_add_text(doc, CB_DATUMPTR(datum)); |
2999 |
cbdatumsetsize(datum, 0); |
3000 |
} else { |
3001 |
cbdatumcat(datum, " ", 1); |
3002 |
cbdatumcat(datum, line, -1); |
3003 |
} |
3004 |
} |
3005 |
est_doc_add_text(doc, CB_DATUMPTR(datum)); |
3006 |
cbdatumclose(datum); |
3007 |
cblistclose(lines); |
3008 |
} else if(cbstrfwimatch(key, "text/html")){ |
3009 |
tdoc = est_doc_new_from_html(body, bsiz, penc, plang); |
3010 |
if((text = est_doc_attr(tdoc, ESTDATTRTITLE)) != NULL){ |
3011 |
if(!est_doc_attr(doc, ESTDATTRTITLE)) est_doc_add_attr(doc, ESTDATTRTITLE, text); |
3012 |
est_doc_add_text(doc, text); |
3013 |
} |
3014 |
if((text = est_doc_attr(tdoc, ESTDATTRAUTHOR)) != NULL){ |
3015 |
if(!est_doc_attr(doc, ESTDATTRAUTHOR)) est_doc_add_attr(doc, ESTDATTRAUTHOR, text); |
3016 |
est_doc_add_text(doc, text); |
3017 |
} |
3018 |
texts = est_doc_texts(tdoc); |
3019 |
for(i = 0; i < CB_LISTNUM(texts); i++){ |
3020 |
text = CB_LISTVAL(texts, i, NULL); |
3021 |
est_doc_add_text(doc, text); |
3022 |
} |
3023 |
est_doc_delete(tdoc); |
3024 |
} else if(cbstrfwimatch(key, "message/rfc822")){ |
3025 |
tdoc = est_doc_new_from_mime(body, bsiz, penc, plang); |
3026 |
if((text = est_doc_attr(tdoc, ESTDATTRTITLE)) != NULL){ |
3027 |
if(!est_doc_attr(doc, ESTDATTRTITLE)) est_doc_add_attr(doc, ESTDATTRTITLE, text); |
3028 |
est_doc_add_text(doc, text); |
3029 |
} |
3030 |
if((text = est_doc_attr(tdoc, ESTDATTRAUTHOR)) != NULL){ |
3031 |
if(!est_doc_attr(doc, ESTDATTRAUTHOR)) est_doc_add_attr(doc, ESTDATTRAUTHOR, text); |
3032 |
est_doc_add_text(doc, text); |
3033 |
} |
3034 |
texts = est_doc_texts(tdoc); |
3035 |
for(i = 0; i < CB_LISTNUM(texts); i++){ |
3036 |
text = CB_LISTVAL(texts, i, NULL); |
3037 |
est_doc_add_text(doc, text); |
3038 |
} |
3039 |
est_doc_delete(tdoc); |
3040 |
} else if(cbstrfwimatch(key, "text/")){ |
3041 |
tdoc = est_doc_new_from_text(body, bsiz, penc, plang); |
3042 |
texts = est_doc_texts(tdoc); |
3043 |
for(i = 0; i < CB_LISTNUM(texts); i++){ |
3044 |
text = CB_LISTVAL(texts, i, NULL); |
3045 |
est_doc_add_text(doc, text); |
3046 |
} |
3047 |
est_doc_delete(tdoc); |
3048 |
} |
3049 |
} |
3050 |
free(body); |
3051 |
cbmapclose(attrs); |
3052 |
return doc; |
3053 |
} |
3054 |
|
3055 |
|
3056 |
/* set mime value as an attribute of a document */ |
3057 |
static void est_doc_add_attr_mime(ESTDOC *doc, const char *name, const char *value){ |
3058 |
char enc[64], *ebuf, *rbuf; |
3059 |
assert(doc && name && value); |
3060 |
ebuf = cbmimedecode(value, enc); |
3061 |
if((rbuf = est_iconv(ebuf, -1, enc, "UTF-8", NULL, NULL)) != NULL){ |
3062 |
est_doc_add_attr(doc, name, rbuf); |
3063 |
free(rbuf); |
3064 |
} |
3065 |
free(ebuf); |
3066 |
} |
3067 |
|
3068 |
|
3069 |
/* generate a document with random text */ |
3070 |
static ESTDOC *est_doc_new_from_chaos(int cnum, int snum, int mode){ |
3071 |
ESTDOC *doc; |
3072 |
char *str; |
3073 |
int i; |
3074 |
doc = est_doc_new(); |
3075 |
snum *= pow(est_random_nd() + 0.5, 3.0); |
3076 |
if(mode == RD_RAND){ |
3077 |
mode = est_random() * 100; |
3078 |
if(mode < 20){ |
3079 |
mode = RD_ENG; |
3080 |
est_doc_add_attr(doc, "mode", "english"); |
3081 |
} else if(mode < 40){ |
3082 |
mode = RD_LAT; |
3083 |
est_doc_add_attr(doc, "mode", "latin"); |
3084 |
} else if(mode < 60){ |
3085 |
mode = RD_EURO; |
3086 |
est_doc_add_attr(doc, "mode", "euromix"); |
3087 |
} else if(mode < 65){ |
3088 |
mode = RD_ORI; |
3089 |
est_doc_add_attr(doc, "mode", "oriental"); |
3090 |
} else if(mode < 95){ |
3091 |
mode = RD_JPN; |
3092 |
est_doc_add_attr(doc, "mode", "japanese"); |
3093 |
} else { |
3094 |
mode = RD_CHAO; |
3095 |
est_doc_add_attr(doc, "mode", "chaos"); |
3096 |
} |
3097 |
} |
3098 |
switch(mode){ |
3099 |
case RD_ENG: est_doc_add_attr(doc, "mode", "english"); break; |
3100 |
case RD_LAT: est_doc_add_attr(doc, "mode", "latin"); break; |
3101 |
case RD_ORI: est_doc_add_attr(doc, "mode", "oriental"); break; |
3102 |
case RD_JPN: est_doc_add_attr(doc, "mode", "japanese"); break; |
3103 |
case RD_EURO: est_doc_add_attr(doc, "mode", "euromix"); break; |
3104 |
case RD_CHAO: est_doc_add_attr(doc, "mode", "chaos"); break; |
3105 |
} |
3106 |
for(i = 0; i <= snum; i++){ |
3107 |
str = est_random_str(cnum, mode); |
3108 |
if(est_random() < 0.05){ |
3109 |
est_doc_add_hidden_text(doc, str); |
3110 |
} else { |
3111 |
est_doc_add_text(doc, str); |
3112 |
} |
3113 |
free(str); |
3114 |
} |
3115 |
return doc; |
3116 |
} |
3117 |
|
3118 |
|
3119 |
/* generate random string */ |
3120 |
static char *est_random_str(int cnum, int mode){ |
3121 |
const char echrs[] = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; |
3122 |
CBDATUM *buf; |
3123 |
char wc[2], *str; |
3124 |
int i, c, wlen, dec, mm, big, n; |
3125 |
buf = cbdatumopen("", 0); |
3126 |
cnum *= pow(est_random_nd() + 0.5, 3.0); |
3127 |
wlen = est_random_nd() * 8 + 4; |
3128 |
dec = (int)(est_random() * INT_MAX) % 10; |
3129 |
big = (((int)(est_random() * INT_MAX) % 0x29)) * 0x100; |
3130 |
for(i = 0; i < cnum; i++){ |
3131 |
switch(mode){ |
3132 |
case RD_ENG: case RD_LAT: case RD_EURO: |
3133 |
mm = (int)(est_random() * INT_MAX) % 100; |
3134 |
if((mode == RD_LAT || mode == RD_EURO) && mm < 5){ |
3135 |
c = 0x00a1 + (int)(pow(est_random_nd(), 2.0) * (0x00ff - 0x00a0)); |
3136 |
} else if(mode == RD_EURO && (mm < 30 || dec > 8)){ |
3137 |
if(dec % 2 == 0){ |
3138 |
c = 0x0391 + (int)(pow(est_random_nd(), 2.0) * (0x03d6 - 0x0391)); |
3139 |
} else { |
3140 |
c = 0x0400 + (int)(pow(est_random_nd(), 2.0) * (0x045f - 0x0400)); |
3141 |
} |
3142 |
} else if(mm < 95){ |
3143 |
if((n = est_random_nd() * (sizeof(echrs) - 1)) == (sizeof(echrs) - 1)) n = 0; |
3144 |
c = echrs[n]; |
3145 |
} else { |
3146 |
c = (int)(est_random() * ('@' - ' ')) + ' '; |
3147 |
} |
3148 |
if(--wlen < 1){ |
3149 |
c = ' '; |
3150 |
wlen = pow(est_random_nd(), 3.0) * 8 + 4; |
3151 |
dec = (int)(est_random() * INT_MAX) % 10; |
3152 |
} |
3153 |
break; |
3154 |
case RD_ORI: |
3155 |
c = big + est_random_nd() * 0x100; |
3156 |
if(--wlen < 1){ |
3157 |
wlen = pow(est_random_nd(), 3.0) * 12 + 6; |
3158 |
big = (((int)(est_random() * INT_MAX) % 0x29)) * 0x100; |
3159 |
} |
3160 |
break; |
3161 |
case RD_JPN: |
3162 |
if(dec < 4){ |
3163 |
c = 0x3041 + pow(est_random_nd(), 3.0) * (0x3094 - 0x3041); |
3164 |
} else if(dec < 7){ |
3165 |
c = 0x30a1 + pow(est_random_nd(), 3.0) * (0x30fe - 0x30a1); |
3166 |
} else if(dec < 9){ |
3167 |
c = 0x4e00 + pow(est_random_nd(), 3.0) * (0x9faf - 0x4e00); |
3168 |
} else { |
3169 |
if(est_random() < 0.7){ |
3170 |
c = 0x00a1 + (int)(pow(est_random_nd(), 2.0) * (0x00ff - 0x00a0)); |
3171 |
} else { |
3172 |
c = 0x3041 + est_random() * (0xffef - 0x3041); |
3173 |
} |
3174 |
} |
3175 |
if(--wlen < 1){ |
3176 |
wlen = pow(est_random_nd(), 3.0) * 12 + 6; |
3177 |
dec = (int)(est_random() * INT_MAX) % 10; |
3178 |
} |
3179 |
break; |
3180 |
default: |
3181 |
if(est_random() < 0.2){ |
3182 |
c = 0x00a1 + (int)est_random() * (0x00ff - 0x00a0); |
3183 |
} else { |
3184 |
c = (int)(est_random() * 0x10000); |
3185 |
} |
3186 |
break; |
3187 |
} |
3188 |
if(c <= 0 || c >= 0x10000) c = 0x0020; |
3189 |
wc[0] = c / 0x100; |
3190 |
wc[1] = c % 0x100; |
3191 |
cbdatumcat(buf, wc, 2); |
3192 |
} |
3193 |
str = est_iconv(CB_DATUMPTR(buf), CB_DATUMSIZE(buf), "UTF-16BE", "UTF-8", NULL, NULL); |
3194 |
cbdatumclose(buf); |
3195 |
return str; |
3196 |
} |
3197 |
|
3198 |
|
3199 |
|
3200 |
/* END OF FILE */ |