#!/usr/bin/perl

# read files for fulltext index
# filelist is on stdin (e.g. by find)
# index entries go to stdout

# usage:
# find /foo -name \*.html | ./fulltext >/tmp/idx 2>/tmp/mst
# find /usr/share/doc -type f -a \! -name \*.htm\* | ./fulltext >/tmp/idx 2>/tmp/mst
# sort -o /tmp/idx /tmp/idx
# time ./openisis -write db/test/ft -stream -fmt mfn </tmp/mst
# time ./openisis -db db/test/ft -ifload 0 -v i </tmp/idx
# time ./openisis -db db/test/ft -ifchk -v i
# time ./openisis -db db/test/ft -search Descriptive -ifdump

use strict;
use warnings;

# Main loop: for every file name read from the input list, emit
#   - on STDERR: one record per opened file ("100\t<name>\n", terminated
#     by "\f\n"); the usage notes in this file redirect STDERR to a file
#     that is loaded as a record stream, so nothing else may be written
#     there (which is why unreadable files are skipped silently);
#   - on STDOUT: one tab-separated index entry per word:
#       WORD (upper-cased), file number, 800, line number, word number.
# NOTE(review): 100 and 800 look like field tags of the target database;
# confirm against the loader before changing them.

# Number of files opened so far; the current value identifies the file
# in every index entry.
my $fn = 0;

FILE:
while ( my $f = <> ) {
    chomp $f;

    # Three-argument open with a lexical handle: the name comes from an
    # external list, and the two-argument form would interpret leading or
    # trailing '>', '|', '-' or whitespace in it as an open mode.
    open( my $fh, '<', $f ) or next FILE;

    $fn++;
    print STDERR "100\t$f\n";

    # Counts non-blank lines only; blank lines are not numbered.
    my $line = 0;

    LINE:
    while ( my $text = <$fh> ) {
        next LINE if $text =~ /^\s*$/;

        # Only the first 255 non-blank lines of a file are indexed.
        last LINE if 255 < ++$line;
        chomp $text;

        # Position of the word within its line, starting at 1.
        my $w = 0;
        for my $word ( split /\W+/, $text ) {

            # split yields an empty leading field when the line starts
            # with a non-word character.  Test the length rather than the
            # truth value so that the token "0" is indexed as well.
            next unless length $word;
            printf "%s\t%d\t%d\t%d\t%d\n", uc($word), $fn, 800, $line, ++$w;
        }
    }
    close $fh;

    # Form feed ends the record of this file on STDERR.
    print STDERR "\f\n";
}