#!/usr/bin/perl

# Read files for a fulltext index.
# The file list is read from stdin (e.g. produced by find).
# Index entries go to stdout; one record per file goes to stderr.

# usage:
# find /foo -name \*.html | ./fulltext >/tmp/idx 2>/tmp/mst
# find /usr/share/doc -type f -a \! -name \*.htm\* | ./fulltext >/tmp/idx 2>/tmp/mst
# sort -o /tmp/idx /tmp/idx
# time ./openisis -write db/test/ft -stream -fmt mfn </tmp/mst
# time ./openisis -db db/test/ft -ifload 0 -v i </tmp/idx
# time ./openisis -db db/test/ft -ifchk -v i
# time ./openisis -db db/test/ft -search Descriptive -ifdump

use strict;
use warnings;

# Main loop: for every readable file named on the input, emit
#   - on STDERR: a "100<TAB>filename" line followed by a form-feed line
#     (the usage notes redirect STDERR to the file later fed to
#     "openisis -write ... -stream"), and
#   - on STDOUT: one line per word,
#       WORD <TAB> file-number <TAB> 800 <TAB> line-number <TAB> word-position
#     with WORD upper-cased.
# NOTE(review): 100 and 800 look like openisis field tags -- confirm
# against the openisis documentation before changing them.

# Only the first $MAX_LINES non-blank lines of each file are indexed.
my $MAX_LINES = 255;

# Running number of the files that could be opened; unreadable entries
# are skipped and do not consume a number.
my $fn = 0;

FILE: while ( my $path = <> ) {
    chomp $path;

    # Three-argument open with a lexical handle: an entry such as "cmd |"
    # or ">file" is treated as a literal path, never as a pipe or a mode,
    # and leading/trailing whitespace in a name is preserved.
    open( my $fh, '<', $path ) or next FILE;

    $fn++;
    print STDERR "100\t$path\n";

    my $line = 0;    # counts non-blank lines only
    LINE: while ( my $text = <$fh> ) {
        next LINE if $text =~ /^\s*$/;
        last LINE if ++$line > $MAX_LINES;
        chomp $text;

        my $w = 0;    # position of the word within its line, starting at 1
        for my $word ( split /\W+/, $text ) {

            # split yields an empty leading field when the line starts with
            # a non-word character; test the length (not the truth value)
            # so that the token "0" is still indexed.
            next unless length $word;
            printf "%s\t%d\t%d\t%d\t%d\n", uc($word), $fn, 800, $line, ++$w;
        }
    }
    close $fh;

    # Form feed on a line of its own ends this file's record on STDERR.
    print STDERR "\f\n";
}