Contents of /trunk2/openisis/fulltext

#!/usr/bin/perl

#       read files for fulltext index
#       filelist is on stdin (e.g. by find)
#       index entries go to stdout

#       usage:
# find /foo -name \*.html | ./fulltext >/tmp/idx 2>/tmp/mst
# find /usr/share/doc -type f -a \! -name \*.htm\* | ./fulltext >/tmp/idx 2>/tmp/mst
#       sort -o /tmp/idx /tmp/idx
# time ./openisis -write db/test/ft -stream -fmt mfn </tmp/mst
# time ./openisis -db db/test/ft -ifload 0 -v i </tmp/idx
# time ./openisis -db db/test/ft -ifchk -v i
# time ./openisis -db db/test/ft -search Descriptive -ifdump

use strict;

# Build full-text index entries for a list of files.
#
# Input:   one file name per line, read through <> (stdin, or any files
#          named on the command line).
# STDOUT:  one tab-separated index entry per word:
#              UPPERCASED-WORD, file number, 800, line number, word number
# STDERR:  for every file that could be opened, a line "100<TAB>name"
#          followed by a line holding a single form feed.
#
# The usage notes above redirect STDERR into the file that is later fed to
# "openisis -write", so STDERR is a data stream here.  Nothing else may be
# printed to it; that is why "use warnings" is not enabled and why files
# that cannot be opened are skipped silently.
#
# NOTE(review): 100 and 800 look like record/field tags expected by
# openisis -- confirm against the openisis documentation.

my $max_lines = 255;    # only the first 255 non-blank lines of a file are indexed
my $fn        = 0;      # running number of the current file; counts opened files only

while ( my $f = <> ) {
        chomp $f;

        # Three-argument open: the name is always treated as a plain path.
        # With two-argument open a name beginning with '>' or ending in '|'
        # would be taken as a mode or a command, and surrounding whitespace
        # in the name would be stripped.
        open( my $fh, '<', $f ) or next;
        $fn++;
        print STDERR "100\t$f\n";

        my $line = 0;    # counts non-blank lines only
        while ( my $text = <$fh> ) {
                next if $text =~ /^\s*$/;
                last if $max_lines < ++$line;
                chomp $text;

                my $w = 0;    # position of the word within its line, starting at 1
                for my $word ( split /\W+/, $text ) {
                        # split yields an empty first field when the line starts
                        # with a non-word character.  Test the length rather than
                        # the truth value so that the token "0" is still indexed.
                        next unless length $word;
                        printf "%s\t%d\t%d\t%d\t%d\n", uc($word), $fn, 800, $line, ++$w;
                }
        }
        close $fh;
        print STDERR "\f\n";
}
1	#!/usr/bin/perl
2
3	# read files for fulltext index
4	# filelist is on stdin (e.g. by find)
5	# index entries go to stdout
6
7	# usage:
8	# find /foo -name \*.html | ./fulltext >/tmp/idx 2>/tmp/mst
9	# find /usr/share/doc -type f -a \! -name \*.htm\* | ./fulltext >/tmp/idx 2>/tmp/mst
10	# sort -o /tmp/idx /tmp/idx
11	# time ./openisis -write db/test/ft -stream -fmt mfn </tmp/mst
12	# time ./openisis -db db/test/ft -ifload 0 -v i </tmp/idx
13	# time ./openisis -db db/test/ft -ifchk -v i
14	# time ./openisis -db db/test/ft -search Descriptive -ifdump
15
16	$fn = 0;
17	while (<>) {
18	chomp;
19	$f = $_;
20	next unless open( F, $f );
21	$fn++;
22	print STDERR "100\t$f\n";
23	$line = 0;
24	while ( <F> ) {
25	next if /^\s*$/;
26	last if 255 < ++$line;
27	chomp;
28	$w = 0;
29	for $word (split /\W+/) {
30	next unless $word;
31	printf "%s\t%d\t%d\t%d\t%d\n", uc($word), $fn, 800, $line, ++$w;
32	}
33	}
34	print STDERR "\f\n";
35	}