--- trunk/bfilter.pl 2004/09/07 08:33:53 1 +++ trunk/bfilter.pl 2004/09/10 12:16:21 10 @@ -7,7 +7,14 @@ # maximum entries my $max = 0; # minimum letters to search by -my $min_len = 2; +my $min_len = 3; +# if more than x elements, warn to increase min_len +my $increase_at = 500; + +# name of generated index +my $headlines = 'headlines'; + +my $debug = 1; sub print_file { my $f = shift || return; @@ -19,21 +26,38 @@ } print qq{ -var headlines = Array(); +var $headlines = new Object(); }; my @part_arr; my $last_part = ''; my $total = 0; +my $max_elements = 0; + while() { chomp; - # escape single quote - s/'/\\'/g; + if (!m/\t/ || m/\t$/) { + print STDERR "SKIP '$_': no tab\n"; + next; + } + + my ($path,$headline) = split(/\t+/,$_,2); + + if (length($headline) < $min_len) { + print STDERR "SKIP '$_': too short\n"; + next; + } + # split into min_len part and rest - my ($part,$rest) = ( substr($_,0,$min_len), substr($_,$min_len) ); + my ($part,$rest) = ( substr($headline,0,$min_len), substr($headline,$min_len) ); + + # escape special chars + $part =~ s/(['\\])/\\$1/g && print STDERR "ESCAPED part '$part'\n"; + $rest =~ s/(['\\])/\\$1/g && print STDERR "ESCAPED rest '$rest'\n"; + $headline =~ s/(['\\])/\\$1/g; # make part lowercase $part = lc($part); @@ -42,17 +66,25 @@ # new part? if ($part ne $last_part) { - print "headlines['$last_part'] = Array(\n ",join(",\n ",@part_arr),");\n" if (@part_arr); + print STDERR $last_part,"\t",$#part_arr+1,"\n" if ($debug && $#part_arr > $increase_at); + $max_elements = $#part_arr if ($#part_arr > $max_elements); + print "${headlines}['$last_part'] = [\n ",join(",\n ",@part_arr),"];\n" if (@part_arr); $total += $#part_arr; @part_arr = (); $last_part = $part; } - push @part_arr, "'$_'"; + push @part_arr, "['$path','$headline']"; # break out? last if ($max && $total > $max); } -print "headlines['$last_part'] = Array(\n ",join(",\n ",@part_arr),");\n" if (@part_arr); -print "var min_len = $min_len;\n"; -print "// index elements: $total\n"; +print "${headlines}['$last_part'] = [\n ",join(",\n ",@part_arr),"];\n" if (@part_arr); +print qq{ + +${headlines}.min_len = $min_len; +${headlines}.length = $total; + +}; + +print STDERR "You have more than $increase_at elements, so you should\nincrease min_len to ",$min_len+1," or higher for performance benefit.\n" if ($max_elements > $increase_at);