bfilter/trunk/bfilter.pl

#!/usr/bin/perl -w
#

use strict;
use locale;

# maximum entries; 0 disables the limit
my $max = 0;
# minimum letters to search by: first command line argument, default 3
my $min_len = shift @ARGV;
$min_len = 3 unless defined($min_len);
# $min_len is used as a substr() length for the index prefix and is
# written verbatim into the generated JavaScript, so anything that is
# not a positive integer would silently produce a broken index
die "usage: $0 [min_len]\nmin_len must be a positive integer, got '$min_len'\n"
        unless ($min_len =~ m/\A[1-9][0-9]*\z/);
# if more than x elements, warn to increase min_len
my $increase_at = 500;

# name of generated index (used as the JavaScript variable name)
my $headlines = 'headlines';

# when true, oversized groups are reported on STDERR while generating
my $debug = 1;

# Copy the contents of a file to the currently selected output handle
# (STDOUT unless changed with select()).
#
# Parameters:
#   $f - path of the file to print; nothing happens when it is
#        undefined or an empty string
#
# Returns the result of close() on success, nothing when no file name
# was given. Dies with "$f: $!" when the file cannot be opened.
sub print_file {
        my $f = shift;
        # test for "no name given" explicitly so that a file called
        # "0" is not silently ignored
        return unless (defined($f) && length($f));
        # three-argument open: the name is never interpreted as a mode
        # or a pipe ("cmd |"), and the lexical handle cannot collide
        # with a global F used elsewhere
        open(my $fh, '<', $f) || die "$f: $!";
        # read into a lexical so the caller's $_ is left untouched
        while (defined(my $line = <$fh>)) {
                print $line;
        }
        return close($fh);
}

# start the generated JavaScript: declare the (still empty) index object
print "\nvar $headlines = new Object();\n";

# state carried from one input record to the next while grouping:
#   $total        - running entry count, later written to <index>.length
#   $max_elements - size of the largest group seen, drives the final warning
#   $last_part    - prefix of the group currently being collected
#   @part_arr     - JavaScript array literals collected for that group
my ($total, $max_elements) = (0, 0);
my $last_part = '';
my @part_arr;

# Turn a single value into the text of a JavaScript literal.
#
# Parameters:
#   $t - the value to convert
#
# Returns:
#   - the bare word 'undef' for an undefined or empty value (kept as
#     in the original output format; presumably the consuming script
#     knows that name - verify before changing it)
#   - the value itself for a canonical non-negative integer
#   - otherwise the value wrapped in single quotes, with backslash,
#     single quote, CR and LF escaped
#
# Every string that needed escaping is reported on STDERR.
sub escape_js {
        my $t = shift;
        # check definedness/length instead of truth, so that a plain 0
        # is emitted as 0 rather than as 'undef'
        return 'undef' unless (defined($t) && length($t));
        # Only canonical integers become number literals: a leading
        # zero ("007") would be read as octal or rejected by
        # JavaScript, and more than 15 digits cannot be represented
        # exactly, so such values are emitted as strings instead.
        # \z (not $) so a value ending in a newline is never mistaken
        # for a number.
        return $t if ($t =~ m/\A(?:0|[1-9][0-9]{0,14})\z/);
        # escape single quote and backslash, plus line terminators
        # which are not allowed inside a JavaScript string literal
        my $changed = 0;
        $changed++ if ($t =~ s/(['\\])/\\$1/g);
        $changed++ if ($t =~ s/\n/\\n/g);
        $changed++ if ($t =~ s/\r/\\r/g);
        print STDERR "ESCAPED '$t'\n" if ($changed);
        return "'$t'";
}

# ---------------------------------------------------------------------
# Main program.
#
# Reads records of the form "headline<TAB>field[<TAB>field...]" from
# STDIN, sorts them, groups them by the lowercased first $min_len
# characters of the headline and prints one JavaScript array per group:
#
#   <index>['<prefix>'] = [
#    ['headline',field,...],
#    ...];
#
# followed by <index>.min_len and <index>.length (total entry count).
# Skipped records and diagnostics go to STDERR.
# ---------------------------------------------------------------------

my @lines;

while (defined(my $line = <STDIN>)) {
        chomp $line;
        # drop the CR left over from CRLF input; a raw CR would end up
        # inside the last field of the record
        $line =~ s/\r\z//;

        # remove leading spaces (which are ignored if source list was
        # sorted using locale). This has to happen before the tab
        # check: \s also matches a leading tab, and a record whose
        # only tab was a leading one has no data fields left.
        $line =~ s/^\s+//;

        # a usable record has a tab with something after the last one
        if ($line !~ m/\t/ || $line =~ m/\t\z/) {
                print STDERR "SKIP '$line': no tab\n";
                next;
        }

        push @lines, $line;
}

# spaces will be ignored when sorting using locale. That's why we have
# cache of lines with spaces replaced by exclamation mark (!) so that
# sort order is strict and not dictionary. For more info, see:
# http://archives.postgresql.org/pgsql-sql/2002-04/msg00266.php
# http://groups.google.com/groups?selm=handler.82819.D82819.99045085113033.ackdone%40bugs.debian.org&output=gplain

my %locale_space_fix;

# build every sort key once up front instead of testing the cache on
# each comparison
foreach my $line (@lines) {
        next if (exists $locale_space_fix{$line});
        (my $key = $line) =~ s/ /!/g;
        $locale_space_fix{$line} = lc($key);
}

my @sorted = sort { $locale_space_fix{$a} cmp $locale_space_fix{$b} } @lines;

# Emit the group collected in @part_arr (if any) under $last_part and
# account for it. Used for every completed group including the last
# one, so $total and $max_elements always cover all emitted entries.
my $flush_part = sub {
        return unless (@part_arr);
        # real number of elements; $#part_arr would be one too few
        my $count = scalar(@part_arr);
        print STDERR $last_part,"\t",$count,"\n" if ($debug && $count > $increase_at);
        $max_elements = $count if ($count > $max_elements);
        print "${headlines}[",escape_js($last_part),"] = [\n ",join(",\n ",@part_arr),"];\n";
        $total += $count;
        @part_arr = ();
        return;
};

foreach my $line (@sorted) {

        my @data = split(/\t+/,$line);

        my $headline = shift @data;
        # explicit test, so that a headline of "0" is not treated as missing
        die "need at least headline!" unless (defined($headline) && length($headline));

        if (length($headline) < $min_len) {
                print STDERR "SKIP '$line': too short\n";
                next;
        }

        # lowercased first min_len characters select the group
        my $part = lc(substr($headline,0,$min_len));

        # new part? (an empty @part_arr means no group is open yet, so
        # there is nothing to compare against)
        if (@part_arr && $part ne $last_part) {
                $flush_part->();
                # break out? Checked at the group boundary so that no
                # truncated one-entry group is written after the limit
                last if ($max && $total > $max);
        }
        $last_part = $part;

        push @part_arr, "[".escape_js($headline).",".join(",",map { escape_js($_) } @data)."]";
}

# write out the group that was still being collected
$flush_part->();

print qq{

${headlines}.min_len = $min_len;
${headlines}.length = $total;

};

print STDERR "You have more than $increase_at elements, so you should\nincrease min_len to ",$min_len+1," or higher for performance benefit.\n" if ($max_elements > $increase_at);
1	#!/usr/bin/perl -w
2	#
3
4	use strict;
5	use locale;
6
7	# maximum entries
8	my $max = 0;
9	# minimum letters to search by
10	my $min_len = shift @ARGV;
11	$min_len = 3 unless defined($min_len);
12	# if more than x elements, warn to increase min_len
13	my $increase_at = 500;
14
15	# name of generated index
16	my $headlines = 'headlines';
17
18	my $debug = 1;
19
20	sub print_file {
21	my $f = shift \|\| return;
22	open(F, $f) \|\| die "$f: $!";
23	while(<F>) {
24	print;
25	}
26	close(F);
27	}
28
29	print qq{
30	var $headlines = new Object();
31	};
32
33	my @part_arr;
34	my $last_part = '';
35	my $total = 0;
36
37	my $max_elements = 0;
38
39	sub escape_js {
40	my $t = shift \|\| return 'undef';
41	# escape single quote and backspace
42	$t =~ s/(['\\])/\\$1/g && print STDERR "ESCAPED '$t'\n";
43	# quote string if not number
44	$t = "'$t'" unless ($t =~ m/^\d+$/);
45	return $t;
46	}
47
48	my @lines;
49
50	while(<STDIN>) {
51	chomp;
52
53	if (!m/\t/ \|\| m/\t$/) {
54	print STDERR "SKIP '$_': no tab\n";
55	next;
56	}
57
58	# remove leading spaces (which are ignored if source list was
59	# sorted using locale)
60	s/^\s+//;
61
62	push @lines, $_;
63	}
64
65	# spaces will be ignored when sorting using locale. That's why we have
66	# cache of lines with spaces replaced by exclamation mark (!) so that
67	# sort order is strict and not dictionary. For more info, see:
68	# http://archives.postgresql.org/pgsql-sql/2002-04/msg00266.php
69	# http://groups.google.com/groups?selm=handler.82819.D82819.99045085113033.ackdone%40bugs.debian.org&output=gplain
70
71	my %locale_space_fix;
72
73	foreach (sort {
74	unless($locale_space_fix{$a}) {
75	my $tmp = $a;
76	$tmp =~ s/ /!/g;
77	$locale_space_fix{$a} = lc($tmp);
78	}
79	unless($locale_space_fix{$b}) {
80	my $tmp = $b;
81	$tmp =~ s/ /!/g;
82	$locale_space_fix{$b} = lc($tmp);
83	}
84	$locale_space_fix{$a} cmp $locale_space_fix{$b};
85	} @lines) {
86
87	my @data = split(/\t+/,$_);
88
89	my $headline = shift @data \|\| die "need at least headline!";
90
91	if (length($headline) < $min_len) {
92	print STDERR "SKIP '$_': too short\n";
93	next;
94	}
95
96
97	# split into min_len part and rest
98	my ($part,$rest) = ( substr($headline,0,$min_len), substr($headline,$min_len) );
99
100	# make part lowercase
101	$part = lc($part);
102
103	$last_part = $part if (! $last_part);
104
105	# new part?
106	if ($part ne $last_part) {
107	print STDERR $last_part,"\t",$#part_arr+1,"\n" if ($debug && $#part_arr > $increase_at);
108	$max_elements = $#part_arr if ($#part_arr > $max_elements);
109	print "${headlines}[",escape_js($last_part),"] = [\n ",join(",\n ",@part_arr),"];\n" if (@part_arr);
110	$total += $#part_arr;
111	@part_arr = ();
112	$last_part = $part;
113	}
114	push @part_arr, "[".escape_js($headline).",".join(",",map { escape_js($_) } @data)."]";
115
116	# break out?
117	last if ($max && $total > $max);
118	}
119
120	print "${headlines}[",escape_js($last_part)."] = [\n ",join(",\n ",@part_arr),"];\n" if (@part_arr);
121	print qq{
122
123	${headlines}.min_len = $min_len;
124	${headlines}.length = $total;
125
126	};
127
128	print STDERR "You have more than $increase_at elements, so you should\nincrease min_len to ",$min_len+1," or higher for performance benefit.\n" if ($max_elements > $increase_at);