/[jsFind]/trunk/jsFind.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/jsFind.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 5 by dpavlin, Tue Jul 20 17:47:30 2004 UTC revision 12 by dpavlin, Sat Aug 28 14:31:58 2004 UTC
# Line 3  package jsFind; Line 3  package jsFind;
3  use 5.008004;  use 5.008004;
4  use strict;  use strict;
5  use warnings;  use warnings;
6    use HTML::Entities;
7    
8  our $VERSION = '0.01';  our $VERSION = '0.03';
9    
10    use Exporter 'import';
11    use Carp;
12    
13    our @ISA = qw(Exporter);
14    
15    BEGIN {
16            import 'jsFind::Node';
17    }
18    
19  =head1 NAME  =head1 NAME
20    
# Line 13  jsFind - generate index for jsFind using Line 23  jsFind - generate index for jsFind using
23  =head1 SYNOPSIS  =head1 SYNOPSIS
24    
25    use jsFind;    use jsFind;
26      my $t = new jsFind(B => 4);
27      my $f = 1;
28      foreach my $k (qw{minima ut dolorem sapiente voluptatem}) {
29            $t->B_search(Key => $k,
30                    Data => {
31                            "path" => {
32                            t => "word $k",
33                            f => $f },
34                    },
35                    Insert => 1,
36                    Append => 1,
37            );
38      }
39    
40  =head1 DESCRIPTION  =head1 DESCRIPTION
41    
# Line 36  You can programatically (and incremental Line 57  You can programatically (and incremental
57    
58  =back  =back
59    
60  =head1 METHODS  =head1 jsFind methods
   
 This module contains two packages C<jsFind> and C<jsFind::Node>.  
   
 =head2 jsFind methods  
   
 =cut  
   
 use Exporter 'import';  
 use Carp;  
61    
62  our @ISA = qw(Exporter);  C<jsFind> is mode implementing methods which you, the user, are going to
63    use to create indexes.
64    
65  BEGIN {  =head2 new
         import 'jsFind::Node';  
 }  
   
 =head3 new  
66    
67  Create new tree. Arguments are C<B> which is maximum numbers of keys in  Create new tree. Arguments are C<B> which is maximum numbers of keys in
68  each node and optional C<Root> node. Each root node may have child nodes.  each node and optional C<Root> node. Each root node may have child nodes.
# Line 82  sub new { Line 91  sub new {
91    bless { B => $B, Root => $Root } => $package;    bless { B => $B, Root => $Root } => $package;
92  }  }
93    
94  =head3 B_search  =head2 B_search
95    
96  Search, insert, append or replace data in B-Tree  Search, insert, append or replace data in B-Tree
97    
# Line 91  Search, insert, append or replace data i Line 100  Search, insert, append or replace data i
100          Data => { "path" => {          Data => { "path" => {
101                          "t" => "title of document",                          "t" => "title of document",
102                          "f" => 99,                          "f" => 99,
103                            },
104                  },                  },
105          Insert => 1,          Insert => 1,
106          Append => 1,          Append => 1,
# Line 223  sub split_and_promote { Line 233  sub split_and_promote {
233    }    }
234  }  }
235    
236  =head3 B  =head2 B
237    
238  Return B (maximum number of keys)  Return B (maximum number of keys)
239    
# Line 235  sub B { Line 245  sub B {
245    $_[0]{B};    $_[0]{B};
246  }  }
247    
248  =head3 root  =head2 root
249    
250  Returns root node  Returns root node
251    
# Line 249  sub root { Line 259  sub root {
259    $self->{Root};    $self->{Root};
260  }  }
261    
262  =head3 node_overfull  =head2 node_overfull
263    
264  Returns if node is overfull  Returns if node is overfull
265    
# Line 263  sub node_overfull { Line 273  sub node_overfull {
273    $node->size > $self->B;    $node->size > $self->B;
274  }  }
275    
276  =head3 to_string  =head2 to_string
277    
278  Returns your tree as formatted string.  Returns your tree as formatted string.
279    
# Line 277  sub to_string { Line 287  sub to_string {
287    $_[0]->root->to_string;    $_[0]->root->to_string;
288  }  }
289    
290  =head3 to_dot  =head2 to_dot
291    
292  Create Graphviz graph of your tree  Create Graphviz graph of your tree
293    
# Line 295  sub to_dot { Line 305  sub to_dot {
305          return $dot;          return $dot;
306  }  }
307    
308  =head3 to_jsfind  =head2 to_jsfind
309    
310  Create xml index files for jsFind. This should be called after  Create xml index files for jsFind. This should be called after
311  your B-Tree has been filled with data.  your B-Tree has been filled with data.
# Line 304  your B-Tree has been filled with data. Line 314  your B-Tree has been filled with data.
314    
315  Returns number of nodes in created tree.  Returns number of nodes in created tree.
316    
317    There is also longer version if you want to recode your data charset
318    into different one (probably UTF-8):
319    
320     $root->to_jsfind('/full/path/to/index/dir/','ISO-8859-2','UTF-8');
321    
322    Destination encoding is UTF-8 by default, so you don't have to specify it.
323    
324     $root->to_jsfind('/full/path/to/index/dir/','WINDOWS-1250');
325    
326  =cut  =cut
327    
328    my $iconv;
329    my $iconv_l1;
330    
331  sub to_jsfind {  sub to_jsfind {
332          my $self = shift;          my $self = shift;
333    
334          my $path = shift || confess "to_jsfind need path to your index!";          my $path = shift || confess "to_jsfind need path to your index!";
335    
336            my ($from_cp,$to_cp) = @_;
337    
338            $to_cp ||= 'UTF-8';
339    
340            if ($from_cp && $to_cp) {
341                    $iconv = Text::Iconv->new($from_cp,$to_cp);
342            }
343            $iconv_l1 = Text::Iconv->new('ISO-8859-1',$to_cp);
344    
345          $path .= "/" if ($path =~ /\/$/);          $path .= "/" if ($path =~ /\/$/);
346          carp "can't create index in '$path': $!" if (! -w $path);          #carp "creating directory for index '$path'" if (! -w $path);
347    
348          return $self->root->to_jsfind($path,"0");          return $self->root->to_jsfind($path,"0");
349  }  }
# Line 323  sub default_cmp { Line 354  sub default_cmp {
354    $_[0] cmp $_[1];    $_[0] cmp $_[1];
355  }  }
356    
357    =head2 _recode
358    
359    This is internal function to recode charset.
360    
361    It will also try to decode entities in data using L<HTML::Entities>.
362    
363    =cut
364    
365    sub _recode {
366            my $self = shift;
367            my $text = shift || return;
368    
369            sub _decode_html_entities {
370                    my $data = shift || return;
371                    $data = $iconv_l1->convert(decode_entities($data)) || croak "entity decode problem: $data";
372            }
373    
374            if ($iconv) {
375                    $text = $iconv->convert($text) || $text && carp "convert problem: $text";
376                    $text =~ s/(\&\w+;)/_decode_html_entities($1)/ges;
377            }
378    
379            return $text;
380    }
381    
382  #####################################################################  #####################################################################
383    
384  =head2 jsFind::Node methods  =head1 jsFind::Node methods
385    
386  Each node has C<k> key-data pairs, with C<B> <= C<k> <= C<2B>, and  Each node has C<k> key-data pairs, with C<B> <= C<k> <= C<2B>, and
387  each has C<k+1> subnodes, which might be null.  each has C<k+1> subnodes, which might be null.
# Line 347  use strict; Line 403  use strict;
403    
404  use Carp;  use Carp;
405  use File::Path;  use File::Path;
406    use Text::Iconv;
407    
408    use base 'jsFind';
409    
410  my $KEYS = 0;  my $KEYS = 0;
411  my $DATA = 1;  my $DATA = 1;
412  my $SUBNODES = 2;  my $SUBNODES = 2;
413    
414  =head3 new  =head2 new
415    
416  Create New node  Create New node
417    
# Line 372  sub new { Line 431  sub new {
431    bless [@_] => $package;    bless [@_] => $package;
432  }  }
433    
434  =head3 locate_key  =head2 locate_key
435    
436  Locate key in node using linear search. This should probably be replaced  Locate key in node using linear search. This should probably be replaced
437  by binary search for better performance.  by binary search for better performance.
# Line 409  sub locate_key { Line 468  sub locate_key {
468  }  }
469    
470    
471  =head3 emptynode  =head2 emptynode
472    
473  Creates new empty node  Creates new empty node
474    
# Line 422  sub emptynode { Line 481  sub emptynode {
481    new($_[0]);                   # Pass package name, but not anything else.    new($_[0]);                   # Pass package name, but not anything else.
482  }  }
483    
484  =head3 is_empty  =head2 is_empty
485    
486  Test if node is empty  Test if node is empty
487    
# Line 436  sub is_empty { Line 495  sub is_empty {
495    !defined($self) || $#$self < 0;    !defined($self) || $#$self < 0;
496  }  }
497    
498  =head3 key  =head2 key
499    
500  Return C<$i>th key from node  Return C<$i>th key from node
501    
# Line 452  sub key { Line 511  sub key {
511     $_[0]->[$KEYS][$_[1]];     $_[0]->[$KEYS][$_[1]];
512  }  }
513    
514  =head3 data  =head2 data
515    
516  Return C<$i>th data from node  Return C<$i>th data from node
517    
# Line 465  sub data { Line 524  sub data {
524    $self->[$DATA][$n];    $self->[$DATA][$n];
525  }  }
526    
527  =head3 kdp_replace  =head2 kdp_replace
528    
529  Set key data pair for C<$i>th element in node  Set key data pair for C<$i>th element in node
530    
# Line 486  sub kdp_replace { Line 545  sub kdp_replace {
545     $self->[$DATA][$n]];     $self->[$DATA][$n]];
546  }  }
547    
548  =head3 kdp_insert  =head2 kdp_insert
549    
550    Insert key/data pair in tree
551    
552      $node->kdp_insert("key value" => "data value");
553    
554   # No return value.  No return value.
555    
556  =cut  =cut
557    
# Line 507  sub kdp_insert { Line 570  sub kdp_insert {
570    splice(@{$self->[$SUBNODES]}, $where, 0, undef);    splice(@{$self->[$SUBNODES]}, $where, 0, undef);
571  }  }
572    
573  =head3 kdp_append  =head2 kdp_append
574    
575  Adds new data keys and values to C<$i>th element in node  Adds new data keys and values to C<$i>th element in node
576    
# Line 528  sub kdp_append { Line 591  sub kdp_append {
591     $self->[$DATA][$n]];     $self->[$DATA][$n]];
592  }  }
593    
594  =head3 subnode  =head2 subnode
595    
596  Set new or return existing subnode  Set new or return existing subnode
597    
# Line 546  sub subnode { Line 609  sub subnode {
609    $self->[$SUBNODES][$n];    $self->[$SUBNODES][$n];
610  }  }
611    
612  =head3 is_leaf  =head2 is_leaf
613    
614  Test if node is leaf  Test if node is leaf
615    
# Line 559  sub is_leaf { Line 622  sub is_leaf {
622    ! defined $self->[$SUBNODES][0]; # undefined subnode means leaf node.    ! defined $self->[$SUBNODES][0]; # undefined subnode means leaf node.
623  }  }
624    
625  =head3 size  =head2 size
626    
627  Return number of keys in the node  Return number of keys in the node
628    
# Line 572  sub size { Line 635  sub size {
635    return scalar(@{$self->[$KEYS]});    return scalar(@{$self->[$KEYS]});
636  }  }
637    
638  =head3 halves  =head2 halves
639    
640   # Accept an index $n  Split node into two halves so that keys C<0 .. $n-1> are in one node
641   # Divide into two nodes so that keys 0 .. $n-1 are in one node  and keys C<$n+1 ... $size> are in the other.
642   # and keys $n+1 ... $size are in the other.  
643      my ($left_node, $right_node, $kdp) = $node->halves($n);
644    
645  =cut  =cut
646    
# Line 600  sub halves { Line 664  sub halves {
664    ($self->new(@left), $self->new(@right), \@middle);    ($self->new(@left), $self->new(@right), \@middle);
665  }  }
666    
667  =head3 to_string  =head2 to_string
668    
669  Dumps tree as string  Dumps tree as string
670    
# Line 651  sub to_string { Line 715  sub to_string {
715    
716  =end comment  =end comment
717    
718  =head3 to_dot  =head2 to_dot
719    
720  Recursivly walk nodes of tree  Recursivly walk nodes of tree
721    
# Line 690  sub to_dot { Line 754  sub to_dot {
754          $dot;          $dot;
755  }  }
756    
757  =head3 to_jsfind  =head2 to_xml
758    
759    Escape <, >, & and ", and to produce valid XML
760    
761    =cut
762    
763    my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
764    my $escape_re  = join '|' => keys %escape;
765    
766    sub to_xml {
767            my $self = shift || confess "you should call to_xml as object!";
768    
769            my $d = shift || return;
770            $d = $self->SUPER::_recode($d);
771            confess "escape_re undefined!" unless ($escape_re);
772            $d =~ s/($escape_re)/$escape{$1}/g;
773            return $d;
774    }
775    
776    =head2 to_jsfind
777    
778  Create jsFind xml files  Create jsFind xml files
779    
780   my $nr=$tree->to_dot('/path/to/index','0');   my $nr=$tree->to_jsfind('/path/to/index','0');
781    
782  Returns number of elements created  Returns number of elements created
783    
784  =cut  =cut
785    
786    
787  sub to_jsfind {  sub to_jsfind {
788          my $self = shift;          my $self = shift;
789          my ($path,$file) = @_;          my ($path,$file) = @_;
790    
791          return 0 if $self->is_empty;          return 0 if $self->is_empty;
792    
793            confess("path is undefined.") unless ($path);
794            confess("file is undefined. Did you call \$t->root->to_jsfind(..) instead of \$t->to_jsfind(..) ?") unless (defined($file));
795    
796          my $nr_keys = 0;          my $nr_keys = 0;
797    
798          my ($k, $d, $s) = @$self;          my ($k, $d, $s) = @$self;
# Line 717  sub to_jsfind { Line 804  sub to_jsfind {
804                  my $key = lc($k->[$i]);                  my $key = lc($k->[$i]);
805    
806                  if ($key) {                  if ($key) {
807                          $key_xml .= qq{<k>$key</k>};                          $key_xml .= '<k>'.$self->to_xml($key).'</k>';
808                          $data_xml .= qq{<e>};                          $data_xml .= '<e>';
809          #use Data::Dumper;          #use Data::Dumper;
810          #print Dumper($d->[$i]);          #print Dumper($d->[$i]);
811                          foreach my $path (keys %{$d->[$i]}) {                          foreach my $path (keys %{$d->[$i]}) {
812                                  $data_xml .= '<l f="'.($d->[$i]->{$path}->{'f'} || 1).'" t="'.($d->[$i]->{$path}->{'t'} || 'no title').'">'.$path.'</l>';                                  $data_xml .= '<l f="'.($d->[$i]->{$path}->{'f'} || 1).'" t="'.$self->to_xml($d->[$i]->{$path}->{'t'} || 'no title').'">'.$self->to_xml($path).'</l>';
813                                  $nr_keys++;                                  $nr_keys++;
814                          }                          }
815                          $data_xml .= qq{</e>};                          $data_xml .= '</e>';
816                  }                  }
817    
818                  $nr_keys += $s->[$i]->to_jsfind("$path/$file","$i") if ($s->[$i]);                  $nr_keys += $s->[$i]->to_jsfind("$path/$file","$i") if ($s->[$i]);
819          }          }
820    
821          $key_xml .= "</n>";          $key_xml .= '</n>';
822          $data_xml .= "</d>";          $data_xml .= '</d>';
823    
824          if (! -e $path) {          if (! -e $path) {
825                  mkpath($path) || croak "can't create dir '$path': $!";                  mkpath($path) || croak "can't create dir '$path': $!";

Legend:
Removed from v.5  
changed lines
  Added in v.12

  ViewVC Help
Powered by ViewVC 1.1.26