/[jsFind]/trunk/jsFind.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/jsFind.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 8 by dpavlin, Wed Jul 21 15:44:15 2004 UTC revision 33 by dpavlin, Sun Oct 10 05:10:25 2004 UTC
# Line 1  Line 1 
1  package jsFind;  package jsFind;
2    
3  use 5.008004;  use 5.005;
4  use strict;  use strict;
5  use warnings;  use warnings;
6    use HTML::Entities;
7    
8  our $VERSION = '0.01';  our $VERSION = '0.05';
9    
10    use Exporter 'import';
11    use Carp;
12    
13    our @ISA = qw(Exporter);
14    
15    BEGIN {
16            import 'jsFind::Node';
17    }
18    
19  =head1 NAME  =head1 NAME
20    
21  jsFind - generate index for jsFind using B-Tree  jsFind - generate index for full text search engine in JavaScript
22    
23  =head1 SYNOPSIS  =head1 SYNOPSIS
24    
25    use jsFind;    use jsFind;
26      my $t = new jsFind(B => 4);
27      my $f = 1;
28      foreach my $k (qw{minima ut dolorem sapiente voluptatem}) {
29            $t->B_search(Key => $k,
30                    Data => {
31                            "path" => {
32                            t => "word $k",
33                            f => $f },
34                    },
35                    Insert => 1,
36                    Append => 1,
37            );
38      }
39    
40  =head1 DESCRIPTION  =head1 DESCRIPTION
41    
# Line 32  You don't need to use swish-e to create Line 53  You don't need to use swish-e to create
53    
54  =item *  =item *
55    
56  You can programatically (and incrementaly) create index for jsFind  you can programatically (and incrementaly) create index for jsFind
57    
58    =item *
59    
60    you can create more than one index and search them using same C<search.html>
61    page
62    
63  =back  =back
64    
65  =head1 METHODS  You can also examine examples which come as tests with this module,
66    for example C<t/04words.t> or C<t/10homer.t>.
67    
68  This module contains two packages C<jsFind> and C<jsFind::Node>.  =head2 jsFind
69    
70  =head2 jsFind methods  jsFind search engine was written by Shawn Garbett from eLucid Software.
71    The search engine itself is a small piece of JavaScript (1.2 with level 2
72    DOM). It is easily customizable to fit into a current set of HTML. This
73    JavaScript searches an XML index dataset for the appropriate links, and can
74    filter and sort the results.
75    
76    JavaScript code distributed with this module is based on version 0.0.3 which
77    was current when this module development started. Various changes where done
78    on JavaScript code to fix bugs, add features and remove warnings. For
79    complete list see C<Changes> file which comes with distribution.
80    
81  =cut  This module has been tested using C<html/test.html> with following browsers:
82    
83  use Exporter 'import';  =over 5
 use Carp;  
84    
85  our @ISA = qw(Exporter);  =item Mozilla FireFox 0.8 to 1.0
86    
87  BEGIN {  using DOM 2 C<document.implementation.createDocument>
88          import 'jsFind::Node';  
89  }  =item Internet Explorer 5.5 and 6.0
90    
91    using ActiveX C<Microsoft.XMLDOM> or C<MSXML2.DOMDocument>
92    
93    =item Konqueror 3.3
94    
95    using DOM 2 C<document.implementation.createDocument>
96    
97    =item Opera 7.54 (without Java)
98    
99    using experimental iframe implementation which is much slower than other methods.
100    
101    =back
102    
103    If searching doesn't work for your combination of operating system and
104    browser, please open C<html/test.html> file and wait a while. It will search sample
105    file included with distribution and report results. Reports with included
106    test debugging are welcomed.
107    
108    =head1 jsFind methods
109    
110  =head3 new  C<jsFind> is mode implementing methods which you, the user, are going to
111    use to create indexes.
112    
113    =head2 new
114    
115  Create new tree. Arguments are C<B> which is maximum numbers of keys in  Create new tree. Arguments are C<B> which is maximum numbers of keys in
116  each node and optional C<Root> node. Each root node may have child nodes.  each node and optional C<Root> node. Each root node may have child nodes.
# Line 82  sub new { Line 139  sub new {
139    bless { B => $B, Root => $Root } => $package;    bless { B => $B, Root => $Root } => $package;
140  }  }
141    
142  =head3 B_search  =head2 B_search
143    
144  Search, insert, append or replace data in B-Tree  Search, insert, append or replace data in B-Tree
145    
# Line 224  sub split_and_promote { Line 281  sub split_and_promote {
281    }    }
282  }  }
283    
284  =head3 B  =head2 B
285    
286  Return B (maximum number of keys)  Return B (maximum number of keys)
287    
# Line 236  sub B { Line 293  sub B {
293    $_[0]{B};    $_[0]{B};
294  }  }
295    
296  =head3 root  =head2 root
297    
298  Returns root node  Returns root node
299    
# Line 250  sub root { Line 307  sub root {
307    $self->{Root};    $self->{Root};
308  }  }
309    
310  =head3 node_overfull  =head2 node_overfull
311    
312  Returns if node is overfull  Returns if node is overfull
313    
# Line 264  sub node_overfull { Line 321  sub node_overfull {
321    $node->size > $self->B;    $node->size > $self->B;
322  }  }
323    
324  =head3 to_string  =head2 to_string
325    
326  Returns your tree as formatted string.  Returns your tree as formatted string.
327    
# Line 278  sub to_string { Line 335  sub to_string {
335    $_[0]->root->to_string;    $_[0]->root->to_string;
336  }  }
337    
338  =head3 to_dot  =head2 to_dot
339    
340  Create Graphviz graph of your tree  Create Graphviz graph of your tree
341    
# Line 296  sub to_dot { Line 353  sub to_dot {
353          return $dot;          return $dot;
354  }  }
355    
356  =head3 to_jsfind  =head2 to_jsfind
357    
358  Create xml index files for jsFind. This should be called after  Create xml index files for jsFind. This should be called after
359  your B-Tree has been filled with data.  your B-Tree has been filled with data.
# Line 305  your B-Tree has been filled with data. Line 362  your B-Tree has been filled with data.
362    
363  Returns number of nodes in created tree.  Returns number of nodes in created tree.
364    
365    There is also longer version if you want to recode your data charset
366    into different one (probably UTF-8):
367    
368     $root->to_jsfind('/full/path/to/index/dir/','ISO-8859-2','UTF-8');
369    
370    Destination encoding is UTF-8 by default, so you don't have to specify it.
371    
372     $root->to_jsfind('/full/path/to/index/dir/','WINDOWS-1250');
373    
374  =cut  =cut
375    
376    my $iconv;
377    my $iconv_l1;
378    
379  sub to_jsfind {  sub to_jsfind {
380          my $self = shift;          my $self = shift;
381    
382          my $path = shift || confess "to_jsfind need path to your index!";          my $path = shift || confess "to_jsfind need path to your index!";
383    
384            my ($from_cp,$to_cp) = @_;
385    
386            $to_cp ||= 'UTF-8';
387    
388            if ($from_cp && $to_cp) {
389                    $iconv = Text::Iconv->new($from_cp,$to_cp);
390            }
391            $iconv_l1 = Text::Iconv->new('ISO-8859-1',$to_cp);
392    
393          $path .= "/" if ($path =~ /\/$/);          $path .= "/" if ($path =~ /\/$/);
394          carp "create directory for index '$path': $!" if (! -w $path);          #carp "creating directory for index '$path'" if (! -w $path);
395    
396          return $self->root->to_jsfind($path,"0");          return $self->root->to_jsfind($path,"0");
397  }  }
# Line 324  sub default_cmp { Line 402  sub default_cmp {
402    $_[0] cmp $_[1];    $_[0] cmp $_[1];
403  }  }
404    
405    =head2 _recode
406    
407    This is internal function to recode charset.
408    
409    It will also try to decode entities in data using L<HTML::Entities>.
410    
411    =cut
412    
413    sub _recode {
414            my $self = shift;
415            my $text = shift || return;
416    
417            sub _decode_html_entities {
418                    my $data = shift || return;
419                    $data = $iconv_l1->convert(decode_entities($data)) || croak "entity decode problem: $data";
420            }
421    
422            if ($iconv) {
423                    $text = $iconv->convert($text) || $text && carp "convert problem: $text";
424                    $text =~ s/(\&\w+;)/_decode_html_entities($1)/ges;
425            }
426    
427            return $text;
428    }
429    
430  #####################################################################  #####################################################################
431    
432  =head2 jsFind::Node methods  =head1 jsFind::Node methods
433    
434  Each node has C<k> key-data pairs, with C<B> <= C<k> <= C<2B>, and  Each node has C<k> key-data pairs, with C<B> <= C<k> <= C<2B>, and
435  each has C<k+1> subnodes, which might be null.  each has C<k+1> subnodes, which might be null.
# Line 348  use strict; Line 451  use strict;
451    
452  use Carp;  use Carp;
453  use File::Path;  use File::Path;
454    use Text::Iconv;
455    use POSIX;
456    
457    use base 'jsFind';
458    
459  my $KEYS = 0;  my $KEYS = 0;
460  my $DATA = 1;  my $DATA = 1;
461  my $SUBNODES = 2;  my $SUBNODES = 2;
462    
463  =head3 new  =head2 new
464    
465  Create New node  Create New node
466    
# Line 373  sub new { Line 480  sub new {
480    bless [@_] => $package;    bless [@_] => $package;
481  }  }
482    
483  =head3 locate_key  =head2 locate_key
484    
485  Locate key in node using linear search. This should probably be replaced  Locate key in node using linear search. This should probably be replaced
486  by binary search for better performance.  by binary search for better performance.
# Line 410  sub locate_key { Line 517  sub locate_key {
517  }  }
518    
519    
520  =head3 emptynode  =head2 emptynode
521    
522  Creates new empty node  Creates new empty node
523    
# Line 423  sub emptynode { Line 530  sub emptynode {
530    new($_[0]);                   # Pass package name, but not anything else.    new($_[0]);                   # Pass package name, but not anything else.
531  }  }
532    
533  =head3 is_empty  =head2 is_empty
534    
535  Test if node is empty  Test if node is empty
536    
# Line 437  sub is_empty { Line 544  sub is_empty {
544    !defined($self) || $#$self < 0;    !defined($self) || $#$self < 0;
545  }  }
546    
547  =head3 key  =head2 key
548    
549  Return C<$i>th key from node  Return C<$i>th key from node
550    
# Line 453  sub key { Line 560  sub key {
560     $_[0]->[$KEYS][$_[1]];     $_[0]->[$KEYS][$_[1]];
561  }  }
562    
563  =head3 data  =head2 data
564    
565  Return C<$i>th data from node  Return C<$i>th data from node
566    
# Line 466  sub data { Line 573  sub data {
573    $self->[$DATA][$n];    $self->[$DATA][$n];
574  }  }
575    
576  =head3 kdp_replace  =head2 kdp_replace
577    
578  Set key data pair for C<$i>th element in node  Set key data pair for C<$i>th element in node
579    
# Line 487  sub kdp_replace { Line 594  sub kdp_replace {
594     $self->[$DATA][$n]];     $self->[$DATA][$n]];
595  }  }
596    
597  =head3 kdp_insert  =head2 kdp_insert
598    
599    Insert key/data pair in tree
600    
601   # No return value.    $node->kdp_insert("key value" => "data value");
602    
603    No return value.
604    
605  =cut  =cut
606    
# Line 508  sub kdp_insert { Line 619  sub kdp_insert {
619    splice(@{$self->[$SUBNODES]}, $where, 0, undef);    splice(@{$self->[$SUBNODES]}, $where, 0, undef);
620  }  }
621    
622  =head3 kdp_append  =head2 kdp_append
623    
624  Adds new data keys and values to C<$i>th element in node  Adds new data keys and values to C<$i>th element in node
625    
# Line 529  sub kdp_append { Line 640  sub kdp_append {
640     $self->[$DATA][$n]];     $self->[$DATA][$n]];
641  }  }
642    
643  =head3 subnode  =head2 subnode
644    
645  Set new or return existing subnode  Set new or return existing subnode
646    
# Line 547  sub subnode { Line 658  sub subnode {
658    $self->[$SUBNODES][$n];    $self->[$SUBNODES][$n];
659  }  }
660    
661  =head3 is_leaf  =head2 is_leaf
662    
663  Test if node is leaf  Test if node is leaf
664    
# Line 560  sub is_leaf { Line 671  sub is_leaf {
671    ! defined $self->[$SUBNODES][0]; # undefined subnode means leaf node.    ! defined $self->[$SUBNODES][0]; # undefined subnode means leaf node.
672  }  }
673    
674  =head3 size  =head2 size
675    
676  Return number of keys in the node  Return number of keys in the node
677    
# Line 573  sub size { Line 684  sub size {
684    return scalar(@{$self->[$KEYS]});    return scalar(@{$self->[$KEYS]});
685  }  }
686    
687  =head3 halves  =head2 halves
688    
689    Split node into two halves so that keys C<0 .. $n-1> are in one node
690    and keys C<$n+1 ... $size> are in the other.
691    
692   # Accept an index $n    my ($left_node, $right_node, $kdp) = $node->halves($n);
  # Divide into two nodes so that keys 0 .. $n-1 are in one node  
  # and keys $n+1 ... $size are in the other.  
693    
694  =cut  =cut
695    
# Line 601  sub halves { Line 713  sub halves {
713    ($self->new(@left), $self->new(@right), \@middle);    ($self->new(@left), $self->new(@right), \@middle);
714  }  }
715    
716  =head3 to_string  =head2 to_string
717    
718  Dumps tree as string  Dumps tree as string
719    
# Line 652  sub to_string { Line 764  sub to_string {
764    
765  =end comment  =end comment
766    
767  =head3 to_dot  =head2 to_dot
768    
769  Recursivly walk nodes of tree  Recursivly walk nodes of tree
770    
# Line 691  sub to_dot { Line 803  sub to_dot {
803          $dot;          $dot;
804  }  }
805    
806  =head3 to_jsfind  =head2 to_xml
807    
808    Escape <, >, & and ", and to produce valid XML
809    
810    =cut
811    
812    my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
813    my $escape_re  = join '|' => keys %escape;
814    
815    sub to_xml {
816            my $self = shift || confess "you should call to_xml as object!";
817    
818            my $d = shift || return;
819            $d = $self->SUPER::_recode($d);
820            confess "escape_re undefined!" unless ($escape_re);
821            $d =~ s/($escape_re)/$escape{$1}/g;
822            return $d;
823    }
824    
825    =head2 base62
826    
827    Convert number to base62 (used for jsFind index filenames).
828    
829     my $n = $tree->base62(50);
830    
831    =cut
832    
833    sub base62 {
834            my $self = shift;
835    
836            my $value = shift;
837    
838            confess("need non-negative number") if (! defined($value) || $value < 0);
839    
840            my @digits = qw(
841                    0 1 2 3 4 5 6 7 8 9
842                    a b c d e f g h i j k l m n o p q r s t u v w x y z
843                    A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
844            );
845    
846            my $base = scalar(@digits);
847            my $out = "";
848            my $pow = 1;
849            my $pos = 0;
850    
851    
852            if($value == 0) {
853                    return "0";
854            }
855    
856            while($value > 0) {
857                    $pos = $value % $base;
858                    $out = $digits[$pos] . $out;
859                    $value = floor($value/$base);
860                    $pow *= $base;
861            }
862    
863            return $out;
864    }
865    
866    =head2 to_jsfind
867    
868  Create jsFind xml files  Create jsFind xml files
869    
# Line 710  sub to_jsfind { Line 882  sub to_jsfind {
882          confess("path is undefined.") unless ($path);          confess("path is undefined.") unless ($path);
883          confess("file is undefined. Did you call \$t->root->to_jsfind(..) instead of \$t->to_jsfind(..) ?") unless (defined($file));          confess("file is undefined. Did you call \$t->root->to_jsfind(..) instead of \$t->to_jsfind(..) ?") unless (defined($file));
884    
885            $file = $self->base62($file);
886    
887          my $nr_keys = 0;          my $nr_keys = 0;
888    
889          my ($k, $d, $s) = @$self;          my ($k, $d, $s) = @$self;
# Line 721  sub to_jsfind { Line 895  sub to_jsfind {
895                  my $key = lc($k->[$i]);                  my $key = lc($k->[$i]);
896    
897                  if ($key) {                  if ($key) {
898                          $key_xml .= qq{<k>$key</k>};                          $key_xml .= '<k>'.$self->to_xml($key).'</k>';
899                          $data_xml .= qq{<e>};                          $data_xml .= '<e>';
900          #use Data::Dumper;          #use Data::Dumper;
901          #print Dumper($d->[$i]);          #print Dumper($d->[$i]);
902                          foreach my $path (keys %{$d->[$i]}) {                          foreach my $path (keys %{$d->[$i]}) {
903                                  $data_xml .= '<l f="'.($d->[$i]->{$path}->{'f'} || 1).'" t="'.($d->[$i]->{$path}->{'t'} || 'no title').'">'.$path.'</l>';                                  $data_xml .= '<l f="'.($d->[$i]->{$path}->{'f'} || 1).'" t="'.$self->to_xml($d->[$i]->{$path}->{'t'} || 'no title').'">'.$self->to_xml($path).'</l>';
904                                  $nr_keys++;                                  $nr_keys++;
905                          }                          }
906                          $data_xml .= qq{</e>};                          $data_xml .= '</e>';
907                  }                  }
908    
909                  $nr_keys += $s->[$i]->to_jsfind("$path/$file","$i") if ($s->[$i]);                  $nr_keys += $s->[$i]->to_jsfind("$path/$file","$i") if ($s->[$i]);
910          }          }
911    
912          $key_xml .= "</n>";          $key_xml .= '</n>';
913          $data_xml .= "</d>";          $data_xml .= '</d>';
914    
915          if (! -e $path) {          if (! -e $path) {
916                  mkpath($path) || croak "can't create dir '$path': $!";                  mkpath($path) || croak "can't create dir '$path': $!";
# Line 763  jsFind web site L<http://www.elucidsoft. Line 937  jsFind web site L<http://www.elucidsoft.
937    
938  B-Trees in perl web site L<http://perl.plover.com/BTree/>  B-Trees in perl web site L<http://perl.plover.com/BTree/>
939    
940    This module web site L<http://www.rot13.org/~dpavlin/jsFind.html>
941    
942  =head1 AUTHORS  =head1 AUTHORS
943    
944  Mark-Jonson Dominus E<lt>mjd@pobox.comE<gt> wrote C<BTree.pm> which was  Mark-Jonson Dominus E<lt>mjd@pobox.comE<gt> wrote C<BTree.pm> which was

Legend:
Removed from v.8  
changed lines
  Added in v.33

  ViewVC Help
Powered by ViewVC 1.1.26