/[jsFind]/trunk/jsFind.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/jsFind.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1 by dpavlin, Sun Jul 11 20:18:25 2004 UTC revision 35 by dpavlin, Sun Oct 24 11:13:22 2004 UTC
# Line 1  Line 1 
1  package jsFind;  package jsFind;
2    
3  use 5.008004;  use 5.005;
4  use strict;  use strict;
5  use warnings;  use warnings;
6    use HTML::Entities;
7    
8  our $VERSION = '0.01';  our $VERSION = '0.06';
9    
10    use Exporter 'import';
11    use Carp;
12    
13    our @ISA = qw(Exporter);
14    
15    BEGIN {
16            import 'jsFind::Node';
17    }
18    
19  =head1 NAME  =head1 NAME
20    
21  jsFind - generate index for jsFind using B-Tree  jsFind - generate index for full text search engine in JavaScript
22    
23  =head1 SYNOPSIS  =head1 SYNOPSIS
24    
25    use jsFind;    use jsFind;
26      my $t = new jsFind(B => 4);
27      my $f = 1;
28      foreach my $k (qw{minima ut dolorem sapiente voluptatem}) {
29            $t->B_search(Key => $k,
30                    Data => {
31                            "path" => {
32                            t => "word $k",
33                            f => $f },
34                    },
35                    Insert => 1,
36                    Append => 1,
37            );
38      }
39    
40  =head1 DESCRIPTION  =head1 DESCRIPTION
41    
# Line 32  You don't need to use swish-e to create Line 53  You don't need to use swish-e to create
53    
54  =item *  =item *
55    
56  You can programatically (and incrementaly) create index for jsFind  you can programatically (and incrementaly) create index for jsFind
57    
58    =item *
59    
60    you can create more than one index and search them using same C<search.html>
61    page
62    
63  =back  =back
64    
65  =head1 METHODS  You can also examine examples which come as tests with this module,
66    for example C<t/04words.t> or C<t/10homer.t>.
67    
68  This module contains two packages C<jsFind> and C<jsFind::Node>.  =head2 jsFind
69    
70  =head2 jsFind methods  jsFind search engine was written by Shawn Garbett from eLucid Software.
71    The search engine itself is a small piece of JavaScript (1.2 with level 2
72    DOM). It is easily customizable to fit into a current set of HTML. This
73    JavaScript searches an XML index dataset for the appropriate links, and can
74    filter and sort the results.
75    
76    JavaScript code distributed with this module is based on version 0.0.3 which
77    was current when this module development started. Various changes where done
78    on JavaScript code to fix bugs, add features and remove warnings. For
79    complete list see C<Changes> file which comes with distribution.
80    
81  =cut  This module has been tested using C<html/test.html> with following browsers:
82    
83  use Exporter 'import';  =over 5
 use Carp;  
84    
85  our @ISA = qw(Exporter);  =item Mozilla FireFox 0.8 to 1.0
86    
87  BEGIN {  using DOM 2 C<document.implementation.createDocument>
88          import 'jsFind::Node';  
89  }  =item Internet Explorer 5.5 and 6.0
90    
91    using ActiveX C<Microsoft.XMLDOM> or C<MSXML2.DOMDocument>
92    
93    =item Konqueror 3.3
94    
95  =head3 new  using DOM 2 C<document.implementation.createDocument>
96    
97    =item Opera 7.54 (without Java)
98    
99    using experimental iframe implementation which is much slower than other methods.
100    
101    =back
102    
103    If searching doesn't work for your combination of operating system and
104    browser, please open C<html/test.html> file and wait a while. It will search sample
105    file included with distribution and report results. Reports with included
106    test debugging are welcomed.
107    
108    =head1 jsFind methods
109    
110    C<jsFind> is mode implementing methods which you, the user, are going to
111    use to create indexes.
112    
113    =head2 new
114    
115  Create new tree. Arguments are C<B> which is maximum numbers of keys in  Create new tree. Arguments are C<B> which is maximum numbers of keys in
116  each node and optional C<Root> node. Each root node may have child nodes.  each node and optional C<Root> node. Each root node may have child nodes.
# Line 82  sub new { Line 139  sub new {
139    bless { B => $B, Root => $Root } => $package;    bless { B => $B, Root => $Root } => $package;
140  }  }
141    
142  =head3 B_search  =head2 B_search
143    
144  Search, insert, append or replace data in B-Tree  Search, insert, append or replace data in B-Tree
145    
146     $t->B_search(
147            Key => 'key value',
148            Data => { "path" => {
149                            "t" => "title of document",
150                            "f" => 99,
151                            },
152                    },
153            Insert => 1,
154            Append => 1,
155     );
156    
157  Semantics:  Semantics:
158    
# Line 215  sub split_and_promote { Line 281  sub split_and_promote {
281    }    }
282  }  }
283    
284  =head3 B  =head2 B
285    
286  Return B (maximum number of keys)  Return B (maximum number of keys)
287    
# Line 227  sub B { Line 293  sub B {
293    $_[0]{B};    $_[0]{B};
294  }  }
295    
296  =head3 root  =head2 root
297    
298  Returns root node  Returns root node
299    
# Line 241  sub root { Line 307  sub root {
307    $self->{Root};    $self->{Root};
308  }  }
309    
310  =head3 node_overfull  =head2 node_overfull
311    
312  Returns if node is overfull  Returns if node is overfull
313    
# Line 255  sub node_overfull { Line 321  sub node_overfull {
321    $node->size > $self->B;    $node->size > $self->B;
322  }  }
323    
324  =head3 to_string  =head2 to_string
325    
326  Returns your tree as formatted string.  Returns your tree as formatted string.
327    
# Line 269  sub to_string { Line 335  sub to_string {
335    $_[0]->root->to_string;    $_[0]->root->to_string;
336  }  }
337    
338  =head3 to_dot  =head2 to_dot
339    
340  Create Graphviz graph of your tree  Create Graphviz graph of your tree
341    
# Line 287  sub to_dot { Line 353  sub to_dot {
353          return $dot;          return $dot;
354  }  }
355    
356  =head3 to_jsfind  =head2 to_jsfind
357    
358  Create xml index files for jsFind. This should be called after  Create xml index files for jsFind. This should be called after
359  your B-Tree has been filled with data.  your B-Tree has been filled with data.
360    
361   $root->to_jsfind('/full/path/to/index/dir/');   $root->to_jsfind(
362            dir => '/full/path/to/index/dir/',
363            data_codepage => 'ISO-8859-2',
364            index_codepage => 'UTF-8',
365            output_filter => sub {
366                    my $t = shift || return;
367                    $t =~ s/&egrave;/e/;
368            }
369     );
370    
371    All options except C<dir> are optional.
372    
373  Returns number of nodes in created tree.  Returns number of nodes in created tree.
374    
375    Options:
376    
377    =over 4
378    
379    =item dir
380    
381    Full path to directory for index (which will be created if needed).
382    
383    =item data_codepage
384    
385    If your imput data isn't in C<ISO-8859-1> encoding, you will have to specify
386    this option.
387    
388    =item index_codepage
389    
390    If your index encoding is not C<UTF-8> use this option.
391    
392    If you are not using supplied JavaScript search code, or your browser is
393    terribly broken and thinks that index shouldn't be in UTF-8 encoding, use
394    this option to specify encoding for created XML index.
395    
396    =item output_filter
397    
398    B<this is just draft of documentation for option which is not implemented!>
399    
400    Code ref to sub which can do modifications on resulting XML file for node.
401    Encoding of this data will be in L<index_codepage> and you have to take care
402    not to break XML structure. Calling L<xmllint> on your result index
403    (like C<t/90xmllint.t> does in this distribution) is a good idea after using
404    this option.
405    
406    This option is also right place to plug in unaccenting function using
407    L<Text::Unaccent>.
408    
409    =back
410    
411  =cut  =cut
412    
413    my $iconv;
414    my $iconv_l1;
415    
416  sub to_jsfind {  sub to_jsfind {
417          my $self = shift;          my $self = shift;
418    
419          my $path = shift || confess "to_jsfind need path to your index!";          my %arg = @_;
420    
421            confess "to_jsfind need path to your index directory !" unless ($arg{'dir'});
422    
423          $path .= "/" if ($path =~ /\/$/);          my $data_codepage = $arg{'data_codepage'};
424          carp "can't create index in '$path': $!" if (! -w $path);          my $index_codepage = $arg{'index_codepage'} || 'UTF-8';
425    
426          return $self->root->to_jsfind($path,"0");          # create ISO-8859-1 iconv for HTML::Entities decode
427            $iconv_l1 = Text::Iconv->new('ISO-8859-1',$index_codepage);
428    
429            # create another iconv for data
430            if ($data_codepage && $index_codepage) {
431                    $iconv = Text::Iconv->new($data_codepage,$index_codepage);
432            }
433    
434            return $self->root->to_jsfind($arg{'dir'},"0");
435  }  }
436    
437    
# Line 315  sub default_cmp { Line 440  sub default_cmp {
440    $_[0] cmp $_[1];    $_[0] cmp $_[1];
441  }  }
442    
443    =head2 _recode
444    
445    This is internal function to recode charset.
446    
447    It will also try to decode entities in data using L<HTML::Entities>.
448    
449    =cut
450    
451    sub _recode {
452            my $self = shift;
453            my $text = shift || return;
454    
455            sub _decode_html_entities {
456                    my $data = shift || return;
457                    $data = $iconv_l1->convert(decode_entities($data)) || croak "entity decode problem: $data";
458            }
459    
460            if ($iconv) {
461                    $text = $iconv->convert($text) || $text && carp "convert problem: $text";
462                    $text =~ s/(\&\w+;)/_decode_html_entities($1)/ges;
463            }
464    
465            return $text;
466    }
467    
468  #####################################################################  #####################################################################
469    
470  =head2 jsFind::Node methods  =head1 jsFind::Node methods
471    
472  Each node has C<k> key-data pairs, with C<B> <= C<k> <= C<2B>, and  Each node has C<k> key-data pairs, with C<B> <= C<k> <= C<2B>, and
473  each has C<k+1> subnodes, which might be null.  each has C<k+1> subnodes, which might be null.
# Line 339  use strict; Line 489  use strict;
489    
490  use Carp;  use Carp;
491  use File::Path;  use File::Path;
492    use Text::Iconv;
493    use POSIX;
494    
495    use base 'jsFind';
496    
497  my $KEYS = 0;  my $KEYS = 0;
498  my $DATA = 1;  my $DATA = 1;
499  my $SUBNODES = 2;  my $SUBNODES = 2;
500    
501  =head3 new  =head2 new
502    
503  Create New node  Create New node
504    
# Line 364  sub new { Line 518  sub new {
518    bless [@_] => $package;    bless [@_] => $package;
519  }  }
520    
521  =head3 locate_key  =head2 locate_key
522    
523  Locate key in node using linear search. This should probably be replaced  Locate key in node using linear search. This should probably be replaced
524  by binary search for better performance.  by binary search for better performance.
# Line 401  sub locate_key { Line 555  sub locate_key {
555  }  }
556    
557    
558  =head3 emptynode  =head2 emptynode
559    
560  Creates new empty node  Creates new empty node
561    
# Line 414  sub emptynode { Line 568  sub emptynode {
568    new($_[0]);                   # Pass package name, but not anything else.    new($_[0]);                   # Pass package name, but not anything else.
569  }  }
570    
571  =head3 is_empty  =head2 is_empty
572    
573  Test if node is empty  Test if node is empty
574    
# Line 428  sub is_empty { Line 582  sub is_empty {
582    !defined($self) || $#$self < 0;    !defined($self) || $#$self < 0;
583  }  }
584    
585  =head3 key  =head2 key
586    
587  Return C<$i>th key from node  Return C<$i>th key from node
588    
# Line 444  sub key { Line 598  sub key {
598     $_[0]->[$KEYS][$_[1]];     $_[0]->[$KEYS][$_[1]];
599  }  }
600    
601  =head3 data  =head2 data
602    
603  Return C<$i>th data from node  Return C<$i>th data from node
604    
# Line 457  sub data { Line 611  sub data {
611    $self->[$DATA][$n];    $self->[$DATA][$n];
612  }  }
613    
614  =head3 kdp_replace  =head2 kdp_replace
615    
616  Set key data pair for C<$i>th element in node  Set key data pair for C<$i>th element in node
617    
# Line 478  sub kdp_replace { Line 632  sub kdp_replace {
632     $self->[$DATA][$n]];     $self->[$DATA][$n]];
633  }  }
634    
635  =head3 kdp_insert  =head2 kdp_insert
636    
637    Insert key/data pair in tree
638    
639   # No return value.    $node->kdp_insert("key value" => "data value");
640    
641    No return value.
642    
643  =cut  =cut
644    
# Line 499  sub kdp_insert { Line 657  sub kdp_insert {
657    splice(@{$self->[$SUBNODES]}, $where, 0, undef);    splice(@{$self->[$SUBNODES]}, $where, 0, undef);
658  }  }
659    
660  =head3 kdp_append  =head2 kdp_append
661    
662  Adds new data keys and values to C<$i>th element in node  Adds new data keys and values to C<$i>th element in node
663    
# Line 520  sub kdp_append { Line 678  sub kdp_append {
678     $self->[$DATA][$n]];     $self->[$DATA][$n]];
679  }  }
680    
681  =head3 subnode  =head2 subnode
682    
683  Set new or return existing subnode  Set new or return existing subnode
684    
# Line 538  sub subnode { Line 696  sub subnode {
696    $self->[$SUBNODES][$n];    $self->[$SUBNODES][$n];
697  }  }
698    
699  =head3 is_leaf  =head2 is_leaf
700    
701  Test if node is leaf  Test if node is leaf
702    
# Line 551  sub is_leaf { Line 709  sub is_leaf {
709    ! defined $self->[$SUBNODES][0]; # undefined subnode means leaf node.    ! defined $self->[$SUBNODES][0]; # undefined subnode means leaf node.
710  }  }
711    
712  =head3 size  =head2 size
713    
714  Return number of keys in the node  Return number of keys in the node
715    
# Line 564  sub size { Line 722  sub size {
722    return scalar(@{$self->[$KEYS]});    return scalar(@{$self->[$KEYS]});
723  }  }
724    
725  =head3 halves  =head2 halves
726    
727    Split node into two halves so that keys C<0 .. $n-1> are in one node
728    and keys C<$n+1 ... $size> are in the other.
729    
730   # Accept an index $n    my ($left_node, $right_node, $kdp) = $node->halves($n);
  # Divide into two nodes so that keys 0 .. $n-1 are in one node  
  # and keys $n+1 ... $size are in the other.  
731    
732  =cut  =cut
733    
# Line 592  sub halves { Line 751  sub halves {
751    ($self->new(@left), $self->new(@right), \@middle);    ($self->new(@left), $self->new(@right), \@middle);
752  }  }
753    
754  =head3 to_string  =head2 to_string
755    
756  Dumps tree as string  Dumps tree as string
757    
# Line 643  sub to_string { Line 802  sub to_string {
802    
803  =end comment  =end comment
804    
805  =head3 to_dot  =head2 to_dot
806    
807  Recursivly walk nodes of tree  Recursivly walk nodes of tree
808    
# Line 682  sub to_dot { Line 841  sub to_dot {
841          $dot;          $dot;
842  }  }
843    
844  =head3 to_jsfind  =head2 to_xml
845    
846    Escape <, >, & and ", and to produce valid XML
847    
848    =cut
849    
850    my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
851    my $escape_re  = join '|' => keys %escape;
852    
853    sub to_xml {
854            my $self = shift || confess "you should call to_xml as object!";
855    
856            my $d = shift || return;
857            $d = $self->SUPER::_recode($d);
858            confess "escape_re undefined!" unless ($escape_re);
859            $d =~ s/($escape_re)/$escape{$1}/g;
860            return $d;
861    }
862    
863    =head2 base62
864    
865    Convert number to base62 (used for jsFind index filenames).
866    
867     my $n = $tree->base62(50);
868    
869    =cut
870    
871    sub base62 {
872            my $self = shift;
873    
874            my $value = shift;
875    
876            confess("need non-negative number") if (! defined($value) || $value < 0);
877    
878            my @digits = qw(
879                    0 1 2 3 4 5 6 7 8 9
880                    a b c d e f g h i j k l m n o p q r s t u v w x y z
881                    A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
882            );
883    
884            my $base = scalar(@digits);
885            my $out = "";
886            my $pow = 1;
887            my $pos = 0;
888    
889    
890            if($value == 0) {
891                    return "0";
892            }
893    
894            while($value > 0) {
895                    $pos = $value % $base;
896                    $out = $digits[$pos] . $out;
897                    $value = floor($value/$base);
898                    $pow *= $base;
899            }
900    
901            return $out;
902    }
903    
904    =head2 to_jsfind
905    
906  Create jsFind xml files  Create jsFind xml files
907    
908   my $nr=$tree->to_dot('/path/to/index','0');   my $nr=$tree->to_jsfind('/path/to/index','0');
909    
910  Returns number of elements created  Returns number of elements created
911    
# Line 698  sub to_jsfind { Line 917  sub to_jsfind {
917    
918          return 0 if $self->is_empty;          return 0 if $self->is_empty;
919    
920            confess("path is undefined.") unless ($path);
921            confess("file is undefined. Did you call \$t->root->to_jsfind(..) instead of \$t->to_jsfind(..) ?") unless (defined($file));
922    
923            $file = $self->base62($file);
924    
925          my $nr_keys = 0;          my $nr_keys = 0;
926    
927          my ($k, $d, $s) = @$self;          my ($k, $d, $s) = @$self;
# Line 709  sub to_jsfind { Line 933  sub to_jsfind {
933                  my $key = lc($k->[$i]);                  my $key = lc($k->[$i]);
934    
935                  if ($key) {                  if ($key) {
936                          $key_xml .= qq{<k>$key</k>};                          $key_xml .= '<k>'.$self->to_xml($key).'</k>';
937                          $data_xml .= qq{<e>};                          $data_xml .= '<e>';
938          #use Data::Dumper;          #use Data::Dumper;
939          #print Dumper($d->[$i]);          #print Dumper($d->[$i]);
940                          foreach my $path (keys %{$d->[$i]}) {                          foreach my $path (keys %{$d->[$i]}) {
941                                  $data_xml .= '<l f="'.($d->[$i]->{$path}->{'f'} || 1).'" t="'.($d->[$i]->{$path}->{'t'} || 'no title').'">'.$path.'</l>';                                  $data_xml .= '<l f="'.($d->[$i]->{$path}->{'f'} || 1).'" t="'.$self->to_xml($d->[$i]->{$path}->{'t'} || 'no title').'">'.$self->to_xml($path).'</l>';
942                                  $nr_keys++;                                  $nr_keys++;
943                          }                          }
944                          $data_xml .= qq{</e>};                          $data_xml .= '</e>';
945                  }                  }
946    
947                  $nr_keys += $s->[$i]->to_jsfind("$path/$file","$i") if ($s->[$i]);                  $nr_keys += $s->[$i]->to_jsfind("$path/$file","$i") if ($s->[$i]);
948          }          }
949    
950          $key_xml .= "</n>";          $key_xml .= '</n>';
951          $data_xml .= "</d>";          $data_xml .= '</d>';
952    
953          if (! -e $path) {          if (! -e $path) {
954                  mkpath($path) || croak "can't create dir '$path': $!";                  mkpath($path) || croak "can't create dir '$path': $!";
# Line 751  jsFind web site L<http://www.elucidsoft. Line 975  jsFind web site L<http://www.elucidsoft.
975    
976  B-Trees in perl web site L<http://perl.plover.com/BTree/>  B-Trees in perl web site L<http://perl.plover.com/BTree/>
977    
978    This module web site L<http://www.rot13.org/~dpavlin/jsFind.html>
979    
980  =head1 AUTHORS  =head1 AUTHORS
981    
982  Mark-Jonson Dominus E<lt>mjd@pobox.comE<gt> wrote C<BTree.pm> which was  Mark-Jonson Dominus E<lt>mjd@pobox.comE<gt> wrote C<BTree.pm> which was

Legend:
Removed from v.1  
changed lines
  Added in v.35

  ViewVC Help
Powered by ViewVC 1.1.26