/[jsFind]/trunk/jsFind.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/jsFind.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 8 by dpavlin, Wed Jul 21 15:44:15 2004 UTC revision 36 by dpavlin, Sat Oct 30 20:50:39 2004 UTC
# Line 1  Line 1 
1  package jsFind;  package jsFind;
2    
3  use 5.008004;  use 5.005;
4  use strict;  use strict;
5  use warnings;  use warnings;
6    use HTML::Entities;
7    
8  our $VERSION = '0.01';  our $VERSION = '0.06';
9    
10    use Exporter 'import';
11    use Carp;
12    
13    our @ISA = qw(Exporter);
14    
15    BEGIN {
16            import 'jsFind::Node';
17    }
18    
19  =head1 NAME  =head1 NAME
20    
21  jsFind - generate index for jsFind using B-Tree  jsFind - generate index for full text search engine in JavaScript
22    
23  =head1 SYNOPSIS  =head1 SYNOPSIS
24    
25    use jsFind;    use jsFind;
26      my $t = new jsFind(B => 4);
27      my $f = 1;
28      foreach my $k (qw{minima ut dolorem sapiente voluptatem}) {
29            $t->B_search(Key => $k,
30                    Data => {
31                            "path" => {
32                            t => "word $k",
33                            f => $f },
34                    },
35                    Insert => 1,
36                    Append => 1,
37            );
38      }
39    
40  =head1 DESCRIPTION  =head1 DESCRIPTION
41    
# Line 32  You don't need to use swish-e to create Line 53  You don't need to use swish-e to create
53    
54  =item *  =item *
55    
56  You can programatically (and incrementaly) create index for jsFind  you can programatically (and incrementaly) create index for jsFind
57    
58    =item *
59    
60    you can create more than one index and search them using same C<search.html>
61    page
62    
63  =back  =back
64    
65  =head1 METHODS  You can also examine examples which come as tests with this module,
66    for example C<t/04words.t> or C<t/10homer.t>.
67    
68  This module contains two packages C<jsFind> and C<jsFind::Node>.  =head2 jsFind
69    
70  =head2 jsFind methods  jsFind search engine was written by Shawn Garbett from eLucid Software.
71    The search engine itself is a small piece of JavaScript (1.2 with level 2
72    DOM). It is easily customizable to fit into a current set of HTML. This
73    JavaScript searches an XML index dataset for the appropriate links, and can
74    filter and sort the results.
75    
76    JavaScript code distributed with this module is based on version 0.0.3 which
77    was current when this module development started. Various changes where done
78    on JavaScript code to fix bugs, add features and remove warnings. For
79    complete list see C<Changes> file which comes with distribution.
80    
81  =cut  This module has been tested using C<html/test.html> with following browsers:
82    
83  use Exporter 'import';  =over 5
 use Carp;  
84    
85  our @ISA = qw(Exporter);  =item Mozilla FireFox 0.8 to 1.0
86    
87  BEGIN {  using DOM 2 C<document.implementation.createDocument>
88          import 'jsFind::Node';  
89  }  =item Internet Explorer 5.5 and 6.0
90    
91    using ActiveX C<Microsoft.XMLDOM> or C<MSXML2.DOMDocument>
92    
93    =item Konqueror 3.3
94    
95  =head3 new  using DOM 2 C<document.implementation.createDocument>
96    
97    =item Opera 7.54 (without Java)
98    
99    using experimental iframe implementation which is much slower than other methods.
100    
101    =back
102    
103    If searching doesn't work for your combination of operating system and
104    browser, please open C<html/test.html> file and wait a while. It will search sample
105    file included with distribution and report results. Reports with included
106    test debugging are welcomed.
107    
108    =head1 jsFind methods
109    
110    C<jsFind> is mode implementing methods which you, the user, are going to
111    use to create indexes.
112    
113    =head2 new
114    
115  Create new tree. Arguments are C<B> which is maximum numbers of keys in  Create new tree. Arguments are C<B> which is maximum numbers of keys in
116  each node and optional C<Root> node. Each root node may have child nodes.  each node and optional C<Root> node. Each root node may have child nodes.
# Line 82  sub new { Line 139  sub new {
139    bless { B => $B, Root => $Root } => $package;    bless { B => $B, Root => $Root } => $package;
140  }  }
141    
142  =head3 B_search  =head2 B_search
143    
144  Search, insert, append or replace data in B-Tree  Search, insert, append or replace data in B-Tree
145    
# Line 224  sub split_and_promote { Line 281  sub split_and_promote {
281    }    }
282  }  }
283    
284  =head3 B  =head2 B
285    
286  Return B (maximum number of keys)  Return B (maximum number of keys)
287    
# Line 236  sub B { Line 293  sub B {
293    $_[0]{B};    $_[0]{B};
294  }  }
295    
296  =head3 root  =head2 root
297    
298  Returns root node  Returns root node
299    
# Line 250  sub root { Line 307  sub root {
307    $self->{Root};    $self->{Root};
308  }  }
309    
310  =head3 node_overfull  =head2 node_overfull
311    
312  Returns if node is overfull  Returns if node is overfull
313    
# Line 264  sub node_overfull { Line 321  sub node_overfull {
321    $node->size > $self->B;    $node->size > $self->B;
322  }  }
323    
324  =head3 to_string  =head2 to_string
325    
326  Returns your tree as formatted string.  Returns your tree as formatted string.
327    
# Line 278  sub to_string { Line 335  sub to_string {
335    $_[0]->root->to_string;    $_[0]->root->to_string;
336  }  }
337    
338  =head3 to_dot  =head2 to_dot
339    
340  Create Graphviz graph of your tree  Create Graphviz graph of your tree
341    
# Line 296  sub to_dot { Line 353  sub to_dot {
353          return $dot;          return $dot;
354  }  }
355    
356  =head3 to_jsfind  =head2 to_jsfind
357    
358  Create xml index files for jsFind. This should be called after  Create xml index files for jsFind. This should be called after
359  your B-Tree has been filled with data.  your B-Tree has been filled with data.
360    
361   $root->to_jsfind('/full/path/to/index/dir/');   $root->to_jsfind(
362            dir => '/full/path/to/index/dir/',
363            data_codepage => 'ISO-8859-2',
364            index_codepage => 'UTF-8',
365            output_filter => sub {
366                    my $t = shift || return;
367                    $t =~ s/&egrave;/e/;
368            }
369     );
370    
371    All options except C<dir> are optional.
372    
373  Returns number of nodes in created tree.  Returns number of nodes in created tree.
374    
375    Options:
376    
377    =over 4
378    
379    =item dir
380    
381    Full path to directory for index (which will be created if needed).
382    
383    =item data_codepage
384    
385    If your imput data isn't in C<ISO-8859-1> encoding, you will have to specify
386    this option.
387    
388    =item index_codepage
389    
390    If your index encoding is not C<UTF-8> use this option.
391    
392    If you are not using supplied JavaScript search code, or your browser is
393    terribly broken and thinks that index shouldn't be in UTF-8 encoding, use
394    this option to specify encoding for created XML index.
395    
396    =item output_filter
397    
398    B<this is just draft of documentation for option which is not implemented!>
399    
400    Code ref to sub which can do modifications on resulting XML file for node.
401    Encoding of this data will be in L<index_codepage> and you have to take care
402    not to break XML structure. Calling L<xmllint> on your result index
403    (like C<t/90xmllint.t> does in this distribution) is a good idea after using
404    this option.
405    
406    This option is also right place to plug in unaccenting function using
407    L<Text::Unaccent>.
408    
409    =back
410    
411  =cut  =cut
412    
413    my $iconv;
414    my $iconv_l1;
415    
416  sub to_jsfind {  sub to_jsfind {
417          my $self = shift;          my $self = shift;
418    
419          my $path = shift || confess "to_jsfind need path to your index!";          my %arg = @_;
420    
421            confess "to_jsfind need path to your index directory !" unless ($arg{'dir'});
422    
423            my $data_codepage = $arg{'data_codepage'};
424            my $index_codepage = $arg{'index_codepage'} || 'UTF-8';
425    
426          $path .= "/" if ($path =~ /\/$/);          # create ISO-8859-1 iconv for HTML::Entities decode
427          carp "create directory for index '$path': $!" if (! -w $path);          $iconv_l1 = Text::Iconv->new('ISO-8859-1',$index_codepage);
428    
429          return $self->root->to_jsfind($path,"0");          # create another iconv for data
430            if ($data_codepage && $index_codepage) {
431                    $iconv = Text::Iconv->new($data_codepage,$index_codepage);
432            }
433    
434            return $self->root->to_jsfind($arg{'dir'},"0");
435  }  }
436    
437    
# Line 324  sub default_cmp { Line 440  sub default_cmp {
440    $_[0] cmp $_[1];    $_[0] cmp $_[1];
441  }  }
442    
443    =head2 _recode
444    
445    This is internal function to recode charset.
446    
447    It will also try to decode entities in data using L<HTML::Entities>.
448    
449    =cut
450    
451    sub _recode {
452            my $self = shift;
453            my $text = shift || return;
454    
455            sub _decode_html_entities {
456                    my $data = shift || return;
457                    $data = $iconv_l1->convert(decode_entities($data)) || croak "entity decode problem: $data";
458            }
459    
460            if ($iconv) {
461                    $text = $iconv->convert($text) || $text && carp "convert problem: $text";
462                    $text =~ s/(\&\w+;)/_decode_html_entities($1)/ges;
463            }
464    
465            return $text;
466    }
467    
468  #####################################################################  #####################################################################
469    
470  =head2 jsFind::Node methods  =head1 jsFind::Node methods
471    
472  Each node has C<k> key-data pairs, with C<B> <= C<k> <= C<2B>, and  Each node has C<k> key-data pairs, with C<B> <= C<k> <= C<2B>, and
473  each has C<k+1> subnodes, which might be null.  each has C<k+1> subnodes, which might be null.
# Line 348  use strict; Line 489  use strict;
489    
490  use Carp;  use Carp;
491  use File::Path;  use File::Path;
492    use Text::Iconv;
493    use POSIX;
494    
495    use base 'jsFind';
496    
497  my $KEYS = 0;  my $KEYS = 0;
498  my $DATA = 1;  my $DATA = 1;
499  my $SUBNODES = 2;  my $SUBNODES = 2;
500    
501  =head3 new  =head2 new
502    
503  Create New node  Create New node
504    
# Line 373  sub new { Line 518  sub new {
518    bless [@_] => $package;    bless [@_] => $package;
519  }  }
520    
521  =head3 locate_key  =head2 locate_key
522    
523  Locate key in node using linear search. This should probably be replaced  Locate key in node using linear search. This should probably be replaced
524  by binary search for better performance.  by binary search for better performance.
# Line 410  sub locate_key { Line 555  sub locate_key {
555  }  }
556    
557    
558  =head3 emptynode  =head2 emptynode
559    
560  Creates new empty node  Creates new empty node
561    
# Line 423  sub emptynode { Line 568  sub emptynode {
568    new($_[0]);                   # Pass package name, but not anything else.    new($_[0]);                   # Pass package name, but not anything else.
569  }  }
570    
571  =head3 is_empty  =head2 is_empty
572    
573  Test if node is empty  Test if node is empty
574    
# Line 437  sub is_empty { Line 582  sub is_empty {
582    !defined($self) || $#$self < 0;    !defined($self) || $#$self < 0;
583  }  }
584    
585  =head3 key  =head2 key
586    
587  Return C<$i>th key from node  Return C<$i>th key from node
588    
# Line 453  sub key { Line 598  sub key {
598     $_[0]->[$KEYS][$_[1]];     $_[0]->[$KEYS][$_[1]];
599  }  }
600    
601  =head3 data  =head2 data
602    
603  Return C<$i>th data from node  Return C<$i>th data from node
604    
# Line 466  sub data { Line 611  sub data {
611    $self->[$DATA][$n];    $self->[$DATA][$n];
612  }  }
613    
614  =head3 kdp_replace  =head2 kdp_replace
615    
616  Set key data pair for C<$i>th element in node  Set key data pair for C<$i>th element in node
617    
# Line 487  sub kdp_replace { Line 632  sub kdp_replace {
632     $self->[$DATA][$n]];     $self->[$DATA][$n]];
633  }  }
634    
635  =head3 kdp_insert  =head2 kdp_insert
636    
637   # No return value.  Insert key/data pair in tree
638    
639      $node->kdp_insert("key value" => "data value");
640    
641    No return value.
642    
643  =cut  =cut
644    
# Line 508  sub kdp_insert { Line 657  sub kdp_insert {
657    splice(@{$self->[$SUBNODES]}, $where, 0, undef);    splice(@{$self->[$SUBNODES]}, $where, 0, undef);
658  }  }
659    
660  =head3 kdp_append  =head2 kdp_append
661    
662  Adds new data keys and values to C<$i>th element in node  Adds new data keys and values to C<$i>th element in node
663    
# Line 529  sub kdp_append { Line 678  sub kdp_append {
678     $self->[$DATA][$n]];     $self->[$DATA][$n]];
679  }  }
680    
681  =head3 subnode  =head2 subnode
682    
683  Set new or return existing subnode  Set new or return existing subnode
684    
# Line 547  sub subnode { Line 696  sub subnode {
696    $self->[$SUBNODES][$n];    $self->[$SUBNODES][$n];
697  }  }
698    
699  =head3 is_leaf  =head2 is_leaf
700    
701  Test if node is leaf  Test if node is leaf
702    
# Line 560  sub is_leaf { Line 709  sub is_leaf {
709    ! defined $self->[$SUBNODES][0]; # undefined subnode means leaf node.    ! defined $self->[$SUBNODES][0]; # undefined subnode means leaf node.
710  }  }
711    
712  =head3 size  =head2 size
713    
714  Return number of keys in the node  Return number of keys in the node
715    
# Line 573  sub size { Line 722  sub size {
722    return scalar(@{$self->[$KEYS]});    return scalar(@{$self->[$KEYS]});
723  }  }
724    
725  =head3 halves  =head2 halves
726    
727    Split node into two halves so that keys C<0 .. $n-1> are in one node
728    and keys C<$n+1 ... $size> are in the other.
729    
730   # Accept an index $n    my ($left_node, $right_node, $kdp) = $node->halves($n);
  # Divide into two nodes so that keys 0 .. $n-1 are in one node  
  # and keys $n+1 ... $size are in the other.  
731    
732  =cut  =cut
733    
# Line 601  sub halves { Line 751  sub halves {
751    ($self->new(@left), $self->new(@right), \@middle);    ($self->new(@left), $self->new(@right), \@middle);
752  }  }
753    
754  =head3 to_string  =head2 to_string
755    
756  Dumps tree as string  Dumps tree as string
757    
# Line 652  sub to_string { Line 802  sub to_string {
802    
803  =end comment  =end comment
804    
805  =head3 to_dot  =head2 to_dot
806    
807  Recursivly walk nodes of tree  Recursivly walk nodes of tree
808    
# Line 691  sub to_dot { Line 841  sub to_dot {
841          $dot;          $dot;
842  }  }
843    
844  =head3 to_jsfind  =head2 to_xml
845    
846    Escape <, >, & and ", and to produce valid XML
847    
848    =cut
849    
850    my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
851    my $escape_re  = join '|' => keys %escape;
852    
853    sub to_xml {
854            my $self = shift || confess "you should call to_xml as object!";
855    
856            my $d = shift || return;
857            $d = $self->SUPER::_recode($d);
858            confess "escape_re undefined!" unless ($escape_re);
859            $d =~ s/($escape_re)/$escape{$1}/g;
860            return $d;
861    }
862    
863    =head2 base_x
864    
865    Convert number to base x (used for jsFind index filenames).
866    
867     my $n = $tree->base_x(50);
868    
869    =cut
870    
871    sub base_x {
872            my $self = shift;
873    
874            my $value = shift;
875    
876            confess("need non-negative number") if (! defined($value) || $value < 0);
877    
878            my @digits = qw(
879                    0 1 2 3 4 5 6 7 8 9
880                    a b c d e f g h i j k l m n o p q r s t u v w x y z
881            );
882    
883            my $base = scalar(@digits);
884            my $out = "";
885            my $pow = 1;
886            my $pos = 0;
887    
888    
889            if($value == 0) {
890                    return "0";
891            }
892    
893            while($value > 0) {
894                    $pos = $value % $base;
895                    $out = $digits[$pos] . $out;
896                    $value = floor($value/$base);
897                    $pow *= $base;
898            }
899    
900            return $out;
901    }
902    
903    =head2 to_jsfind
904    
905  Create jsFind xml files  Create jsFind xml files
906    
# Line 710  sub to_jsfind { Line 919  sub to_jsfind {
919          confess("path is undefined.") unless ($path);          confess("path is undefined.") unless ($path);
920          confess("file is undefined. Did you call \$t->root->to_jsfind(..) instead of \$t->to_jsfind(..) ?") unless (defined($file));          confess("file is undefined. Did you call \$t->root->to_jsfind(..) instead of \$t->to_jsfind(..) ?") unless (defined($file));
921    
922            $file = $self->base_x($file);
923    
924          my $nr_keys = 0;          my $nr_keys = 0;
925    
926          my ($k, $d, $s) = @$self;          my ($k, $d, $s) = @$self;
# Line 721  sub to_jsfind { Line 932  sub to_jsfind {
932                  my $key = lc($k->[$i]);                  my $key = lc($k->[$i]);
933    
934                  if ($key) {                  if ($key) {
935                          $key_xml .= qq{<k>$key</k>};                          $key_xml .= '<k>'.$self->to_xml($key).'</k>';
936                          $data_xml .= qq{<e>};                          $data_xml .= '<e>';
937          #use Data::Dumper;          #use Data::Dumper;
938          #print Dumper($d->[$i]);          #print Dumper($d->[$i]);
939                          foreach my $path (keys %{$d->[$i]}) {                          foreach my $path (keys %{$d->[$i]}) {
940                                  $data_xml .= '<l f="'.($d->[$i]->{$path}->{'f'} || 1).'" t="'.($d->[$i]->{$path}->{'t'} || 'no title').'">'.$path.'</l>';                                  $data_xml .= '<l f="'.($d->[$i]->{$path}->{'f'} || 1).'" t="'.$self->to_xml($d->[$i]->{$path}->{'t'} || 'no title').'">'.$self->to_xml($path).'</l>';
941                                  $nr_keys++;                                  $nr_keys++;
942                          }                          }
943                          $data_xml .= qq{</e>};                          $data_xml .= '</e>';
944                  }                  }
945    
946                  $nr_keys += $s->[$i]->to_jsfind("$path/$file","$i") if ($s->[$i]);                  $nr_keys += $s->[$i]->to_jsfind("$path/$file","$i") if ($s->[$i]);
947          }          }
948    
949          $key_xml .= "</n>";          $key_xml .= '</n>';
950          $data_xml .= "</d>";          $data_xml .= '</d>';
951    
952          if (! -e $path) {          if (! -e $path) {
953                  mkpath($path) || croak "can't create dir '$path': $!";                  mkpath($path) || croak "can't create dir '$path': $!";
# Line 763  jsFind web site L<http://www.elucidsoft. Line 974  jsFind web site L<http://www.elucidsoft.
974    
975  B-Trees in perl web site L<http://perl.plover.com/BTree/>  B-Trees in perl web site L<http://perl.plover.com/BTree/>
976    
977    This module web site L<http://www.rot13.org/~dpavlin/jsFind.html>
978    
979  =head1 AUTHORS  =head1 AUTHORS
980    
981  Mark-Jonson Dominus E<lt>mjd@pobox.comE<gt> wrote C<BTree.pm> which was  Mark-Jonson Dominus E<lt>mjd@pobox.comE<gt> wrote C<BTree.pm> which was

Legend:
Removed from v.8  
changed lines
  Added in v.36

  ViewVC Help
Powered by ViewVC 1.1.26