/[jsFind]/trunk/jsFind.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/jsFind.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 5 - (hide annotations)
Tue Jul 20 17:47:30 2004 UTC (19 years, 9 months ago) by dpavlin
File size: 15614 byte(s)
B_search documentation, updated MANIFEST

1 dpavlin 1 package jsFind;
2    
3     use 5.008004;
4     use strict;
5     use warnings;
6    
7     our $VERSION = '0.01';
8    
9     =head1 NAME
10    
11     jsFind - generate index for jsFind using B-Tree
12    
13     =head1 SYNOPSIS
14    
15     use jsFind;
16    
17    
18    
19     =head1 DESCRIPTION
20    
21     This module can be used to create index files for jsFind, powerful tool for
22     adding a search engine to a CDROM archive or catalog without requiring the
23     user to install anything.
24    
25     Main difference between this module and scripts delivered with jsFind are:
26    
27     =over 5
28    
29     =item *
30    
31     You don't need to use swish-e to create index
32    
33     =item *
34    
35     You can programatically (and incrementaly) create index for jsFind
36    
37     =back
38    
39     =head1 METHODS
40    
41     This module contains two packages C<jsFind> and C<jsFind::Node>.
42    
43     =head2 jsFind methods
44    
45     =cut
46    
47     use Exporter 'import';
48     use Carp;
49    
50     our @ISA = qw(Exporter);
51    
52     BEGIN {
53     import 'jsFind::Node';
54     }
55    
56     =head3 new
57    
58     Create new tree. Arguments are C<B> which is maximum numbers of keys in
59     each node and optional C<Root> node. Each root node may have child nodes.
60    
61     All nodes are objects from C<jsFind::Node>.
62    
63     my $t = new jsFind(B => 4);
64    
65     =cut
66    
67     my $DEBUG = 1;
68    
69     sub new {
70     my $package = shift;
71     my %ARGV = @_;
72     croak "Usage: {$package}::new(B => number [, Root => root node ])"
73     unless exists $ARGV{B};
74     if ($ARGV{B} % 2) {
75     my $B = $ARGV{B} + 1;
76     carp "B must be an even number. Using $B instead.";
77     $ARGV{B} = $B;
78     }
79    
80     my $B = $ARGV{B};
81     my $Root = exists($ARGV{Root}) ? $ARGV{Root} : jsFind::Node->emptynode;
82     bless { B => $B, Root => $Root } => $package;
83     }
84    
85     =head3 B_search
86    
87     Search, insert, append or replace data in B-Tree
88    
89 dpavlin 5 $t->B_search(
90     Key => 'key value',
91     Data => { "path" => {
92     "t" => "title of document",
93     "f" => 99,
94     },
95     Insert => 1,
96     Append => 1,
97     );
98 dpavlin 1
99     Semantics:
100    
101     If key not found, insert it iff C<Insert> argument is present.
102    
103     If key B<is> found, replace existing data iff C<Replace> argument
104     is present or add new datum to existing iff C<Append> argument is present.
105    
106     =cut
107    
108     sub B_search {
109     my $self = shift;
110     my %args = @_;
111     my $cur_node = $self->root;
112     my $k = $args{Key};
113     my $d = $args{Data};
114     my @path;
115    
116     if ($cur_node->is_empty) { # Special case for empty root
117     if ($args{Insert}) {
118     $cur_node->kdp_insert($k => $d);
119     return $d;
120     } else {
121     return undef;
122     }
123     }
124    
125     # Descend tree to leaf
126     for (;;) {
127    
128     # Didn't hit bottom yet.
129    
130     my($there, $where) = $cur_node->locate_key($k);
131     if ($there) { # Found it!
132     if ($args{Replace}) {
133     $cur_node->kdp_replace($where, $k => $d);
134     } elsif ($args{Append}) {
135     $cur_node->kdp_append($where, $k => $d);
136     }
137     return $cur_node->data($where);
138     }
139    
140     # Not here---must be in a subtree.
141    
142     if ($cur_node->is_leaf) { # But there are no subtrees
143     return undef unless $args{Insert}; # Search failed
144     # Stuff it in
145     $cur_node->kdp_insert($k => $d);
146     if ($self->node_overfull($cur_node)) { # Oops--there was no room.
147     $self->split_and_promote($cur_node, @path);
148     }
149     return $d;
150     }
151    
152     # There are subtrees, and the key is in one of them.
153    
154     push @path, [$cur_node, $where]; # Record path from root.
155    
156     # Move down to search the subtree
157     $cur_node = $cur_node->subnode($where);
158    
159     # and start over.
160     } # for (;;) ...
161    
162     croak ("How did I get here?");
163     }
164    
165    
166    
167     sub split_and_promote_old {
168     my $self = shift;
169     my ($cur_node, @path) = @_;
170    
171     for (;;) {
172     my ($newleft, $newright, $kdp) = $cur_node->halves($self->B / 2);
173     my ($up, $where) = @{pop @path};
174     if ($up) {
175     $up->kdp_insert(@$kdp);
176     my ($tthere, $twhere) = $up->locate_key($kdp->[0]);
177     croak "Couldn't find key `$kdp->[0]' in node after just inserting it!"
178     unless $tthere;
179     croak "`$kdp->[0]' went into node at `$twhere' instead of expected `$where'!"
180     unless $twhere == $where;
181     $up->subnode($where, $newleft);
182     $up->subnode($where+1, $newright);
183     return unless $self->node_overfull($up);
184     $cur_node = $up;
185     } else { # We're at the top; make a new root.
186     my $newroot = new jsFind::Node ([$kdp->[0]],
187     [$kdp->[1]],
188     [$newleft, $newright]);
189     $self->root($newroot);
190     return;
191     }
192     }
193    
194     }
195    
196     sub split_and_promote {
197     my $self = shift;
198     my ($cur_node, @path) = @_;
199    
200     for (;;) {
201     my ($newleft, $newright, $kdp) = $cur_node->halves($self->B / 2);
202     my ($up, $where) = @{pop @path} if (@path);
203     if ($up) {
204     $up->kdp_insert(@$kdp);
205     if ($DEBUG) {
206     my ($tthere, $twhere) = $up->locate_key($kdp->[0]);
207     croak "Couldn't find key `$kdp->[0]' in node after just inserting it!"
208     unless $tthere;
209     croak "`$kdp->[0]' went into node at `$twhere' instead of expected `$where'!"
210     unless $twhere == $where;
211     }
212     $up->subnode($where, $newleft);
213     $up->subnode($where+1, $newright);
214     return unless $self->node_overfull($up);
215     $cur_node = $up;
216     } else { # We're at the top; make a new root.
217     my $newroot = new jsFind::Node([$kdp->[0]],
218     [$kdp->[1]],
219     [$newleft, $newright]);
220     $self->root($newroot);
221     return;
222     }
223     }
224     }
225    
226     =head3 B
227    
228     Return B (maximum number of keys)
229    
230     my $max_size = $t->B;
231    
232     =cut
233    
234     sub B {
235     $_[0]{B};
236     }
237    
238     =head3 root
239    
240     Returns root node
241    
242     my $root = $t->root;
243    
244     =cut
245    
246     sub root {
247     my ($self, $newroot) = @_;
248     $self->{Root} = $newroot if defined $newroot;
249     $self->{Root};
250     }
251    
252     =head3 node_overfull
253    
254     Returns if node is overfull
255    
256     if ($node->node_overfull) { something }
257    
258     =cut
259    
260     sub node_overfull {
261     my $self = shift;
262     my $node = shift;
263     $node->size > $self->B;
264     }
265    
266     =head3 to_string
267    
268     Returns your tree as formatted string.
269    
270     my $text = $root->to_string;
271    
272     Mostly usefull for debugging as output leaves much to be desired.
273    
274     =cut
275    
276     sub to_string {
277     $_[0]->root->to_string;
278     }
279    
280     =head3 to_dot
281    
282     Create Graphviz graph of your tree
283    
284     my $dot_graph = $root->to_dot;
285    
286     =cut
287    
288     sub to_dot {
289     my $self = shift;
290    
291     my $dot = qq/digraph dns {\nrankdir=LR;\n/;
292     $dot .= $self->root->to_dot;
293     $dot .= qq/\n}\n/;
294    
295     return $dot;
296     }
297    
298     =head3 to_jsfind
299    
300     Create xml index files for jsFind. This should be called after
301     your B-Tree has been filled with data.
302    
303     $root->to_jsfind('/full/path/to/index/dir/');
304    
305     Returns number of nodes in created tree.
306    
307     =cut
308    
309     sub to_jsfind {
310     my $self = shift;
311    
312     my $path = shift || confess "to_jsfind need path to your index!";
313    
314     $path .= "/" if ($path =~ /\/$/);
315     carp "can't create index in '$path': $!" if (! -w $path);
316    
317     return $self->root->to_jsfind($path,"0");
318     }
319    
320    
321     # private, default cmd function
322     sub default_cmp {
323     $_[0] cmp $_[1];
324     }
325    
326     #####################################################################
327    
328     =head2 jsFind::Node methods
329    
330     Each node has C<k> key-data pairs, with C<B> <= C<k> <= C<2B>, and
331     each has C<k+1> subnodes, which might be null.
332    
333     The node is a blessed reference to a list with three elements:
334    
335     ($keylist, $datalist, $subnodelist)
336    
337     each is a reference to a list list.
338    
339     The null node is represented by a blessed reference to an empty list.
340    
341     =cut
342    
343     package jsFind::Node;
344    
345     use warnings;
346     use strict;
347    
348     use Carp;
349     use File::Path;
350    
351     my $KEYS = 0;
352     my $DATA = 1;
353     my $SUBNODES = 2;
354    
355     =head3 new
356    
357     Create New node
358    
359     my $node = new jsFind::Node ($keylist, $datalist, $subnodelist);
360    
361     You can also mit argument list to create empty node.
362    
363     my $empty_node = new jsFind::Node;
364    
365     =cut
366    
367     sub new {
368     my $self = shift;
369     my $package = ref $self || $self;
370     croak "Internal error: jsFind::Node::new called with wrong number of arguments."
371     unless @_ == 3 || @_ == 0;
372     bless [@_] => $package;
373     }
374    
375     =head3 locate_key
376    
377     Locate key in node using linear search. This should probably be replaced
378     by binary search for better performance.
379    
380     my ($found, $index) = $node->locate_key($key, $cmp_coderef);
381    
382     Argument C<$cmp_coderef> is optional reference to custom comparison
383     operator.
384    
385     Returns (1, $index) if $key[$index] eq $key.
386    
387     Returns (0, $index) if key could be found in $subnode[$index].
388    
389     In scalar context, just returns 1 or 0.
390    
391     =cut
392    
393     sub locate_key {
394     # Use linear search for testing, replace with binary search.
395     my $self = shift;
396     my $key = shift;
397     my $cmp = shift || \&jsFind::default_cmp;
398     my $i;
399     my $cmp_result;
400     my $N = $self->size;
401     for ($i = 0; $i < $N; $i++) {
402     $cmp_result = &$cmp($key, $self->key($i));
403     last if $cmp_result <= 0;
404     }
405    
406     # $i is now the index of the first node-key greater than $key
407     # or $N if there is no such. $cmp_result is 0 iff the key was found.
408     (!$cmp_result, $i);
409     }
410    
411    
412     =head3 emptynode
413    
414     Creates new empty node
415    
416     $node = $root->emptynode;
417     $new_node = $node->emptynode;
418    
419     =cut
420    
421     sub emptynode {
422     new($_[0]); # Pass package name, but not anything else.
423     }
424    
425     =head3 is_empty
426    
427     Test if node is empty
428    
429     if ($node->is_empty) { something }
430    
431     =cut
432    
433     # undef is empty; so is a blessed empty list.
434     sub is_empty {
435     my $self = shift;
436     !defined($self) || $#$self < 0;
437     }
438    
439     =head3 key
440    
441     Return C<$i>th key from node
442    
443     my $key = $node->key($i);
444    
445     =cut
446    
447     sub key {
448     # my ($self, $n) = @_;
449     # $self->[$KEYS][$n];
450    
451     # speedup
452     $_[0]->[$KEYS][$_[1]];
453     }
454    
455     =head3 data
456    
457     Return C<$i>th data from node
458    
459     my $data = $node->data($i);
460    
461     =cut
462    
463     sub data {
464     my ($self, $n) = @_;
465     $self->[$DATA][$n];
466     }
467    
468     =head3 kdp_replace
469    
470     Set key data pair for C<$i>th element in node
471    
472     $node->kdp_replace($i, "key value" => {
473     "data key 1" => "data value 1",
474     "data key 2" => "data value 2",
475     };
476    
477     =cut
478    
479     sub kdp_replace {
480     my ($self, $n, $k => $d) = @_;
481     if (defined $k) {
482     $self->[$KEYS][$n] = $k;
483     $self->[$DATA][$n] = $d;
484     }
485     [$self->[$KEYS][$n],
486     $self->[$DATA][$n]];
487     }
488    
489     =head3 kdp_insert
490    
491     # No return value.
492    
493     =cut
494    
495     sub kdp_insert {
496     my $self = shift;
497     my ($k => $d) = @_;
498     my ($there, $where) = $self->locate_key($k) unless $self->is_empty;
499    
500     if ($there) { croak("Tried to insert `$k => $d' into node where `$k' was already present."); }
501    
502     # undef fix
503     $where ||= 0;
504    
505     splice(@{$self->[$KEYS]}, $where, 0, $k);
506     splice(@{$self->[$DATA]}, $where, 0, $d);
507     splice(@{$self->[$SUBNODES]}, $where, 0, undef);
508     }
509    
510     =head3 kdp_append
511    
512     Adds new data keys and values to C<$i>th element in node
513    
514     $node->kdp_append($i, "key value" => {
515     "added data key" => "added data value",
516     };
517    
518     =cut
519    
520     sub kdp_append {
521     my ($self, $n, $k => $d) = @_;
522     if (defined $k) {
523     $self->[$KEYS][$n] = $k;
524     my ($kv,$dv) = %{$d};
525     $self->[$DATA][$n]->{$kv} = $dv;
526     }
527     [$self->[$KEYS][$n],
528     $self->[$DATA][$n]];
529     }
530    
531     =head3 subnode
532    
533     Set new or return existing subnode
534    
535     # return 4th subnode
536     my $my_node = $node->subnode(4);
537    
538     # create new subnode 5 from $my_node
539     $node->subnode(5, $my_node);
540    
541     =cut
542    
543     sub subnode {
544     my ($self, $n, $newnode) = @_;
545     $self->[$SUBNODES][$n] = $newnode if defined $newnode;
546     $self->[$SUBNODES][$n];
547     }
548    
549     =head3 is_leaf
550    
551     Test if node is leaf
552    
553     if ($node->is_leaf) { something }
554    
555     =cut
556    
557     sub is_leaf {
558     my $self = shift;
559     ! defined $self->[$SUBNODES][0]; # undefined subnode means leaf node.
560     }
561    
562     =head3 size
563    
564     Return number of keys in the node
565    
566     my $nr = $node->size;
567    
568     =cut
569    
570     sub size {
571     my $self = shift;
572     return scalar(@{$self->[$KEYS]});
573     }
574    
575     =head3 halves
576    
577     # Accept an index $n
578     # Divide into two nodes so that keys 0 .. $n-1 are in one node
579     # and keys $n+1 ... $size are in the other.
580    
581     =cut
582    
583     sub halves {
584     my $self = shift;
585     my $n = shift;
586     my $s = $self->size;
587     my @right;
588     my @left;
589    
590     $left[$KEYS] = [@{$self->[$KEYS]}[0 .. $n-1]];
591     $left[$DATA] = [@{$self->[$DATA]}[0 .. $n-1]];
592     $left[$SUBNODES] = [@{$self->[$SUBNODES]}[0 .. $n]];
593    
594     $right[$KEYS] = [@{$self->[$KEYS]}[$n+1 .. $s-1]];
595     $right[$DATA] = [@{$self->[$DATA]}[$n+1 .. $s-1]];
596     $right[$SUBNODES] = [@{$self->[$SUBNODES]}[$n+1 .. $s]];
597    
598     my @middle = ($self->[$KEYS][$n], $self->[$DATA][$n]);
599    
600     ($self->new(@left), $self->new(@right), \@middle);
601     }
602    
603     =head3 to_string
604    
605     Dumps tree as string
606    
607     my $str = $root->to_string;
608    
609     =cut
610    
611     sub to_string {
612     my $self = shift;
613     my $indent = shift || 0;
614     my $I = ' ' x $indent;
615     return '' if $self->is_empty;
616     my ($k, $d, $s) = @$self;
617     my $result = '';
618     $result .= defined($s->[0]) ? $s->[0]->to_string($indent+2) : '';
619     my $N = $self->size;
620     my $i;
621     for ($i = 0; $i < $N; $i++) {
622     # $result .= $I . "$k->[$i] => $d->[$i]\n";
623     $result .= $I . "$k->[$i]\n";
624     $result .= defined($s->[$i+1]) ? $s->[$i+1]->to_string($indent+2) : '';
625     }
626     $result;
627     }
628    
629     =begin comment
630    
631     use Data::Dumper;
632    
633     sub to_string {
634     my $self = shift;
635     my $indent = shift || 0;
636     my $path = shift || '0';
637     return '' if $self->is_empty;
638     my ($k, $d, $s) = @$self;
639     my $result = '';
640     $result .= defined($s->[0]) ? $s->[0]->to_string($indent+1,"$path/0") : '';
641     my $N = $self->size;
642     for (my $i = 0; $i < $N; $i++) {
643     my $dump = Dumper($d->[$i]);
644     $dump =~ s/[\n\r\s]+/ /gs;
645     $dump =~ s/\$VAR1\s*=\s*//;
646     $result .= sprintf("%-5s [%2d] %2s: %s => %s\n", $path, $i, $indent, $k->[$i], $dump);
647     $result .= defined($s->[$i+1]) ? $s->[$i+1]->to_string($indent+1,"$path/$i") : '';
648     }
649     $result;
650     }
651    
652     =end comment
653    
654     =head3 to_dot
655    
656     Recursivly walk nodes of tree
657    
658     =cut
659    
660     sub to_dot {
661     my $self = shift;
662     my $parent = shift;
663    
664     return '' if $self->is_empty;
665    
666     my $dot = '';
667    
668     my ($k, $d, $s) = @$self;
669     my $N = $self->size;
670    
671     my @dot_keys;
672    
673     my $node_name = $parent || '_';
674     $node_name =~ s/\W+//g;
675     $node_name .= " [$N]";
676    
677     for (my $i = 0; $i <= $N; $i++) {
678     if (my $key = $k->[$i]) {
679     push @dot_keys, qq{<$i>$key};
680     }
681     $dot .= $s->[$i]->to_dot(qq{"$node_name":$i}) if ($s->[$i]);
682     }
683     push @dot_keys, qq{<$N>...} if (! $self->is_leaf);
684    
685     my $label = join("|",@dot_keys);
686     $dot .= qq{"$node_name" [ shape=record, label="$label" ];\n};
687    
688     $dot .= qq{$parent -> "$node_name";\n} if ($parent);
689    
690     $dot;
691     }
692    
693     =head3 to_jsfind
694    
695     Create jsFind xml files
696    
697     my $nr=$tree->to_dot('/path/to/index','0');
698    
699     Returns number of elements created
700    
701     =cut
702    
703     sub to_jsfind {
704     my $self = shift;
705     my ($path,$file) = @_;
706    
707     return 0 if $self->is_empty;
708    
709     my $nr_keys = 0;
710    
711     my ($k, $d, $s) = @$self;
712     my $N = $self->size;
713    
714     my ($key_xml, $data_xml) = ("<n>","<d>");
715    
716     for (my $i = 0; $i <= $N; $i++) {
717     my $key = lc($k->[$i]);
718    
719     if ($key) {
720     $key_xml .= qq{<k>$key</k>};
721     $data_xml .= qq{<e>};
722     #use Data::Dumper;
723     #print Dumper($d->[$i]);
724     foreach my $path (keys %{$d->[$i]}) {
725     $data_xml .= '<l f="'.($d->[$i]->{$path}->{'f'} || 1).'" t="'.($d->[$i]->{$path}->{'t'} || 'no title').'">'.$path.'</l>';
726     $nr_keys++;
727     }
728     $data_xml .= qq{</e>};
729     }
730    
731     $nr_keys += $s->[$i]->to_jsfind("$path/$file","$i") if ($s->[$i]);
732     }
733    
734     $key_xml .= "</n>";
735     $data_xml .= "</d>";
736    
737     if (! -e $path) {
738     mkpath($path) || croak "can't create dir '$path': $!";
739     }
740    
741     open(K, "> ${path}/${file}.xml") || croak "can't open '$path/$file.xml': $!";
742     open(D, "> ${path}/_${file}.xml") || croak "can't open '$path/_$file.xml': $!";
743    
744     print K $key_xml;
745     print D $data_xml;
746    
747     close(K);
748     close(D);
749    
750     return $nr_keys;
751     }
752    
753     1;
754     __END__
755    
756     =head1 SEE ALSO
757    
758     jsFind web site L<http://www.elucidsoft.net/projects/jsfind/>
759    
760     B-Trees in perl web site L<http://perl.plover.com/BTree/>
761    
762     =head1 AUTHORS
763    
764     Mark-Jonson Dominus E<lt>mjd@pobox.comE<gt> wrote C<BTree.pm> which was
765     base for this module
766    
767     Shawn P. Garbett E<lt>shawn@elucidsoft.netE<gt> wrote jsFind
768    
769     Dobrica Pavlinusic E<lt>dpavlin@rot13.orgE<gt> wrote this module
770    
771     =head1 COPYRIGHT AND LICENSE
772    
773     Copyright (C) 2004 by Dobrica Pavlinusic
774    
775     This program is free software; you can redistribute it and/or modify it
776     under the terms of the GNU General Public License as published by the Free
777     Software Foundation; either version 2 of the License, or (at your option)
778     any later version. This program is distributed in the hope that it will be
779     useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
780     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
781     Public License for more details.
782    
783     =cut

  ViewVC Help
Powered by ViewVC 1.1.26