/[jsFind]/trunk/jsFind.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/jsFind.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1 - (hide annotations)
Sun Jul 11 20:18:25 2004 UTC (19 years, 10 months ago) by dpavlin
File size: 15474 byte(s)
initial import into subversion of version 0.1

1 dpavlin 1 package jsFind;
2    
3     use 5.008004;
4     use strict;
5     use warnings;
6    
7     our $VERSION = '0.01';
8    
9     =head1 NAME
10    
11     jsFind - generate index for jsFind using B-Tree
12    
13     =head1 SYNOPSIS
14    
15     use jsFind;
16    
17    
18    
19     =head1 DESCRIPTION
20    
21     This module can be used to create index files for jsFind, powerful tool for
22     adding a search engine to a CDROM archive or catalog without requiring the
23     user to install anything.
24    
25     Main difference between this module and scripts delivered with jsFind are:
26    
27     =over 5
28    
29     =item *
30    
31     You don't need to use swish-e to create index
32    
33     =item *
34    
35     You can programatically (and incrementaly) create index for jsFind
36    
37     =back
38    
39     =head1 METHODS
40    
41     This module contains two packages C<jsFind> and C<jsFind::Node>.
42    
43     =head2 jsFind methods
44    
45     =cut
46    
47     use Exporter 'import';
48     use Carp;
49    
50     our @ISA = qw(Exporter);
51    
52     BEGIN {
53     import 'jsFind::Node';
54     }
55    
56     =head3 new
57    
58     Create new tree. Arguments are C<B> which is maximum numbers of keys in
59     each node and optional C<Root> node. Each root node may have child nodes.
60    
61     All nodes are objects from C<jsFind::Node>.
62    
63     my $t = new jsFind(B => 4);
64    
65     =cut
66    
67     my $DEBUG = 1;
68    
69     sub new {
70     my $package = shift;
71     my %ARGV = @_;
72     croak "Usage: {$package}::new(B => number [, Root => root node ])"
73     unless exists $ARGV{B};
74     if ($ARGV{B} % 2) {
75     my $B = $ARGV{B} + 1;
76     carp "B must be an even number. Using $B instead.";
77     $ARGV{B} = $B;
78     }
79    
80     my $B = $ARGV{B};
81     my $Root = exists($ARGV{Root}) ? $ARGV{Root} : jsFind::Node->emptynode;
82     bless { B => $B, Root => $Root } => $package;
83     }
84    
85     =head3 B_search
86    
87     Search, insert, append or replace data in B-Tree
88    
89    
90    
91     Semantics:
92    
93     If key not found, insert it iff C<Insert> argument is present.
94    
95     If key B<is> found, replace existing data iff C<Replace> argument
96     is present or add new datum to existing iff C<Append> argument is present.
97    
98     =cut
99    
100     sub B_search {
101     my $self = shift;
102     my %args = @_;
103     my $cur_node = $self->root;
104     my $k = $args{Key};
105     my $d = $args{Data};
106     my @path;
107    
108     if ($cur_node->is_empty) { # Special case for empty root
109     if ($args{Insert}) {
110     $cur_node->kdp_insert($k => $d);
111     return $d;
112     } else {
113     return undef;
114     }
115     }
116    
117     # Descend tree to leaf
118     for (;;) {
119    
120     # Didn't hit bottom yet.
121    
122     my($there, $where) = $cur_node->locate_key($k);
123     if ($there) { # Found it!
124     if ($args{Replace}) {
125     $cur_node->kdp_replace($where, $k => $d);
126     } elsif ($args{Append}) {
127     $cur_node->kdp_append($where, $k => $d);
128     }
129     return $cur_node->data($where);
130     }
131    
132     # Not here---must be in a subtree.
133    
134     if ($cur_node->is_leaf) { # But there are no subtrees
135     return undef unless $args{Insert}; # Search failed
136     # Stuff it in
137     $cur_node->kdp_insert($k => $d);
138     if ($self->node_overfull($cur_node)) { # Oops--there was no room.
139     $self->split_and_promote($cur_node, @path);
140     }
141     return $d;
142     }
143    
144     # There are subtrees, and the key is in one of them.
145    
146     push @path, [$cur_node, $where]; # Record path from root.
147    
148     # Move down to search the subtree
149     $cur_node = $cur_node->subnode($where);
150    
151     # and start over.
152     } # for (;;) ...
153    
154     croak ("How did I get here?");
155     }
156    
157    
158    
159     sub split_and_promote_old {
160     my $self = shift;
161     my ($cur_node, @path) = @_;
162    
163     for (;;) {
164     my ($newleft, $newright, $kdp) = $cur_node->halves($self->B / 2);
165     my ($up, $where) = @{pop @path};
166     if ($up) {
167     $up->kdp_insert(@$kdp);
168     my ($tthere, $twhere) = $up->locate_key($kdp->[0]);
169     croak "Couldn't find key `$kdp->[0]' in node after just inserting it!"
170     unless $tthere;
171     croak "`$kdp->[0]' went into node at `$twhere' instead of expected `$where'!"
172     unless $twhere == $where;
173     $up->subnode($where, $newleft);
174     $up->subnode($where+1, $newright);
175     return unless $self->node_overfull($up);
176     $cur_node = $up;
177     } else { # We're at the top; make a new root.
178     my $newroot = new jsFind::Node ([$kdp->[0]],
179     [$kdp->[1]],
180     [$newleft, $newright]);
181     $self->root($newroot);
182     return;
183     }
184     }
185    
186     }
187    
188     sub split_and_promote {
189     my $self = shift;
190     my ($cur_node, @path) = @_;
191    
192     for (;;) {
193     my ($newleft, $newright, $kdp) = $cur_node->halves($self->B / 2);
194     my ($up, $where) = @{pop @path} if (@path);
195     if ($up) {
196     $up->kdp_insert(@$kdp);
197     if ($DEBUG) {
198     my ($tthere, $twhere) = $up->locate_key($kdp->[0]);
199     croak "Couldn't find key `$kdp->[0]' in node after just inserting it!"
200     unless $tthere;
201     croak "`$kdp->[0]' went into node at `$twhere' instead of expected `$where'!"
202     unless $twhere == $where;
203     }
204     $up->subnode($where, $newleft);
205     $up->subnode($where+1, $newright);
206     return unless $self->node_overfull($up);
207     $cur_node = $up;
208     } else { # We're at the top; make a new root.
209     my $newroot = new jsFind::Node([$kdp->[0]],
210     [$kdp->[1]],
211     [$newleft, $newright]);
212     $self->root($newroot);
213     return;
214     }
215     }
216     }
217    
218     =head3 B
219    
220     Return B (maximum number of keys)
221    
222     my $max_size = $t->B;
223    
224     =cut
225    
226     sub B {
227     $_[0]{B};
228     }
229    
230     =head3 root
231    
232     Returns root node
233    
234     my $root = $t->root;
235    
236     =cut
237    
238     sub root {
239     my ($self, $newroot) = @_;
240     $self->{Root} = $newroot if defined $newroot;
241     $self->{Root};
242     }
243    
244     =head3 node_overfull
245    
246     Returns if node is overfull
247    
248     if ($node->node_overfull) { something }
249    
250     =cut
251    
252     sub node_overfull {
253     my $self = shift;
254     my $node = shift;
255     $node->size > $self->B;
256     }
257    
258     =head3 to_string
259    
260     Returns your tree as formatted string.
261    
262     my $text = $root->to_string;
263    
264     Mostly usefull for debugging as output leaves much to be desired.
265    
266     =cut
267    
268     sub to_string {
269     $_[0]->root->to_string;
270     }
271    
272     =head3 to_dot
273    
274     Create Graphviz graph of your tree
275    
276     my $dot_graph = $root->to_dot;
277    
278     =cut
279    
280     sub to_dot {
281     my $self = shift;
282    
283     my $dot = qq/digraph dns {\nrankdir=LR;\n/;
284     $dot .= $self->root->to_dot;
285     $dot .= qq/\n}\n/;
286    
287     return $dot;
288     }
289    
290     =head3 to_jsfind
291    
292     Create xml index files for jsFind. This should be called after
293     your B-Tree has been filled with data.
294    
295     $root->to_jsfind('/full/path/to/index/dir/');
296    
297     Returns number of nodes in created tree.
298    
299     =cut
300    
301     sub to_jsfind {
302     my $self = shift;
303    
304     my $path = shift || confess "to_jsfind need path to your index!";
305    
306     $path .= "/" if ($path =~ /\/$/);
307     carp "can't create index in '$path': $!" if (! -w $path);
308    
309     return $self->root->to_jsfind($path,"0");
310     }
311    
312    
313     # private, default cmd function
314     sub default_cmp {
315     $_[0] cmp $_[1];
316     }
317    
318     #####################################################################
319    
320     =head2 jsFind::Node methods
321    
322     Each node has C<k> key-data pairs, with C<B> <= C<k> <= C<2B>, and
323     each has C<k+1> subnodes, which might be null.
324    
325     The node is a blessed reference to a list with three elements:
326    
327     ($keylist, $datalist, $subnodelist)
328    
329     each is a reference to a list list.
330    
331     The null node is represented by a blessed reference to an empty list.
332    
333     =cut
334    
335     package jsFind::Node;
336    
337     use warnings;
338     use strict;
339    
340     use Carp;
341     use File::Path;
342    
343     my $KEYS = 0;
344     my $DATA = 1;
345     my $SUBNODES = 2;
346    
347     =head3 new
348    
349     Create New node
350    
351     my $node = new jsFind::Node ($keylist, $datalist, $subnodelist);
352    
353     You can also mit argument list to create empty node.
354    
355     my $empty_node = new jsFind::Node;
356    
357     =cut
358    
359     sub new {
360     my $self = shift;
361     my $package = ref $self || $self;
362     croak "Internal error: jsFind::Node::new called with wrong number of arguments."
363     unless @_ == 3 || @_ == 0;
364     bless [@_] => $package;
365     }
366    
367     =head3 locate_key
368    
369     Locate key in node using linear search. This should probably be replaced
370     by binary search for better performance.
371    
372     my ($found, $index) = $node->locate_key($key, $cmp_coderef);
373    
374     Argument C<$cmp_coderef> is optional reference to custom comparison
375     operator.
376    
377     Returns (1, $index) if $key[$index] eq $key.
378    
379     Returns (0, $index) if key could be found in $subnode[$index].
380    
381     In scalar context, just returns 1 or 0.
382    
383     =cut
384    
385     sub locate_key {
386     # Use linear search for testing, replace with binary search.
387     my $self = shift;
388     my $key = shift;
389     my $cmp = shift || \&jsFind::default_cmp;
390     my $i;
391     my $cmp_result;
392     my $N = $self->size;
393     for ($i = 0; $i < $N; $i++) {
394     $cmp_result = &$cmp($key, $self->key($i));
395     last if $cmp_result <= 0;
396     }
397    
398     # $i is now the index of the first node-key greater than $key
399     # or $N if there is no such. $cmp_result is 0 iff the key was found.
400     (!$cmp_result, $i);
401     }
402    
403    
404     =head3 emptynode
405    
406     Creates new empty node
407    
408     $node = $root->emptynode;
409     $new_node = $node->emptynode;
410    
411     =cut
412    
413     sub emptynode {
414     new($_[0]); # Pass package name, but not anything else.
415     }
416    
417     =head3 is_empty
418    
419     Test if node is empty
420    
421     if ($node->is_empty) { something }
422    
423     =cut
424    
425     # undef is empty; so is a blessed empty list.
426     sub is_empty {
427     my $self = shift;
428     !defined($self) || $#$self < 0;
429     }
430    
431     =head3 key
432    
433     Return C<$i>th key from node
434    
435     my $key = $node->key($i);
436    
437     =cut
438    
439     sub key {
440     # my ($self, $n) = @_;
441     # $self->[$KEYS][$n];
442    
443     # speedup
444     $_[0]->[$KEYS][$_[1]];
445     }
446    
447     =head3 data
448    
449     Return C<$i>th data from node
450    
451     my $data = $node->data($i);
452    
453     =cut
454    
455     sub data {
456     my ($self, $n) = @_;
457     $self->[$DATA][$n];
458     }
459    
460     =head3 kdp_replace
461    
462     Set key data pair for C<$i>th element in node
463    
464     $node->kdp_replace($i, "key value" => {
465     "data key 1" => "data value 1",
466     "data key 2" => "data value 2",
467     };
468    
469     =cut
470    
471     sub kdp_replace {
472     my ($self, $n, $k => $d) = @_;
473     if (defined $k) {
474     $self->[$KEYS][$n] = $k;
475     $self->[$DATA][$n] = $d;
476     }
477     [$self->[$KEYS][$n],
478     $self->[$DATA][$n]];
479     }
480    
481     =head3 kdp_insert
482    
483     # No return value.
484    
485     =cut
486    
487     sub kdp_insert {
488     my $self = shift;
489     my ($k => $d) = @_;
490     my ($there, $where) = $self->locate_key($k) unless $self->is_empty;
491    
492     if ($there) { croak("Tried to insert `$k => $d' into node where `$k' was already present."); }
493    
494     # undef fix
495     $where ||= 0;
496    
497     splice(@{$self->[$KEYS]}, $where, 0, $k);
498     splice(@{$self->[$DATA]}, $where, 0, $d);
499     splice(@{$self->[$SUBNODES]}, $where, 0, undef);
500     }
501    
502     =head3 kdp_append
503    
504     Adds new data keys and values to C<$i>th element in node
505    
506     $node->kdp_append($i, "key value" => {
507     "added data key" => "added data value",
508     };
509    
510     =cut
511    
512     sub kdp_append {
513     my ($self, $n, $k => $d) = @_;
514     if (defined $k) {
515     $self->[$KEYS][$n] = $k;
516     my ($kv,$dv) = %{$d};
517     $self->[$DATA][$n]->{$kv} = $dv;
518     }
519     [$self->[$KEYS][$n],
520     $self->[$DATA][$n]];
521     }
522    
523     =head3 subnode
524    
525     Set new or return existing subnode
526    
527     # return 4th subnode
528     my $my_node = $node->subnode(4);
529    
530     # create new subnode 5 from $my_node
531     $node->subnode(5, $my_node);
532    
533     =cut
534    
535     sub subnode {
536     my ($self, $n, $newnode) = @_;
537     $self->[$SUBNODES][$n] = $newnode if defined $newnode;
538     $self->[$SUBNODES][$n];
539     }
540    
541     =head3 is_leaf
542    
543     Test if node is leaf
544    
545     if ($node->is_leaf) { something }
546    
547     =cut
548    
549     sub is_leaf {
550     my $self = shift;
551     ! defined $self->[$SUBNODES][0]; # undefined subnode means leaf node.
552     }
553    
554     =head3 size
555    
556     Return number of keys in the node
557    
558     my $nr = $node->size;
559    
560     =cut
561    
562     sub size {
563     my $self = shift;
564     return scalar(@{$self->[$KEYS]});
565     }
566    
567     =head3 halves
568    
569     # Accept an index $n
570     # Divide into two nodes so that keys 0 .. $n-1 are in one node
571     # and keys $n+1 ... $size are in the other.
572    
573     =cut
574    
575     sub halves {
576     my $self = shift;
577     my $n = shift;
578     my $s = $self->size;
579     my @right;
580     my @left;
581    
582     $left[$KEYS] = [@{$self->[$KEYS]}[0 .. $n-1]];
583     $left[$DATA] = [@{$self->[$DATA]}[0 .. $n-1]];
584     $left[$SUBNODES] = [@{$self->[$SUBNODES]}[0 .. $n]];
585    
586     $right[$KEYS] = [@{$self->[$KEYS]}[$n+1 .. $s-1]];
587     $right[$DATA] = [@{$self->[$DATA]}[$n+1 .. $s-1]];
588     $right[$SUBNODES] = [@{$self->[$SUBNODES]}[$n+1 .. $s]];
589    
590     my @middle = ($self->[$KEYS][$n], $self->[$DATA][$n]);
591    
592     ($self->new(@left), $self->new(@right), \@middle);
593     }
594    
595     =head3 to_string
596    
597     Dumps tree as string
598    
599     my $str = $root->to_string;
600    
601     =cut
602    
603     sub to_string {
604     my $self = shift;
605     my $indent = shift || 0;
606     my $I = ' ' x $indent;
607     return '' if $self->is_empty;
608     my ($k, $d, $s) = @$self;
609     my $result = '';
610     $result .= defined($s->[0]) ? $s->[0]->to_string($indent+2) : '';
611     my $N = $self->size;
612     my $i;
613     for ($i = 0; $i < $N; $i++) {
614     # $result .= $I . "$k->[$i] => $d->[$i]\n";
615     $result .= $I . "$k->[$i]\n";
616     $result .= defined($s->[$i+1]) ? $s->[$i+1]->to_string($indent+2) : '';
617     }
618     $result;
619     }
620    
621     =begin comment
622    
623     use Data::Dumper;
624    
625     sub to_string {
626     my $self = shift;
627     my $indent = shift || 0;
628     my $path = shift || '0';
629     return '' if $self->is_empty;
630     my ($k, $d, $s) = @$self;
631     my $result = '';
632     $result .= defined($s->[0]) ? $s->[0]->to_string($indent+1,"$path/0") : '';
633     my $N = $self->size;
634     for (my $i = 0; $i < $N; $i++) {
635     my $dump = Dumper($d->[$i]);
636     $dump =~ s/[\n\r\s]+/ /gs;
637     $dump =~ s/\$VAR1\s*=\s*//;
638     $result .= sprintf("%-5s [%2d] %2s: %s => %s\n", $path, $i, $indent, $k->[$i], $dump);
639     $result .= defined($s->[$i+1]) ? $s->[$i+1]->to_string($indent+1,"$path/$i") : '';
640     }
641     $result;
642     }
643    
644     =end comment
645    
646     =head3 to_dot
647    
648     Recursivly walk nodes of tree
649    
650     =cut
651    
652     sub to_dot {
653     my $self = shift;
654     my $parent = shift;
655    
656     return '' if $self->is_empty;
657    
658     my $dot = '';
659    
660     my ($k, $d, $s) = @$self;
661     my $N = $self->size;
662    
663     my @dot_keys;
664    
665     my $node_name = $parent || '_';
666     $node_name =~ s/\W+//g;
667     $node_name .= " [$N]";
668    
669     for (my $i = 0; $i <= $N; $i++) {
670     if (my $key = $k->[$i]) {
671     push @dot_keys, qq{<$i>$key};
672     }
673     $dot .= $s->[$i]->to_dot(qq{"$node_name":$i}) if ($s->[$i]);
674     }
675     push @dot_keys, qq{<$N>...} if (! $self->is_leaf);
676    
677     my $label = join("|",@dot_keys);
678     $dot .= qq{"$node_name" [ shape=record, label="$label" ];\n};
679    
680     $dot .= qq{$parent -> "$node_name";\n} if ($parent);
681    
682     $dot;
683     }
684    
685     =head3 to_jsfind
686    
687     Create jsFind xml files
688    
689     my $nr=$tree->to_dot('/path/to/index','0');
690    
691     Returns number of elements created
692    
693     =cut
694    
695     sub to_jsfind {
696     my $self = shift;
697     my ($path,$file) = @_;
698    
699     return 0 if $self->is_empty;
700    
701     my $nr_keys = 0;
702    
703     my ($k, $d, $s) = @$self;
704     my $N = $self->size;
705    
706     my ($key_xml, $data_xml) = ("<n>","<d>");
707    
708     for (my $i = 0; $i <= $N; $i++) {
709     my $key = lc($k->[$i]);
710    
711     if ($key) {
712     $key_xml .= qq{<k>$key</k>};
713     $data_xml .= qq{<e>};
714     #use Data::Dumper;
715     #print Dumper($d->[$i]);
716     foreach my $path (keys %{$d->[$i]}) {
717     $data_xml .= '<l f="'.($d->[$i]->{$path}->{'f'} || 1).'" t="'.($d->[$i]->{$path}->{'t'} || 'no title').'">'.$path.'</l>';
718     $nr_keys++;
719     }
720     $data_xml .= qq{</e>};
721     }
722    
723     $nr_keys += $s->[$i]->to_jsfind("$path/$file","$i") if ($s->[$i]);
724     }
725    
726     $key_xml .= "</n>";
727     $data_xml .= "</d>";
728    
729     if (! -e $path) {
730     mkpath($path) || croak "can't create dir '$path': $!";
731     }
732    
733     open(K, "> ${path}/${file}.xml") || croak "can't open '$path/$file.xml': $!";
734     open(D, "> ${path}/_${file}.xml") || croak "can't open '$path/_$file.xml': $!";
735    
736     print K $key_xml;
737     print D $data_xml;
738    
739     close(K);
740     close(D);
741    
742     return $nr_keys;
743     }
744    
745     1;
746     __END__
747    
748     =head1 SEE ALSO
749    
750     jsFind web site L<http://www.elucidsoft.net/projects/jsfind/>
751    
752     B-Trees in perl web site L<http://perl.plover.com/BTree/>
753    
754     =head1 AUTHORS
755    
756     Mark-Jonson Dominus E<lt>mjd@pobox.comE<gt> wrote C<BTree.pm> which was
757     base for this module
758    
759     Shawn P. Garbett E<lt>shawn@elucidsoft.netE<gt> wrote jsFind
760    
761     Dobrica Pavlinusic E<lt>dpavlin@rot13.orgE<gt> wrote this module
762    
763     =head1 COPYRIGHT AND LICENSE
764    
765     Copyright (C) 2004 by Dobrica Pavlinusic
766    
767     This program is free software; you can redistribute it and/or modify it
768     under the terms of the GNU General Public License as published by the Free
769     Software Foundation; either version 2 of the License, or (at your option)
770     any later version. This program is distributed in the hope that it will be
771     useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
772     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
773     Public License for more details.
774    
775     =cut

  ViewVC Help
Powered by ViewVC 1.1.26