/[jsFind]/trunk/jsFind.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/jsFind.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 9 - (hide annotations)
Wed Jul 21 23:37:49 2004 UTC (19 years, 9 months ago) by dpavlin
File size: 16398 byte(s)
Version 0.02: API extension: to_jsfind now accepts also data and xml
encoding as optional parametars

1 dpavlin 1 package jsFind;
2    
3     use 5.008004;
4     use strict;
5     use warnings;
6    
7 dpavlin 9 our $VERSION = '0.02';
8 dpavlin 1
9     =head1 NAME
10    
11     jsFind - generate index for jsFind using B-Tree
12    
13     =head1 SYNOPSIS
14    
15     use jsFind;
16    
17    
18    
19     =head1 DESCRIPTION
20    
21     This module can be used to create index files for jsFind, powerful tool for
22     adding a search engine to a CDROM archive or catalog without requiring the
23     user to install anything.
24    
25     Main difference between this module and scripts delivered with jsFind are:
26    
27     =over 5
28    
29     =item *
30    
31     You don't need to use swish-e to create index
32    
33     =item *
34    
35     You can programatically (and incrementaly) create index for jsFind
36    
37     =back
38    
39     =head1 METHODS
40    
41     This module contains two packages C<jsFind> and C<jsFind::Node>.
42    
43     =head2 jsFind methods
44    
45     =cut
46    
47     use Exporter 'import';
48     use Carp;
49    
50     our @ISA = qw(Exporter);
51    
52     BEGIN {
53     import 'jsFind::Node';
54     }
55    
56     =head3 new
57    
58     Create new tree. Arguments are C<B> which is maximum numbers of keys in
59     each node and optional C<Root> node. Each root node may have child nodes.
60    
61     All nodes are objects from C<jsFind::Node>.
62    
63     my $t = new jsFind(B => 4);
64    
65     =cut
66    
67     my $DEBUG = 1;
68    
69     sub new {
70     my $package = shift;
71     my %ARGV = @_;
72     croak "Usage: {$package}::new(B => number [, Root => root node ])"
73     unless exists $ARGV{B};
74     if ($ARGV{B} % 2) {
75     my $B = $ARGV{B} + 1;
76     carp "B must be an even number. Using $B instead.";
77     $ARGV{B} = $B;
78     }
79    
80     my $B = $ARGV{B};
81     my $Root = exists($ARGV{Root}) ? $ARGV{Root} : jsFind::Node->emptynode;
82     bless { B => $B, Root => $Root } => $package;
83     }
84    
85     =head3 B_search
86    
87     Search, insert, append or replace data in B-Tree
88    
89 dpavlin 5 $t->B_search(
90     Key => 'key value',
91     Data => { "path" => {
92     "t" => "title of document",
93     "f" => 99,
94 dpavlin 7 },
95 dpavlin 5 },
96     Insert => 1,
97     Append => 1,
98     );
99 dpavlin 1
100     Semantics:
101    
102     If key not found, insert it iff C<Insert> argument is present.
103    
104     If key B<is> found, replace existing data iff C<Replace> argument
105     is present or add new datum to existing iff C<Append> argument is present.
106    
107     =cut
108    
109     sub B_search {
110     my $self = shift;
111     my %args = @_;
112     my $cur_node = $self->root;
113     my $k = $args{Key};
114     my $d = $args{Data};
115     my @path;
116    
117     if ($cur_node->is_empty) { # Special case for empty root
118     if ($args{Insert}) {
119     $cur_node->kdp_insert($k => $d);
120     return $d;
121     } else {
122     return undef;
123     }
124     }
125    
126     # Descend tree to leaf
127     for (;;) {
128    
129     # Didn't hit bottom yet.
130    
131     my($there, $where) = $cur_node->locate_key($k);
132     if ($there) { # Found it!
133     if ($args{Replace}) {
134     $cur_node->kdp_replace($where, $k => $d);
135     } elsif ($args{Append}) {
136     $cur_node->kdp_append($where, $k => $d);
137     }
138     return $cur_node->data($where);
139     }
140    
141     # Not here---must be in a subtree.
142    
143     if ($cur_node->is_leaf) { # But there are no subtrees
144     return undef unless $args{Insert}; # Search failed
145     # Stuff it in
146     $cur_node->kdp_insert($k => $d);
147     if ($self->node_overfull($cur_node)) { # Oops--there was no room.
148     $self->split_and_promote($cur_node, @path);
149     }
150     return $d;
151     }
152    
153     # There are subtrees, and the key is in one of them.
154    
155     push @path, [$cur_node, $where]; # Record path from root.
156    
157     # Move down to search the subtree
158     $cur_node = $cur_node->subnode($where);
159    
160     # and start over.
161     } # for (;;) ...
162    
163     croak ("How did I get here?");
164     }
165    
166    
167    
168     sub split_and_promote_old {
169     my $self = shift;
170     my ($cur_node, @path) = @_;
171    
172     for (;;) {
173     my ($newleft, $newright, $kdp) = $cur_node->halves($self->B / 2);
174     my ($up, $where) = @{pop @path};
175     if ($up) {
176     $up->kdp_insert(@$kdp);
177     my ($tthere, $twhere) = $up->locate_key($kdp->[0]);
178     croak "Couldn't find key `$kdp->[0]' in node after just inserting it!"
179     unless $tthere;
180     croak "`$kdp->[0]' went into node at `$twhere' instead of expected `$where'!"
181     unless $twhere == $where;
182     $up->subnode($where, $newleft);
183     $up->subnode($where+1, $newright);
184     return unless $self->node_overfull($up);
185     $cur_node = $up;
186     } else { # We're at the top; make a new root.
187     my $newroot = new jsFind::Node ([$kdp->[0]],
188     [$kdp->[1]],
189     [$newleft, $newright]);
190     $self->root($newroot);
191     return;
192     }
193     }
194    
195     }
196    
197     sub split_and_promote {
198     my $self = shift;
199     my ($cur_node, @path) = @_;
200    
201     for (;;) {
202     my ($newleft, $newright, $kdp) = $cur_node->halves($self->B / 2);
203     my ($up, $where) = @{pop @path} if (@path);
204     if ($up) {
205     $up->kdp_insert(@$kdp);
206     if ($DEBUG) {
207     my ($tthere, $twhere) = $up->locate_key($kdp->[0]);
208     croak "Couldn't find key `$kdp->[0]' in node after just inserting it!"
209     unless $tthere;
210     croak "`$kdp->[0]' went into node at `$twhere' instead of expected `$where'!"
211     unless $twhere == $where;
212     }
213     $up->subnode($where, $newleft);
214     $up->subnode($where+1, $newright);
215     return unless $self->node_overfull($up);
216     $cur_node = $up;
217     } else { # We're at the top; make a new root.
218     my $newroot = new jsFind::Node([$kdp->[0]],
219     [$kdp->[1]],
220     [$newleft, $newright]);
221     $self->root($newroot);
222     return;
223     }
224     }
225     }
226    
227     =head3 B
228    
229     Return B (maximum number of keys)
230    
231     my $max_size = $t->B;
232    
233     =cut
234    
235     sub B {
236     $_[0]{B};
237     }
238    
239     =head3 root
240    
241     Returns root node
242    
243     my $root = $t->root;
244    
245     =cut
246    
247     sub root {
248     my ($self, $newroot) = @_;
249     $self->{Root} = $newroot if defined $newroot;
250     $self->{Root};
251     }
252    
253     =head3 node_overfull
254    
255     Returns if node is overfull
256    
257     if ($node->node_overfull) { something }
258    
259     =cut
260    
261     sub node_overfull {
262     my $self = shift;
263     my $node = shift;
264     $node->size > $self->B;
265     }
266    
267     =head3 to_string
268    
269     Returns your tree as formatted string.
270    
271     my $text = $root->to_string;
272    
273     Mostly usefull for debugging as output leaves much to be desired.
274    
275     =cut
276    
277     sub to_string {
278     $_[0]->root->to_string;
279     }
280    
281     =head3 to_dot
282    
283     Create Graphviz graph of your tree
284    
285     my $dot_graph = $root->to_dot;
286    
287     =cut
288    
289     sub to_dot {
290     my $self = shift;
291    
292     my $dot = qq/digraph dns {\nrankdir=LR;\n/;
293     $dot .= $self->root->to_dot;
294     $dot .= qq/\n}\n/;
295    
296     return $dot;
297     }
298    
299     =head3 to_jsfind
300    
301     Create xml index files for jsFind. This should be called after
302     your B-Tree has been filled with data.
303    
304     $root->to_jsfind('/full/path/to/index/dir/');
305    
306     Returns number of nodes in created tree.
307    
308 dpavlin 9 There is also longer version if you want to recode your data charset
309     into different one (probably UTF-8):
310    
311     $root->to_jsfind('/full/path/to/index/dir/','ISO-8859-2','UTF-8');
312    
313 dpavlin 1 =cut
314    
315 dpavlin 9 my $iconv;
316    
317 dpavlin 1 sub to_jsfind {
318     my $self = shift;
319    
320     my $path = shift || confess "to_jsfind need path to your index!";
321    
322 dpavlin 9 my ($from_cp,$to_cp) = @_;
323     if ($from_cp && $to_cp) {
324     $iconv = Text::Iconv->new($from_cp,$to_cp);
325     }
326    
327 dpavlin 1 $path .= "/" if ($path =~ /\/$/);
328 dpavlin 7 carp "create directory for index '$path': $!" if (! -w $path);
329 dpavlin 1
330     return $self->root->to_jsfind($path,"0");
331     }
332    
333    
334     # private, default cmd function
335     sub default_cmp {
336     $_[0] cmp $_[1];
337     }
338    
339 dpavlin 9 =head3 _recode
340    
341     This is internal function to recode charset.
342    
343     =cut
344    
345     sub _recode {
346     my $self = shift;
347     my $text = shift || return;
348    
349     if ($iconv) {
350     return $iconv->convert($text) || $text;
351     } else {
352     return $text;
353     }
354     }
355    
356 dpavlin 1 #####################################################################
357    
358     =head2 jsFind::Node methods
359    
360     Each node has C<k> key-data pairs, with C<B> <= C<k> <= C<2B>, and
361     each has C<k+1> subnodes, which might be null.
362    
363     The node is a blessed reference to a list with three elements:
364    
365     ($keylist, $datalist, $subnodelist)
366    
367     each is a reference to a list list.
368    
369     The null node is represented by a blessed reference to an empty list.
370    
371     =cut
372    
373     package jsFind::Node;
374    
375     use warnings;
376     use strict;
377    
378     use Carp;
379     use File::Path;
380 dpavlin 9 use Text::Iconv;
381 dpavlin 1
382 dpavlin 9 use base 'jsFind';
383    
384 dpavlin 1 my $KEYS = 0;
385     my $DATA = 1;
386     my $SUBNODES = 2;
387    
388     =head3 new
389    
390     Create New node
391    
392     my $node = new jsFind::Node ($keylist, $datalist, $subnodelist);
393    
394     You can also mit argument list to create empty node.
395    
396     my $empty_node = new jsFind::Node;
397    
398     =cut
399    
400     sub new {
401     my $self = shift;
402     my $package = ref $self || $self;
403     croak "Internal error: jsFind::Node::new called with wrong number of arguments."
404     unless @_ == 3 || @_ == 0;
405     bless [@_] => $package;
406     }
407    
408     =head3 locate_key
409    
410     Locate key in node using linear search. This should probably be replaced
411     by binary search for better performance.
412    
413     my ($found, $index) = $node->locate_key($key, $cmp_coderef);
414    
415     Argument C<$cmp_coderef> is optional reference to custom comparison
416     operator.
417    
418     Returns (1, $index) if $key[$index] eq $key.
419    
420     Returns (0, $index) if key could be found in $subnode[$index].
421    
422     In scalar context, just returns 1 or 0.
423    
424     =cut
425    
426     sub locate_key {
427     # Use linear search for testing, replace with binary search.
428     my $self = shift;
429     my $key = shift;
430     my $cmp = shift || \&jsFind::default_cmp;
431     my $i;
432     my $cmp_result;
433     my $N = $self->size;
434     for ($i = 0; $i < $N; $i++) {
435     $cmp_result = &$cmp($key, $self->key($i));
436     last if $cmp_result <= 0;
437     }
438    
439     # $i is now the index of the first node-key greater than $key
440     # or $N if there is no such. $cmp_result is 0 iff the key was found.
441     (!$cmp_result, $i);
442     }
443    
444    
445     =head3 emptynode
446    
447     Creates new empty node
448    
449     $node = $root->emptynode;
450     $new_node = $node->emptynode;
451    
452     =cut
453    
454     sub emptynode {
455     new($_[0]); # Pass package name, but not anything else.
456     }
457    
458     =head3 is_empty
459    
460     Test if node is empty
461    
462     if ($node->is_empty) { something }
463    
464     =cut
465    
466     # undef is empty; so is a blessed empty list.
467     sub is_empty {
468     my $self = shift;
469     !defined($self) || $#$self < 0;
470     }
471    
472     =head3 key
473    
474     Return C<$i>th key from node
475    
476     my $key = $node->key($i);
477    
478     =cut
479    
480     sub key {
481     # my ($self, $n) = @_;
482     # $self->[$KEYS][$n];
483    
484     # speedup
485     $_[0]->[$KEYS][$_[1]];
486     }
487    
488     =head3 data
489    
490     Return C<$i>th data from node
491    
492     my $data = $node->data($i);
493    
494     =cut
495    
496     sub data {
497     my ($self, $n) = @_;
498     $self->[$DATA][$n];
499     }
500    
501     =head3 kdp_replace
502    
503     Set key data pair for C<$i>th element in node
504    
505     $node->kdp_replace($i, "key value" => {
506     "data key 1" => "data value 1",
507     "data key 2" => "data value 2",
508     };
509    
510     =cut
511    
512     sub kdp_replace {
513     my ($self, $n, $k => $d) = @_;
514     if (defined $k) {
515     $self->[$KEYS][$n] = $k;
516     $self->[$DATA][$n] = $d;
517     }
518     [$self->[$KEYS][$n],
519     $self->[$DATA][$n]];
520     }
521    
522     =head3 kdp_insert
523    
524     # No return value.
525    
526     =cut
527    
528     sub kdp_insert {
529     my $self = shift;
530     my ($k => $d) = @_;
531     my ($there, $where) = $self->locate_key($k) unless $self->is_empty;
532    
533     if ($there) { croak("Tried to insert `$k => $d' into node where `$k' was already present."); }
534    
535     # undef fix
536     $where ||= 0;
537    
538     splice(@{$self->[$KEYS]}, $where, 0, $k);
539     splice(@{$self->[$DATA]}, $where, 0, $d);
540     splice(@{$self->[$SUBNODES]}, $where, 0, undef);
541     }
542    
543     =head3 kdp_append
544    
545     Adds new data keys and values to C<$i>th element in node
546    
547     $node->kdp_append($i, "key value" => {
548     "added data key" => "added data value",
549     };
550    
551     =cut
552    
553     sub kdp_append {
554     my ($self, $n, $k => $d) = @_;
555     if (defined $k) {
556     $self->[$KEYS][$n] = $k;
557     my ($kv,$dv) = %{$d};
558     $self->[$DATA][$n]->{$kv} = $dv;
559     }
560     [$self->[$KEYS][$n],
561     $self->[$DATA][$n]];
562     }
563    
564     =head3 subnode
565    
566     Set new or return existing subnode
567    
568     # return 4th subnode
569     my $my_node = $node->subnode(4);
570    
571     # create new subnode 5 from $my_node
572     $node->subnode(5, $my_node);
573    
574     =cut
575    
576     sub subnode {
577     my ($self, $n, $newnode) = @_;
578     $self->[$SUBNODES][$n] = $newnode if defined $newnode;
579     $self->[$SUBNODES][$n];
580     }
581    
582     =head3 is_leaf
583    
584     Test if node is leaf
585    
586     if ($node->is_leaf) { something }
587    
588     =cut
589    
590     sub is_leaf {
591     my $self = shift;
592     ! defined $self->[$SUBNODES][0]; # undefined subnode means leaf node.
593     }
594    
595     =head3 size
596    
597     Return number of keys in the node
598    
599     my $nr = $node->size;
600    
601     =cut
602    
603     sub size {
604     my $self = shift;
605     return scalar(@{$self->[$KEYS]});
606     }
607    
608     =head3 halves
609    
610     # Accept an index $n
611     # Divide into two nodes so that keys 0 .. $n-1 are in one node
612     # and keys $n+1 ... $size are in the other.
613    
614     =cut
615    
616     sub halves {
617     my $self = shift;
618     my $n = shift;
619     my $s = $self->size;
620     my @right;
621     my @left;
622    
623     $left[$KEYS] = [@{$self->[$KEYS]}[0 .. $n-1]];
624     $left[$DATA] = [@{$self->[$DATA]}[0 .. $n-1]];
625     $left[$SUBNODES] = [@{$self->[$SUBNODES]}[0 .. $n]];
626    
627     $right[$KEYS] = [@{$self->[$KEYS]}[$n+1 .. $s-1]];
628     $right[$DATA] = [@{$self->[$DATA]}[$n+1 .. $s-1]];
629     $right[$SUBNODES] = [@{$self->[$SUBNODES]}[$n+1 .. $s]];
630    
631     my @middle = ($self->[$KEYS][$n], $self->[$DATA][$n]);
632    
633     ($self->new(@left), $self->new(@right), \@middle);
634     }
635    
636     =head3 to_string
637    
638     Dumps tree as string
639    
640     my $str = $root->to_string;
641    
642     =cut
643    
644     sub to_string {
645     my $self = shift;
646     my $indent = shift || 0;
647     my $I = ' ' x $indent;
648     return '' if $self->is_empty;
649     my ($k, $d, $s) = @$self;
650     my $result = '';
651     $result .= defined($s->[0]) ? $s->[0]->to_string($indent+2) : '';
652     my $N = $self->size;
653     my $i;
654     for ($i = 0; $i < $N; $i++) {
655     # $result .= $I . "$k->[$i] => $d->[$i]\n";
656     $result .= $I . "$k->[$i]\n";
657     $result .= defined($s->[$i+1]) ? $s->[$i+1]->to_string($indent+2) : '';
658     }
659     $result;
660     }
661    
662     =begin comment
663    
664     use Data::Dumper;
665    
666     sub to_string {
667     my $self = shift;
668     my $indent = shift || 0;
669     my $path = shift || '0';
670     return '' if $self->is_empty;
671     my ($k, $d, $s) = @$self;
672     my $result = '';
673     $result .= defined($s->[0]) ? $s->[0]->to_string($indent+1,"$path/0") : '';
674     my $N = $self->size;
675     for (my $i = 0; $i < $N; $i++) {
676     my $dump = Dumper($d->[$i]);
677     $dump =~ s/[\n\r\s]+/ /gs;
678     $dump =~ s/\$VAR1\s*=\s*//;
679     $result .= sprintf("%-5s [%2d] %2s: %s => %s\n", $path, $i, $indent, $k->[$i], $dump);
680     $result .= defined($s->[$i+1]) ? $s->[$i+1]->to_string($indent+1,"$path/$i") : '';
681     }
682     $result;
683     }
684    
685     =end comment
686    
687     =head3 to_dot
688    
689     Recursivly walk nodes of tree
690    
691     =cut
692    
693     sub to_dot {
694     my $self = shift;
695     my $parent = shift;
696    
697     return '' if $self->is_empty;
698    
699     my $dot = '';
700    
701     my ($k, $d, $s) = @$self;
702     my $N = $self->size;
703    
704     my @dot_keys;
705    
706     my $node_name = $parent || '_';
707     $node_name =~ s/\W+//g;
708     $node_name .= " [$N]";
709    
710     for (my $i = 0; $i <= $N; $i++) {
711     if (my $key = $k->[$i]) {
712     push @dot_keys, qq{<$i>$key};
713     }
714     $dot .= $s->[$i]->to_dot(qq{"$node_name":$i}) if ($s->[$i]);
715     }
716     push @dot_keys, qq{<$N>...} if (! $self->is_leaf);
717    
718     my $label = join("|",@dot_keys);
719     $dot .= qq{"$node_name" [ shape=record, label="$label" ];\n};
720    
721     $dot .= qq{$parent -> "$node_name";\n} if ($parent);
722    
723     $dot;
724     }
725    
726     =head3 to_jsfind
727    
728     Create jsFind xml files
729    
730 dpavlin 8 my $nr=$tree->to_jsfind('/path/to/index','0');
731 dpavlin 1
732     Returns number of elements created
733    
734     =cut
735    
736     sub to_jsfind {
737     my $self = shift;
738     my ($path,$file) = @_;
739    
740     return 0 if $self->is_empty;
741    
742 dpavlin 8 confess("path is undefined.") unless ($path);
743     confess("file is undefined. Did you call \$t->root->to_jsfind(..) instead of \$t->to_jsfind(..) ?") unless (defined($file));
744    
745 dpavlin 1 my $nr_keys = 0;
746    
747     my ($k, $d, $s) = @$self;
748     my $N = $self->size;
749    
750     my ($key_xml, $data_xml) = ("<n>","<d>");
751    
752     for (my $i = 0; $i <= $N; $i++) {
753     my $key = lc($k->[$i]);
754    
755     if ($key) {
756     $key_xml .= qq{<k>$key</k>};
757     $data_xml .= qq{<e>};
758     #use Data::Dumper;
759     #print Dumper($d->[$i]);
760     foreach my $path (keys %{$d->[$i]}) {
761     $data_xml .= '<l f="'.($d->[$i]->{$path}->{'f'} || 1).'" t="'.($d->[$i]->{$path}->{'t'} || 'no title').'">'.$path.'</l>';
762     $nr_keys++;
763     }
764     $data_xml .= qq{</e>};
765     }
766    
767     $nr_keys += $s->[$i]->to_jsfind("$path/$file","$i") if ($s->[$i]);
768     }
769    
770     $key_xml .= "</n>";
771     $data_xml .= "</d>";
772    
773     if (! -e $path) {
774     mkpath($path) || croak "can't create dir '$path': $!";
775     }
776    
777     open(K, "> ${path}/${file}.xml") || croak "can't open '$path/$file.xml': $!";
778     open(D, "> ${path}/_${file}.xml") || croak "can't open '$path/_$file.xml': $!";
779    
780 dpavlin 9 print K $self->SUPER::_recode($key_xml);
781     print D $self->SUPER::_recode($data_xml);
782 dpavlin 1
783     close(K);
784     close(D);
785    
786     return $nr_keys;
787     }
788    
789     1;
790     __END__
791    
792     =head1 SEE ALSO
793    
794     jsFind web site L<http://www.elucidsoft.net/projects/jsfind/>
795    
796     B-Trees in perl web site L<http://perl.plover.com/BTree/>
797    
798     =head1 AUTHORS
799    
800     Mark-Jonson Dominus E<lt>mjd@pobox.comE<gt> wrote C<BTree.pm> which was
801     base for this module
802    
803     Shawn P. Garbett E<lt>shawn@elucidsoft.netE<gt> wrote jsFind
804    
805     Dobrica Pavlinusic E<lt>dpavlin@rot13.orgE<gt> wrote this module
806    
807     =head1 COPYRIGHT AND LICENSE
808    
809     Copyright (C) 2004 by Dobrica Pavlinusic
810    
811     This program is free software; you can redistribute it and/or modify it
812     under the terms of the GNU General Public License as published by the Free
813     Software Foundation; either version 2 of the License, or (at your option)
814     any later version. This program is distributed in the hope that it will be
815     useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
816     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
817     Public License for more details.
818    
819     =cut

  ViewVC Help
Powered by ViewVC 1.1.26