/[jsFind]/trunk/jsFind.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/jsFind.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 5 - (show annotations)
Tue Jul 20 17:47:30 2004 UTC (19 years, 9 months ago) by dpavlin
File size: 15614 byte(s)
B_search documentation, updated MANIFEST

1 package jsFind;
2
3 use 5.008004;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.01';
8
9 =head1 NAME
10
11 jsFind - generate index for jsFind using B-Tree
12
13 =head1 SYNOPSIS
14
15 use jsFind;
16
17
18
19 =head1 DESCRIPTION
20
21 This module can be used to create index files for jsFind, powerful tool for
22 adding a search engine to a CDROM archive or catalog without requiring the
23 user to install anything.
24
25 Main difference between this module and scripts delivered with jsFind are:
26
27 =over 5
28
29 =item *
30
31 You don't need to use swish-e to create index
32
33 =item *
34
35 You can programatically (and incrementaly) create index for jsFind
36
37 =back
38
39 =head1 METHODS
40
41 This module contains two packages C<jsFind> and C<jsFind::Node>.
42
43 =head2 jsFind methods
44
45 =cut
46
47 use Exporter 'import';
48 use Carp;
49
50 our @ISA = qw(Exporter);
51
52 BEGIN {
53 import 'jsFind::Node';
54 }
55
56 =head3 new
57
58 Create new tree. Arguments are C<B> which is maximum numbers of keys in
59 each node and optional C<Root> node. Each root node may have child nodes.
60
61 All nodes are objects from C<jsFind::Node>.
62
63 my $t = new jsFind(B => 4);
64
65 =cut
66
67 my $DEBUG = 1;
68
69 sub new {
70 my $package = shift;
71 my %ARGV = @_;
72 croak "Usage: {$package}::new(B => number [, Root => root node ])"
73 unless exists $ARGV{B};
74 if ($ARGV{B} % 2) {
75 my $B = $ARGV{B} + 1;
76 carp "B must be an even number. Using $B instead.";
77 $ARGV{B} = $B;
78 }
79
80 my $B = $ARGV{B};
81 my $Root = exists($ARGV{Root}) ? $ARGV{Root} : jsFind::Node->emptynode;
82 bless { B => $B, Root => $Root } => $package;
83 }
84
85 =head3 B_search
86
87 Search, insert, append or replace data in B-Tree
88
89 $t->B_search(
90 Key => 'key value',
91 Data => { "path" => {
92 "t" => "title of document",
93 "f" => 99,
94 },
95 Insert => 1,
96 Append => 1,
97 );
98
99 Semantics:
100
101 If key not found, insert it iff C<Insert> argument is present.
102
103 If key B<is> found, replace existing data iff C<Replace> argument
104 is present or add new datum to existing iff C<Append> argument is present.
105
106 =cut
107
108 sub B_search {
109 my $self = shift;
110 my %args = @_;
111 my $cur_node = $self->root;
112 my $k = $args{Key};
113 my $d = $args{Data};
114 my @path;
115
116 if ($cur_node->is_empty) { # Special case for empty root
117 if ($args{Insert}) {
118 $cur_node->kdp_insert($k => $d);
119 return $d;
120 } else {
121 return undef;
122 }
123 }
124
125 # Descend tree to leaf
126 for (;;) {
127
128 # Didn't hit bottom yet.
129
130 my($there, $where) = $cur_node->locate_key($k);
131 if ($there) { # Found it!
132 if ($args{Replace}) {
133 $cur_node->kdp_replace($where, $k => $d);
134 } elsif ($args{Append}) {
135 $cur_node->kdp_append($where, $k => $d);
136 }
137 return $cur_node->data($where);
138 }
139
140 # Not here---must be in a subtree.
141
142 if ($cur_node->is_leaf) { # But there are no subtrees
143 return undef unless $args{Insert}; # Search failed
144 # Stuff it in
145 $cur_node->kdp_insert($k => $d);
146 if ($self->node_overfull($cur_node)) { # Oops--there was no room.
147 $self->split_and_promote($cur_node, @path);
148 }
149 return $d;
150 }
151
152 # There are subtrees, and the key is in one of them.
153
154 push @path, [$cur_node, $where]; # Record path from root.
155
156 # Move down to search the subtree
157 $cur_node = $cur_node->subnode($where);
158
159 # and start over.
160 } # for (;;) ...
161
162 croak ("How did I get here?");
163 }
164
165
166
167 sub split_and_promote_old {
168 my $self = shift;
169 my ($cur_node, @path) = @_;
170
171 for (;;) {
172 my ($newleft, $newright, $kdp) = $cur_node->halves($self->B / 2);
173 my ($up, $where) = @{pop @path};
174 if ($up) {
175 $up->kdp_insert(@$kdp);
176 my ($tthere, $twhere) = $up->locate_key($kdp->[0]);
177 croak "Couldn't find key `$kdp->[0]' in node after just inserting it!"
178 unless $tthere;
179 croak "`$kdp->[0]' went into node at `$twhere' instead of expected `$where'!"
180 unless $twhere == $where;
181 $up->subnode($where, $newleft);
182 $up->subnode($where+1, $newright);
183 return unless $self->node_overfull($up);
184 $cur_node = $up;
185 } else { # We're at the top; make a new root.
186 my $newroot = new jsFind::Node ([$kdp->[0]],
187 [$kdp->[1]],
188 [$newleft, $newright]);
189 $self->root($newroot);
190 return;
191 }
192 }
193
194 }
195
196 sub split_and_promote {
197 my $self = shift;
198 my ($cur_node, @path) = @_;
199
200 for (;;) {
201 my ($newleft, $newright, $kdp) = $cur_node->halves($self->B / 2);
202 my ($up, $where) = @{pop @path} if (@path);
203 if ($up) {
204 $up->kdp_insert(@$kdp);
205 if ($DEBUG) {
206 my ($tthere, $twhere) = $up->locate_key($kdp->[0]);
207 croak "Couldn't find key `$kdp->[0]' in node after just inserting it!"
208 unless $tthere;
209 croak "`$kdp->[0]' went into node at `$twhere' instead of expected `$where'!"
210 unless $twhere == $where;
211 }
212 $up->subnode($where, $newleft);
213 $up->subnode($where+1, $newright);
214 return unless $self->node_overfull($up);
215 $cur_node = $up;
216 } else { # We're at the top; make a new root.
217 my $newroot = new jsFind::Node([$kdp->[0]],
218 [$kdp->[1]],
219 [$newleft, $newright]);
220 $self->root($newroot);
221 return;
222 }
223 }
224 }
225
226 =head3 B
227
228 Return B (maximum number of keys)
229
230 my $max_size = $t->B;
231
232 =cut
233
234 sub B {
235 $_[0]{B};
236 }
237
238 =head3 root
239
240 Returns root node
241
242 my $root = $t->root;
243
244 =cut
245
246 sub root {
247 my ($self, $newroot) = @_;
248 $self->{Root} = $newroot if defined $newroot;
249 $self->{Root};
250 }
251
252 =head3 node_overfull
253
254 Returns if node is overfull
255
256 if ($node->node_overfull) { something }
257
258 =cut
259
260 sub node_overfull {
261 my $self = shift;
262 my $node = shift;
263 $node->size > $self->B;
264 }
265
266 =head3 to_string
267
268 Returns your tree as formatted string.
269
270 my $text = $root->to_string;
271
272 Mostly usefull for debugging as output leaves much to be desired.
273
274 =cut
275
276 sub to_string {
277 $_[0]->root->to_string;
278 }
279
280 =head3 to_dot
281
282 Create Graphviz graph of your tree
283
284 my $dot_graph = $root->to_dot;
285
286 =cut
287
288 sub to_dot {
289 my $self = shift;
290
291 my $dot = qq/digraph dns {\nrankdir=LR;\n/;
292 $dot .= $self->root->to_dot;
293 $dot .= qq/\n}\n/;
294
295 return $dot;
296 }
297
298 =head3 to_jsfind
299
300 Create xml index files for jsFind. This should be called after
301 your B-Tree has been filled with data.
302
303 $root->to_jsfind('/full/path/to/index/dir/');
304
305 Returns number of nodes in created tree.
306
307 =cut
308
309 sub to_jsfind {
310 my $self = shift;
311
312 my $path = shift || confess "to_jsfind need path to your index!";
313
314 $path .= "/" if ($path =~ /\/$/);
315 carp "can't create index in '$path': $!" if (! -w $path);
316
317 return $self->root->to_jsfind($path,"0");
318 }
319
320
321 # private, default cmd function
322 sub default_cmp {
323 $_[0] cmp $_[1];
324 }
325
326 #####################################################################
327
328 =head2 jsFind::Node methods
329
330 Each node has C<k> key-data pairs, with C<B> <= C<k> <= C<2B>, and
331 each has C<k+1> subnodes, which might be null.
332
333 The node is a blessed reference to a list with three elements:
334
335 ($keylist, $datalist, $subnodelist)
336
337 each is a reference to a list list.
338
339 The null node is represented by a blessed reference to an empty list.
340
341 =cut
342
343 package jsFind::Node;
344
345 use warnings;
346 use strict;
347
348 use Carp;
349 use File::Path;
350
351 my $KEYS = 0;
352 my $DATA = 1;
353 my $SUBNODES = 2;
354
355 =head3 new
356
357 Create New node
358
359 my $node = new jsFind::Node ($keylist, $datalist, $subnodelist);
360
361 You can also mit argument list to create empty node.
362
363 my $empty_node = new jsFind::Node;
364
365 =cut
366
367 sub new {
368 my $self = shift;
369 my $package = ref $self || $self;
370 croak "Internal error: jsFind::Node::new called with wrong number of arguments."
371 unless @_ == 3 || @_ == 0;
372 bless [@_] => $package;
373 }
374
375 =head3 locate_key
376
377 Locate key in node using linear search. This should probably be replaced
378 by binary search for better performance.
379
380 my ($found, $index) = $node->locate_key($key, $cmp_coderef);
381
382 Argument C<$cmp_coderef> is optional reference to custom comparison
383 operator.
384
385 Returns (1, $index) if $key[$index] eq $key.
386
387 Returns (0, $index) if key could be found in $subnode[$index].
388
389 In scalar context, just returns 1 or 0.
390
391 =cut
392
393 sub locate_key {
394 # Use linear search for testing, replace with binary search.
395 my $self = shift;
396 my $key = shift;
397 my $cmp = shift || \&jsFind::default_cmp;
398 my $i;
399 my $cmp_result;
400 my $N = $self->size;
401 for ($i = 0; $i < $N; $i++) {
402 $cmp_result = &$cmp($key, $self->key($i));
403 last if $cmp_result <= 0;
404 }
405
406 # $i is now the index of the first node-key greater than $key
407 # or $N if there is no such. $cmp_result is 0 iff the key was found.
408 (!$cmp_result, $i);
409 }
410
411
412 =head3 emptynode
413
414 Creates new empty node
415
416 $node = $root->emptynode;
417 $new_node = $node->emptynode;
418
419 =cut
420
421 sub emptynode {
422 new($_[0]); # Pass package name, but not anything else.
423 }
424
425 =head3 is_empty
426
427 Test if node is empty
428
429 if ($node->is_empty) { something }
430
431 =cut
432
433 # undef is empty; so is a blessed empty list.
434 sub is_empty {
435 my $self = shift;
436 !defined($self) || $#$self < 0;
437 }
438
439 =head3 key
440
441 Return C<$i>th key from node
442
443 my $key = $node->key($i);
444
445 =cut
446
447 sub key {
448 # my ($self, $n) = @_;
449 # $self->[$KEYS][$n];
450
451 # speedup
452 $_[0]->[$KEYS][$_[1]];
453 }
454
455 =head3 data
456
457 Return C<$i>th data from node
458
459 my $data = $node->data($i);
460
461 =cut
462
463 sub data {
464 my ($self, $n) = @_;
465 $self->[$DATA][$n];
466 }
467
468 =head3 kdp_replace
469
470 Set key data pair for C<$i>th element in node
471
472 $node->kdp_replace($i, "key value" => {
473 "data key 1" => "data value 1",
474 "data key 2" => "data value 2",
475 };
476
477 =cut
478
479 sub kdp_replace {
480 my ($self, $n, $k => $d) = @_;
481 if (defined $k) {
482 $self->[$KEYS][$n] = $k;
483 $self->[$DATA][$n] = $d;
484 }
485 [$self->[$KEYS][$n],
486 $self->[$DATA][$n]];
487 }
488
489 =head3 kdp_insert
490
491 # No return value.
492
493 =cut
494
495 sub kdp_insert {
496 my $self = shift;
497 my ($k => $d) = @_;
498 my ($there, $where) = $self->locate_key($k) unless $self->is_empty;
499
500 if ($there) { croak("Tried to insert `$k => $d' into node where `$k' was already present."); }
501
502 # undef fix
503 $where ||= 0;
504
505 splice(@{$self->[$KEYS]}, $where, 0, $k);
506 splice(@{$self->[$DATA]}, $where, 0, $d);
507 splice(@{$self->[$SUBNODES]}, $where, 0, undef);
508 }
509
510 =head3 kdp_append
511
512 Adds new data keys and values to C<$i>th element in node
513
514 $node->kdp_append($i, "key value" => {
515 "added data key" => "added data value",
516 };
517
518 =cut
519
520 sub kdp_append {
521 my ($self, $n, $k => $d) = @_;
522 if (defined $k) {
523 $self->[$KEYS][$n] = $k;
524 my ($kv,$dv) = %{$d};
525 $self->[$DATA][$n]->{$kv} = $dv;
526 }
527 [$self->[$KEYS][$n],
528 $self->[$DATA][$n]];
529 }
530
531 =head3 subnode
532
533 Set new or return existing subnode
534
535 # return 4th subnode
536 my $my_node = $node->subnode(4);
537
538 # create new subnode 5 from $my_node
539 $node->subnode(5, $my_node);
540
541 =cut
542
543 sub subnode {
544 my ($self, $n, $newnode) = @_;
545 $self->[$SUBNODES][$n] = $newnode if defined $newnode;
546 $self->[$SUBNODES][$n];
547 }
548
549 =head3 is_leaf
550
551 Test if node is leaf
552
553 if ($node->is_leaf) { something }
554
555 =cut
556
557 sub is_leaf {
558 my $self = shift;
559 ! defined $self->[$SUBNODES][0]; # undefined subnode means leaf node.
560 }
561
562 =head3 size
563
564 Return number of keys in the node
565
566 my $nr = $node->size;
567
568 =cut
569
570 sub size {
571 my $self = shift;
572 return scalar(@{$self->[$KEYS]});
573 }
574
575 =head3 halves
576
577 # Accept an index $n
578 # Divide into two nodes so that keys 0 .. $n-1 are in one node
579 # and keys $n+1 ... $size are in the other.
580
581 =cut
582
583 sub halves {
584 my $self = shift;
585 my $n = shift;
586 my $s = $self->size;
587 my @right;
588 my @left;
589
590 $left[$KEYS] = [@{$self->[$KEYS]}[0 .. $n-1]];
591 $left[$DATA] = [@{$self->[$DATA]}[0 .. $n-1]];
592 $left[$SUBNODES] = [@{$self->[$SUBNODES]}[0 .. $n]];
593
594 $right[$KEYS] = [@{$self->[$KEYS]}[$n+1 .. $s-1]];
595 $right[$DATA] = [@{$self->[$DATA]}[$n+1 .. $s-1]];
596 $right[$SUBNODES] = [@{$self->[$SUBNODES]}[$n+1 .. $s]];
597
598 my @middle = ($self->[$KEYS][$n], $self->[$DATA][$n]);
599
600 ($self->new(@left), $self->new(@right), \@middle);
601 }
602
603 =head3 to_string
604
605 Dumps tree as string
606
607 my $str = $root->to_string;
608
609 =cut
610
611 sub to_string {
612 my $self = shift;
613 my $indent = shift || 0;
614 my $I = ' ' x $indent;
615 return '' if $self->is_empty;
616 my ($k, $d, $s) = @$self;
617 my $result = '';
618 $result .= defined($s->[0]) ? $s->[0]->to_string($indent+2) : '';
619 my $N = $self->size;
620 my $i;
621 for ($i = 0; $i < $N; $i++) {
622 # $result .= $I . "$k->[$i] => $d->[$i]\n";
623 $result .= $I . "$k->[$i]\n";
624 $result .= defined($s->[$i+1]) ? $s->[$i+1]->to_string($indent+2) : '';
625 }
626 $result;
627 }
628
629 =begin comment
630
631 use Data::Dumper;
632
633 sub to_string {
634 my $self = shift;
635 my $indent = shift || 0;
636 my $path = shift || '0';
637 return '' if $self->is_empty;
638 my ($k, $d, $s) = @$self;
639 my $result = '';
640 $result .= defined($s->[0]) ? $s->[0]->to_string($indent+1,"$path/0") : '';
641 my $N = $self->size;
642 for (my $i = 0; $i < $N; $i++) {
643 my $dump = Dumper($d->[$i]);
644 $dump =~ s/[\n\r\s]+/ /gs;
645 $dump =~ s/\$VAR1\s*=\s*//;
646 $result .= sprintf("%-5s [%2d] %2s: %s => %s\n", $path, $i, $indent, $k->[$i], $dump);
647 $result .= defined($s->[$i+1]) ? $s->[$i+1]->to_string($indent+1,"$path/$i") : '';
648 }
649 $result;
650 }
651
652 =end comment
653
654 =head3 to_dot
655
656 Recursivly walk nodes of tree
657
658 =cut
659
660 sub to_dot {
661 my $self = shift;
662 my $parent = shift;
663
664 return '' if $self->is_empty;
665
666 my $dot = '';
667
668 my ($k, $d, $s) = @$self;
669 my $N = $self->size;
670
671 my @dot_keys;
672
673 my $node_name = $parent || '_';
674 $node_name =~ s/\W+//g;
675 $node_name .= " [$N]";
676
677 for (my $i = 0; $i <= $N; $i++) {
678 if (my $key = $k->[$i]) {
679 push @dot_keys, qq{<$i>$key};
680 }
681 $dot .= $s->[$i]->to_dot(qq{"$node_name":$i}) if ($s->[$i]);
682 }
683 push @dot_keys, qq{<$N>...} if (! $self->is_leaf);
684
685 my $label = join("|",@dot_keys);
686 $dot .= qq{"$node_name" [ shape=record, label="$label" ];\n};
687
688 $dot .= qq{$parent -> "$node_name";\n} if ($parent);
689
690 $dot;
691 }
692
693 =head3 to_jsfind
694
695 Create jsFind xml files
696
697 my $nr=$tree->to_dot('/path/to/index','0');
698
699 Returns number of elements created
700
701 =cut
702
703 sub to_jsfind {
704 my $self = shift;
705 my ($path,$file) = @_;
706
707 return 0 if $self->is_empty;
708
709 my $nr_keys = 0;
710
711 my ($k, $d, $s) = @$self;
712 my $N = $self->size;
713
714 my ($key_xml, $data_xml) = ("<n>","<d>");
715
716 for (my $i = 0; $i <= $N; $i++) {
717 my $key = lc($k->[$i]);
718
719 if ($key) {
720 $key_xml .= qq{<k>$key</k>};
721 $data_xml .= qq{<e>};
722 #use Data::Dumper;
723 #print Dumper($d->[$i]);
724 foreach my $path (keys %{$d->[$i]}) {
725 $data_xml .= '<l f="'.($d->[$i]->{$path}->{'f'} || 1).'" t="'.($d->[$i]->{$path}->{'t'} || 'no title').'">'.$path.'</l>';
726 $nr_keys++;
727 }
728 $data_xml .= qq{</e>};
729 }
730
731 $nr_keys += $s->[$i]->to_jsfind("$path/$file","$i") if ($s->[$i]);
732 }
733
734 $key_xml .= "</n>";
735 $data_xml .= "</d>";
736
737 if (! -e $path) {
738 mkpath($path) || croak "can't create dir '$path': $!";
739 }
740
741 open(K, "> ${path}/${file}.xml") || croak "can't open '$path/$file.xml': $!";
742 open(D, "> ${path}/_${file}.xml") || croak "can't open '$path/_$file.xml': $!";
743
744 print K $key_xml;
745 print D $data_xml;
746
747 close(K);
748 close(D);
749
750 return $nr_keys;
751 }
752
753 1;
754 __END__
755
756 =head1 SEE ALSO
757
758 jsFind web site L<http://www.elucidsoft.net/projects/jsfind/>
759
760 B-Trees in perl web site L<http://perl.plover.com/BTree/>
761
762 =head1 AUTHORS
763
764 Mark-Jonson Dominus E<lt>mjd@pobox.comE<gt> wrote C<BTree.pm> which was
765 base for this module
766
767 Shawn P. Garbett E<lt>shawn@elucidsoft.netE<gt> wrote jsFind
768
769 Dobrica Pavlinusic E<lt>dpavlin@rot13.orgE<gt> wrote this module
770
771 =head1 COPYRIGHT AND LICENSE
772
773 Copyright (C) 2004 by Dobrica Pavlinusic
774
775 This program is free software; you can redistribute it and/or modify it
776 under the terms of the GNU General Public License as published by the Free
777 Software Foundation; either version 2 of the License, or (at your option)
778 any later version. This program is distributed in the hope that it will be
779 useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
780 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
781 Public License for more details.
782
783 =cut

  ViewVC Help
Powered by ViewVC 1.1.26