/[jsFind]/trunk/jsFind.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/jsFind.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 9 - (show annotations)
Wed Jul 21 23:37:49 2004 UTC (19 years, 9 months ago) by dpavlin
File size: 16398 byte(s)
Version 0.02: API extension: to_jsfind now accepts also data and xml
encoding as optional parametars

1 package jsFind;
2
3 use 5.008004;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.02';
8
9 =head1 NAME
10
11 jsFind - generate index for jsFind using B-Tree
12
13 =head1 SYNOPSIS
14
15 use jsFind;
16
17
18
19 =head1 DESCRIPTION
20
21 This module can be used to create index files for jsFind, powerful tool for
22 adding a search engine to a CDROM archive or catalog without requiring the
23 user to install anything.
24
25 Main difference between this module and scripts delivered with jsFind are:
26
27 =over 5
28
29 =item *
30
31 You don't need to use swish-e to create index
32
33 =item *
34
35 You can programatically (and incrementaly) create index for jsFind
36
37 =back
38
39 =head1 METHODS
40
41 This module contains two packages C<jsFind> and C<jsFind::Node>.
42
43 =head2 jsFind methods
44
45 =cut
46
47 use Exporter 'import';
48 use Carp;
49
50 our @ISA = qw(Exporter);
51
52 BEGIN {
53 import 'jsFind::Node';
54 }
55
56 =head3 new
57
58 Create new tree. Arguments are C<B> which is maximum numbers of keys in
59 each node and optional C<Root> node. Each root node may have child nodes.
60
61 All nodes are objects from C<jsFind::Node>.
62
63 my $t = new jsFind(B => 4);
64
65 =cut
66
67 my $DEBUG = 1;
68
69 sub new {
70 my $package = shift;
71 my %ARGV = @_;
72 croak "Usage: {$package}::new(B => number [, Root => root node ])"
73 unless exists $ARGV{B};
74 if ($ARGV{B} % 2) {
75 my $B = $ARGV{B} + 1;
76 carp "B must be an even number. Using $B instead.";
77 $ARGV{B} = $B;
78 }
79
80 my $B = $ARGV{B};
81 my $Root = exists($ARGV{Root}) ? $ARGV{Root} : jsFind::Node->emptynode;
82 bless { B => $B, Root => $Root } => $package;
83 }
84
85 =head3 B_search
86
87 Search, insert, append or replace data in B-Tree
88
89 $t->B_search(
90 Key => 'key value',
91 Data => { "path" => {
92 "t" => "title of document",
93 "f" => 99,
94 },
95 },
96 Insert => 1,
97 Append => 1,
98 );
99
100 Semantics:
101
102 If key not found, insert it iff C<Insert> argument is present.
103
104 If key B<is> found, replace existing data iff C<Replace> argument
105 is present or add new datum to existing iff C<Append> argument is present.
106
107 =cut
108
109 sub B_search {
110 my $self = shift;
111 my %args = @_;
112 my $cur_node = $self->root;
113 my $k = $args{Key};
114 my $d = $args{Data};
115 my @path;
116
117 if ($cur_node->is_empty) { # Special case for empty root
118 if ($args{Insert}) {
119 $cur_node->kdp_insert($k => $d);
120 return $d;
121 } else {
122 return undef;
123 }
124 }
125
126 # Descend tree to leaf
127 for (;;) {
128
129 # Didn't hit bottom yet.
130
131 my($there, $where) = $cur_node->locate_key($k);
132 if ($there) { # Found it!
133 if ($args{Replace}) {
134 $cur_node->kdp_replace($where, $k => $d);
135 } elsif ($args{Append}) {
136 $cur_node->kdp_append($where, $k => $d);
137 }
138 return $cur_node->data($where);
139 }
140
141 # Not here---must be in a subtree.
142
143 if ($cur_node->is_leaf) { # But there are no subtrees
144 return undef unless $args{Insert}; # Search failed
145 # Stuff it in
146 $cur_node->kdp_insert($k => $d);
147 if ($self->node_overfull($cur_node)) { # Oops--there was no room.
148 $self->split_and_promote($cur_node, @path);
149 }
150 return $d;
151 }
152
153 # There are subtrees, and the key is in one of them.
154
155 push @path, [$cur_node, $where]; # Record path from root.
156
157 # Move down to search the subtree
158 $cur_node = $cur_node->subnode($where);
159
160 # and start over.
161 } # for (;;) ...
162
163 croak ("How did I get here?");
164 }
165
166
167
168 sub split_and_promote_old {
169 my $self = shift;
170 my ($cur_node, @path) = @_;
171
172 for (;;) {
173 my ($newleft, $newright, $kdp) = $cur_node->halves($self->B / 2);
174 my ($up, $where) = @{pop @path};
175 if ($up) {
176 $up->kdp_insert(@$kdp);
177 my ($tthere, $twhere) = $up->locate_key($kdp->[0]);
178 croak "Couldn't find key `$kdp->[0]' in node after just inserting it!"
179 unless $tthere;
180 croak "`$kdp->[0]' went into node at `$twhere' instead of expected `$where'!"
181 unless $twhere == $where;
182 $up->subnode($where, $newleft);
183 $up->subnode($where+1, $newright);
184 return unless $self->node_overfull($up);
185 $cur_node = $up;
186 } else { # We're at the top; make a new root.
187 my $newroot = new jsFind::Node ([$kdp->[0]],
188 [$kdp->[1]],
189 [$newleft, $newright]);
190 $self->root($newroot);
191 return;
192 }
193 }
194
195 }
196
197 sub split_and_promote {
198 my $self = shift;
199 my ($cur_node, @path) = @_;
200
201 for (;;) {
202 my ($newleft, $newright, $kdp) = $cur_node->halves($self->B / 2);
203 my ($up, $where) = @{pop @path} if (@path);
204 if ($up) {
205 $up->kdp_insert(@$kdp);
206 if ($DEBUG) {
207 my ($tthere, $twhere) = $up->locate_key($kdp->[0]);
208 croak "Couldn't find key `$kdp->[0]' in node after just inserting it!"
209 unless $tthere;
210 croak "`$kdp->[0]' went into node at `$twhere' instead of expected `$where'!"
211 unless $twhere == $where;
212 }
213 $up->subnode($where, $newleft);
214 $up->subnode($where+1, $newright);
215 return unless $self->node_overfull($up);
216 $cur_node = $up;
217 } else { # We're at the top; make a new root.
218 my $newroot = new jsFind::Node([$kdp->[0]],
219 [$kdp->[1]],
220 [$newleft, $newright]);
221 $self->root($newroot);
222 return;
223 }
224 }
225 }
226
227 =head3 B
228
229 Return B (maximum number of keys)
230
231 my $max_size = $t->B;
232
233 =cut
234
235 sub B {
236 $_[0]{B};
237 }
238
239 =head3 root
240
241 Returns root node
242
243 my $root = $t->root;
244
245 =cut
246
247 sub root {
248 my ($self, $newroot) = @_;
249 $self->{Root} = $newroot if defined $newroot;
250 $self->{Root};
251 }
252
253 =head3 node_overfull
254
255 Returns if node is overfull
256
257 if ($node->node_overfull) { something }
258
259 =cut
260
261 sub node_overfull {
262 my $self = shift;
263 my $node = shift;
264 $node->size > $self->B;
265 }
266
267 =head3 to_string
268
269 Returns your tree as formatted string.
270
271 my $text = $root->to_string;
272
273 Mostly usefull for debugging as output leaves much to be desired.
274
275 =cut
276
277 sub to_string {
278 $_[0]->root->to_string;
279 }
280
281 =head3 to_dot
282
283 Create Graphviz graph of your tree
284
285 my $dot_graph = $root->to_dot;
286
287 =cut
288
289 sub to_dot {
290 my $self = shift;
291
292 my $dot = qq/digraph dns {\nrankdir=LR;\n/;
293 $dot .= $self->root->to_dot;
294 $dot .= qq/\n}\n/;
295
296 return $dot;
297 }
298
299 =head3 to_jsfind
300
301 Create xml index files for jsFind. This should be called after
302 your B-Tree has been filled with data.
303
304 $root->to_jsfind('/full/path/to/index/dir/');
305
306 Returns number of nodes in created tree.
307
308 There is also longer version if you want to recode your data charset
309 into different one (probably UTF-8):
310
311 $root->to_jsfind('/full/path/to/index/dir/','ISO-8859-2','UTF-8');
312
313 =cut
314
315 my $iconv;
316
317 sub to_jsfind {
318 my $self = shift;
319
320 my $path = shift || confess "to_jsfind need path to your index!";
321
322 my ($from_cp,$to_cp) = @_;
323 if ($from_cp && $to_cp) {
324 $iconv = Text::Iconv->new($from_cp,$to_cp);
325 }
326
327 $path .= "/" if ($path =~ /\/$/);
328 carp "create directory for index '$path': $!" if (! -w $path);
329
330 return $self->root->to_jsfind($path,"0");
331 }
332
333
334 # private, default cmd function
335 sub default_cmp {
336 $_[0] cmp $_[1];
337 }
338
339 =head3 _recode
340
341 This is internal function to recode charset.
342
343 =cut
344
345 sub _recode {
346 my $self = shift;
347 my $text = shift || return;
348
349 if ($iconv) {
350 return $iconv->convert($text) || $text;
351 } else {
352 return $text;
353 }
354 }
355
356 #####################################################################
357
358 =head2 jsFind::Node methods
359
360 Each node has C<k> key-data pairs, with C<B> <= C<k> <= C<2B>, and
361 each has C<k+1> subnodes, which might be null.
362
363 The node is a blessed reference to a list with three elements:
364
365 ($keylist, $datalist, $subnodelist)
366
367 each is a reference to a list list.
368
369 The null node is represented by a blessed reference to an empty list.
370
371 =cut
372
373 package jsFind::Node;
374
375 use warnings;
376 use strict;
377
378 use Carp;
379 use File::Path;
380 use Text::Iconv;
381
382 use base 'jsFind';
383
384 my $KEYS = 0;
385 my $DATA = 1;
386 my $SUBNODES = 2;
387
388 =head3 new
389
390 Create New node
391
392 my $node = new jsFind::Node ($keylist, $datalist, $subnodelist);
393
394 You can also mit argument list to create empty node.
395
396 my $empty_node = new jsFind::Node;
397
398 =cut
399
400 sub new {
401 my $self = shift;
402 my $package = ref $self || $self;
403 croak "Internal error: jsFind::Node::new called with wrong number of arguments."
404 unless @_ == 3 || @_ == 0;
405 bless [@_] => $package;
406 }
407
408 =head3 locate_key
409
410 Locate key in node using linear search. This should probably be replaced
411 by binary search for better performance.
412
413 my ($found, $index) = $node->locate_key($key, $cmp_coderef);
414
415 Argument C<$cmp_coderef> is optional reference to custom comparison
416 operator.
417
418 Returns (1, $index) if $key[$index] eq $key.
419
420 Returns (0, $index) if key could be found in $subnode[$index].
421
422 In scalar context, just returns 1 or 0.
423
424 =cut
425
426 sub locate_key {
427 # Use linear search for testing, replace with binary search.
428 my $self = shift;
429 my $key = shift;
430 my $cmp = shift || \&jsFind::default_cmp;
431 my $i;
432 my $cmp_result;
433 my $N = $self->size;
434 for ($i = 0; $i < $N; $i++) {
435 $cmp_result = &$cmp($key, $self->key($i));
436 last if $cmp_result <= 0;
437 }
438
439 # $i is now the index of the first node-key greater than $key
440 # or $N if there is no such. $cmp_result is 0 iff the key was found.
441 (!$cmp_result, $i);
442 }
443
444
445 =head3 emptynode
446
447 Creates new empty node
448
449 $node = $root->emptynode;
450 $new_node = $node->emptynode;
451
452 =cut
453
454 sub emptynode {
455 new($_[0]); # Pass package name, but not anything else.
456 }
457
458 =head3 is_empty
459
460 Test if node is empty
461
462 if ($node->is_empty) { something }
463
464 =cut
465
466 # undef is empty; so is a blessed empty list.
467 sub is_empty {
468 my $self = shift;
469 !defined($self) || $#$self < 0;
470 }
471
472 =head3 key
473
474 Return C<$i>th key from node
475
476 my $key = $node->key($i);
477
478 =cut
479
480 sub key {
481 # my ($self, $n) = @_;
482 # $self->[$KEYS][$n];
483
484 # speedup
485 $_[0]->[$KEYS][$_[1]];
486 }
487
488 =head3 data
489
490 Return C<$i>th data from node
491
492 my $data = $node->data($i);
493
494 =cut
495
496 sub data {
497 my ($self, $n) = @_;
498 $self->[$DATA][$n];
499 }
500
501 =head3 kdp_replace
502
503 Set key data pair for C<$i>th element in node
504
505 $node->kdp_replace($i, "key value" => {
506 "data key 1" => "data value 1",
507 "data key 2" => "data value 2",
508 };
509
510 =cut
511
512 sub kdp_replace {
513 my ($self, $n, $k => $d) = @_;
514 if (defined $k) {
515 $self->[$KEYS][$n] = $k;
516 $self->[$DATA][$n] = $d;
517 }
518 [$self->[$KEYS][$n],
519 $self->[$DATA][$n]];
520 }
521
522 =head3 kdp_insert
523
524 # No return value.
525
526 =cut
527
528 sub kdp_insert {
529 my $self = shift;
530 my ($k => $d) = @_;
531 my ($there, $where) = $self->locate_key($k) unless $self->is_empty;
532
533 if ($there) { croak("Tried to insert `$k => $d' into node where `$k' was already present."); }
534
535 # undef fix
536 $where ||= 0;
537
538 splice(@{$self->[$KEYS]}, $where, 0, $k);
539 splice(@{$self->[$DATA]}, $where, 0, $d);
540 splice(@{$self->[$SUBNODES]}, $where, 0, undef);
541 }
542
543 =head3 kdp_append
544
545 Adds new data keys and values to C<$i>th element in node
546
547 $node->kdp_append($i, "key value" => {
548 "added data key" => "added data value",
549 };
550
551 =cut
552
553 sub kdp_append {
554 my ($self, $n, $k => $d) = @_;
555 if (defined $k) {
556 $self->[$KEYS][$n] = $k;
557 my ($kv,$dv) = %{$d};
558 $self->[$DATA][$n]->{$kv} = $dv;
559 }
560 [$self->[$KEYS][$n],
561 $self->[$DATA][$n]];
562 }
563
564 =head3 subnode
565
566 Set new or return existing subnode
567
568 # return 4th subnode
569 my $my_node = $node->subnode(4);
570
571 # create new subnode 5 from $my_node
572 $node->subnode(5, $my_node);
573
574 =cut
575
576 sub subnode {
577 my ($self, $n, $newnode) = @_;
578 $self->[$SUBNODES][$n] = $newnode if defined $newnode;
579 $self->[$SUBNODES][$n];
580 }
581
582 =head3 is_leaf
583
584 Test if node is leaf
585
586 if ($node->is_leaf) { something }
587
588 =cut
589
590 sub is_leaf {
591 my $self = shift;
592 ! defined $self->[$SUBNODES][0]; # undefined subnode means leaf node.
593 }
594
595 =head3 size
596
597 Return number of keys in the node
598
599 my $nr = $node->size;
600
601 =cut
602
603 sub size {
604 my $self = shift;
605 return scalar(@{$self->[$KEYS]});
606 }
607
608 =head3 halves
609
610 # Accept an index $n
611 # Divide into two nodes so that keys 0 .. $n-1 are in one node
612 # and keys $n+1 ... $size are in the other.
613
614 =cut
615
616 sub halves {
617 my $self = shift;
618 my $n = shift;
619 my $s = $self->size;
620 my @right;
621 my @left;
622
623 $left[$KEYS] = [@{$self->[$KEYS]}[0 .. $n-1]];
624 $left[$DATA] = [@{$self->[$DATA]}[0 .. $n-1]];
625 $left[$SUBNODES] = [@{$self->[$SUBNODES]}[0 .. $n]];
626
627 $right[$KEYS] = [@{$self->[$KEYS]}[$n+1 .. $s-1]];
628 $right[$DATA] = [@{$self->[$DATA]}[$n+1 .. $s-1]];
629 $right[$SUBNODES] = [@{$self->[$SUBNODES]}[$n+1 .. $s]];
630
631 my @middle = ($self->[$KEYS][$n], $self->[$DATA][$n]);
632
633 ($self->new(@left), $self->new(@right), \@middle);
634 }
635
636 =head3 to_string
637
638 Dumps tree as string
639
640 my $str = $root->to_string;
641
642 =cut
643
644 sub to_string {
645 my $self = shift;
646 my $indent = shift || 0;
647 my $I = ' ' x $indent;
648 return '' if $self->is_empty;
649 my ($k, $d, $s) = @$self;
650 my $result = '';
651 $result .= defined($s->[0]) ? $s->[0]->to_string($indent+2) : '';
652 my $N = $self->size;
653 my $i;
654 for ($i = 0; $i < $N; $i++) {
655 # $result .= $I . "$k->[$i] => $d->[$i]\n";
656 $result .= $I . "$k->[$i]\n";
657 $result .= defined($s->[$i+1]) ? $s->[$i+1]->to_string($indent+2) : '';
658 }
659 $result;
660 }
661
662 =begin comment
663
664 use Data::Dumper;
665
666 sub to_string {
667 my $self = shift;
668 my $indent = shift || 0;
669 my $path = shift || '0';
670 return '' if $self->is_empty;
671 my ($k, $d, $s) = @$self;
672 my $result = '';
673 $result .= defined($s->[0]) ? $s->[0]->to_string($indent+1,"$path/0") : '';
674 my $N = $self->size;
675 for (my $i = 0; $i < $N; $i++) {
676 my $dump = Dumper($d->[$i]);
677 $dump =~ s/[\n\r\s]+/ /gs;
678 $dump =~ s/\$VAR1\s*=\s*//;
679 $result .= sprintf("%-5s [%2d] %2s: %s => %s\n", $path, $i, $indent, $k->[$i], $dump);
680 $result .= defined($s->[$i+1]) ? $s->[$i+1]->to_string($indent+1,"$path/$i") : '';
681 }
682 $result;
683 }
684
685 =end comment
686
687 =head3 to_dot
688
689 Recursivly walk nodes of tree
690
691 =cut
692
693 sub to_dot {
694 my $self = shift;
695 my $parent = shift;
696
697 return '' if $self->is_empty;
698
699 my $dot = '';
700
701 my ($k, $d, $s) = @$self;
702 my $N = $self->size;
703
704 my @dot_keys;
705
706 my $node_name = $parent || '_';
707 $node_name =~ s/\W+//g;
708 $node_name .= " [$N]";
709
710 for (my $i = 0; $i <= $N; $i++) {
711 if (my $key = $k->[$i]) {
712 push @dot_keys, qq{<$i>$key};
713 }
714 $dot .= $s->[$i]->to_dot(qq{"$node_name":$i}) if ($s->[$i]);
715 }
716 push @dot_keys, qq{<$N>...} if (! $self->is_leaf);
717
718 my $label = join("|",@dot_keys);
719 $dot .= qq{"$node_name" [ shape=record, label="$label" ];\n};
720
721 $dot .= qq{$parent -> "$node_name";\n} if ($parent);
722
723 $dot;
724 }
725
726 =head3 to_jsfind
727
728 Create jsFind xml files
729
730 my $nr=$tree->to_jsfind('/path/to/index','0');
731
732 Returns number of elements created
733
734 =cut
735
736 sub to_jsfind {
737 my $self = shift;
738 my ($path,$file) = @_;
739
740 return 0 if $self->is_empty;
741
742 confess("path is undefined.") unless ($path);
743 confess("file is undefined. Did you call \$t->root->to_jsfind(..) instead of \$t->to_jsfind(..) ?") unless (defined($file));
744
745 my $nr_keys = 0;
746
747 my ($k, $d, $s) = @$self;
748 my $N = $self->size;
749
750 my ($key_xml, $data_xml) = ("<n>","<d>");
751
752 for (my $i = 0; $i <= $N; $i++) {
753 my $key = lc($k->[$i]);
754
755 if ($key) {
756 $key_xml .= qq{<k>$key</k>};
757 $data_xml .= qq{<e>};
758 #use Data::Dumper;
759 #print Dumper($d->[$i]);
760 foreach my $path (keys %{$d->[$i]}) {
761 $data_xml .= '<l f="'.($d->[$i]->{$path}->{'f'} || 1).'" t="'.($d->[$i]->{$path}->{'t'} || 'no title').'">'.$path.'</l>';
762 $nr_keys++;
763 }
764 $data_xml .= qq{</e>};
765 }
766
767 $nr_keys += $s->[$i]->to_jsfind("$path/$file","$i") if ($s->[$i]);
768 }
769
770 $key_xml .= "</n>";
771 $data_xml .= "</d>";
772
773 if (! -e $path) {
774 mkpath($path) || croak "can't create dir '$path': $!";
775 }
776
777 open(K, "> ${path}/${file}.xml") || croak "can't open '$path/$file.xml': $!";
778 open(D, "> ${path}/_${file}.xml") || croak "can't open '$path/_$file.xml': $!";
779
780 print K $self->SUPER::_recode($key_xml);
781 print D $self->SUPER::_recode($data_xml);
782
783 close(K);
784 close(D);
785
786 return $nr_keys;
787 }
788
789 1;
790 __END__
791
792 =head1 SEE ALSO
793
794 jsFind web site L<http://www.elucidsoft.net/projects/jsfind/>
795
796 B-Trees in perl web site L<http://perl.plover.com/BTree/>
797
798 =head1 AUTHORS
799
800 Mark-Jonson Dominus E<lt>mjd@pobox.comE<gt> wrote C<BTree.pm> which was
801 base for this module
802
803 Shawn P. Garbett E<lt>shawn@elucidsoft.netE<gt> wrote jsFind
804
805 Dobrica Pavlinusic E<lt>dpavlin@rot13.orgE<gt> wrote this module
806
807 =head1 COPYRIGHT AND LICENSE
808
809 Copyright (C) 2004 by Dobrica Pavlinusic
810
811 This program is free software; you can redistribute it and/or modify it
812 under the terms of the GNU General Public License as published by the Free
813 Software Foundation; either version 2 of the License, or (at your option)
814 any later version. This program is distributed in the hope that it will be
815 useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
816 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
817 Public License for more details.
818
819 =cut

  ViewVC Help
Powered by ViewVC 1.1.26