/[jsFind]/trunk/jsFind.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/jsFind.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 12 - (show annotations)
Sat Aug 28 14:31:58 2004 UTC (19 years, 8 months ago) by dpavlin
File size: 17750 byte(s)
documentation improvements

1 package jsFind;
2
3 use 5.008004;
4 use strict;
5 use warnings;
6 use HTML::Entities;
7
8 our $VERSION = '0.03';
9
10 use Exporter 'import';
11 use Carp;
12
13 our @ISA = qw(Exporter);
14
15 BEGIN {
16 import 'jsFind::Node';
17 }
18
19 =head1 NAME
20
21 jsFind - generate index for jsFind using B-Tree
22
23 =head1 SYNOPSIS
24
25 use jsFind;
26 my $t = new jsFind(B => 4);
27 my $f = 1;
28 foreach my $k (qw{minima ut dolorem sapiente voluptatem}) {
29 $t->B_search(Key => $k,
30 Data => {
31 "path" => {
32 t => "word $k",
33 f => $f },
34 },
35 Insert => 1,
36 Append => 1,
37 );
38 }
39
40 =head1 DESCRIPTION
41
42 This module can be used to create index files for jsFind, powerful tool for
43 adding a search engine to a CDROM archive or catalog without requiring the
44 user to install anything.
45
46 Main difference between this module and scripts delivered with jsFind are:
47
48 =over 5
49
50 =item *
51
52 You don't need to use swish-e to create index
53
54 =item *
55
56 You can programatically (and incrementaly) create index for jsFind
57
58 =back
59
60 =head1 jsFind methods
61
62 C<jsFind> is mode implementing methods which you, the user, are going to
63 use to create indexes.
64
65 =head2 new
66
67 Create new tree. Arguments are C<B> which is maximum numbers of keys in
68 each node and optional C<Root> node. Each root node may have child nodes.
69
70 All nodes are objects from C<jsFind::Node>.
71
72 my $t = new jsFind(B => 4);
73
74 =cut
75
76 my $DEBUG = 1;
77
78 sub new {
79 my $package = shift;
80 my %ARGV = @_;
81 croak "Usage: {$package}::new(B => number [, Root => root node ])"
82 unless exists $ARGV{B};
83 if ($ARGV{B} % 2) {
84 my $B = $ARGV{B} + 1;
85 carp "B must be an even number. Using $B instead.";
86 $ARGV{B} = $B;
87 }
88
89 my $B = $ARGV{B};
90 my $Root = exists($ARGV{Root}) ? $ARGV{Root} : jsFind::Node->emptynode;
91 bless { B => $B, Root => $Root } => $package;
92 }
93
94 =head2 B_search
95
96 Search, insert, append or replace data in B-Tree
97
98 $t->B_search(
99 Key => 'key value',
100 Data => { "path" => {
101 "t" => "title of document",
102 "f" => 99,
103 },
104 },
105 Insert => 1,
106 Append => 1,
107 );
108
109 Semantics:
110
111 If key not found, insert it iff C<Insert> argument is present.
112
113 If key B<is> found, replace existing data iff C<Replace> argument
114 is present or add new datum to existing iff C<Append> argument is present.
115
116 =cut
117
118 sub B_search {
119 my $self = shift;
120 my %args = @_;
121 my $cur_node = $self->root;
122 my $k = $args{Key};
123 my $d = $args{Data};
124 my @path;
125
126 if ($cur_node->is_empty) { # Special case for empty root
127 if ($args{Insert}) {
128 $cur_node->kdp_insert($k => $d);
129 return $d;
130 } else {
131 return undef;
132 }
133 }
134
135 # Descend tree to leaf
136 for (;;) {
137
138 # Didn't hit bottom yet.
139
140 my($there, $where) = $cur_node->locate_key($k);
141 if ($there) { # Found it!
142 if ($args{Replace}) {
143 $cur_node->kdp_replace($where, $k => $d);
144 } elsif ($args{Append}) {
145 $cur_node->kdp_append($where, $k => $d);
146 }
147 return $cur_node->data($where);
148 }
149
150 # Not here---must be in a subtree.
151
152 if ($cur_node->is_leaf) { # But there are no subtrees
153 return undef unless $args{Insert}; # Search failed
154 # Stuff it in
155 $cur_node->kdp_insert($k => $d);
156 if ($self->node_overfull($cur_node)) { # Oops--there was no room.
157 $self->split_and_promote($cur_node, @path);
158 }
159 return $d;
160 }
161
162 # There are subtrees, and the key is in one of them.
163
164 push @path, [$cur_node, $where]; # Record path from root.
165
166 # Move down to search the subtree
167 $cur_node = $cur_node->subnode($where);
168
169 # and start over.
170 } # for (;;) ...
171
172 croak ("How did I get here?");
173 }
174
175
176
177 sub split_and_promote_old {
178 my $self = shift;
179 my ($cur_node, @path) = @_;
180
181 for (;;) {
182 my ($newleft, $newright, $kdp) = $cur_node->halves($self->B / 2);
183 my ($up, $where) = @{pop @path};
184 if ($up) {
185 $up->kdp_insert(@$kdp);
186 my ($tthere, $twhere) = $up->locate_key($kdp->[0]);
187 croak "Couldn't find key `$kdp->[0]' in node after just inserting it!"
188 unless $tthere;
189 croak "`$kdp->[0]' went into node at `$twhere' instead of expected `$where'!"
190 unless $twhere == $where;
191 $up->subnode($where, $newleft);
192 $up->subnode($where+1, $newright);
193 return unless $self->node_overfull($up);
194 $cur_node = $up;
195 } else { # We're at the top; make a new root.
196 my $newroot = new jsFind::Node ([$kdp->[0]],
197 [$kdp->[1]],
198 [$newleft, $newright]);
199 $self->root($newroot);
200 return;
201 }
202 }
203
204 }
205
206 sub split_and_promote {
207 my $self = shift;
208 my ($cur_node, @path) = @_;
209
210 for (;;) {
211 my ($newleft, $newright, $kdp) = $cur_node->halves($self->B / 2);
212 my ($up, $where) = @{pop @path} if (@path);
213 if ($up) {
214 $up->kdp_insert(@$kdp);
215 if ($DEBUG) {
216 my ($tthere, $twhere) = $up->locate_key($kdp->[0]);
217 croak "Couldn't find key `$kdp->[0]' in node after just inserting it!"
218 unless $tthere;
219 croak "`$kdp->[0]' went into node at `$twhere' instead of expected `$where'!"
220 unless $twhere == $where;
221 }
222 $up->subnode($where, $newleft);
223 $up->subnode($where+1, $newright);
224 return unless $self->node_overfull($up);
225 $cur_node = $up;
226 } else { # We're at the top; make a new root.
227 my $newroot = new jsFind::Node([$kdp->[0]],
228 [$kdp->[1]],
229 [$newleft, $newright]);
230 $self->root($newroot);
231 return;
232 }
233 }
234 }
235
236 =head2 B
237
238 Return B (maximum number of keys)
239
240 my $max_size = $t->B;
241
242 =cut
243
244 sub B {
245 $_[0]{B};
246 }
247
248 =head2 root
249
250 Returns root node
251
252 my $root = $t->root;
253
254 =cut
255
256 sub root {
257 my ($self, $newroot) = @_;
258 $self->{Root} = $newroot if defined $newroot;
259 $self->{Root};
260 }
261
262 =head2 node_overfull
263
264 Returns if node is overfull
265
266 if ($node->node_overfull) { something }
267
268 =cut
269
270 sub node_overfull {
271 my $self = shift;
272 my $node = shift;
273 $node->size > $self->B;
274 }
275
276 =head2 to_string
277
278 Returns your tree as formatted string.
279
280 my $text = $root->to_string;
281
282 Mostly usefull for debugging as output leaves much to be desired.
283
284 =cut
285
286 sub to_string {
287 $_[0]->root->to_string;
288 }
289
290 =head2 to_dot
291
292 Create Graphviz graph of your tree
293
294 my $dot_graph = $root->to_dot;
295
296 =cut
297
298 sub to_dot {
299 my $self = shift;
300
301 my $dot = qq/digraph dns {\nrankdir=LR;\n/;
302 $dot .= $self->root->to_dot;
303 $dot .= qq/\n}\n/;
304
305 return $dot;
306 }
307
308 =head2 to_jsfind
309
310 Create xml index files for jsFind. This should be called after
311 your B-Tree has been filled with data.
312
313 $root->to_jsfind('/full/path/to/index/dir/');
314
315 Returns number of nodes in created tree.
316
317 There is also longer version if you want to recode your data charset
318 into different one (probably UTF-8):
319
320 $root->to_jsfind('/full/path/to/index/dir/','ISO-8859-2','UTF-8');
321
322 Destination encoding is UTF-8 by default, so you don't have to specify it.
323
324 $root->to_jsfind('/full/path/to/index/dir/','WINDOWS-1250');
325
326 =cut
327
328 my $iconv;
329 my $iconv_l1;
330
331 sub to_jsfind {
332 my $self = shift;
333
334 my $path = shift || confess "to_jsfind need path to your index!";
335
336 my ($from_cp,$to_cp) = @_;
337
338 $to_cp ||= 'UTF-8';
339
340 if ($from_cp && $to_cp) {
341 $iconv = Text::Iconv->new($from_cp,$to_cp);
342 }
343 $iconv_l1 = Text::Iconv->new('ISO-8859-1',$to_cp);
344
345 $path .= "/" if ($path =~ /\/$/);
346 #carp "creating directory for index '$path'" if (! -w $path);
347
348 return $self->root->to_jsfind($path,"0");
349 }
350
351
352 # private, default cmd function
353 sub default_cmp {
354 $_[0] cmp $_[1];
355 }
356
357 =head2 _recode
358
359 This is internal function to recode charset.
360
361 It will also try to decode entities in data using L<HTML::Entities>.
362
363 =cut
364
365 sub _recode {
366 my $self = shift;
367 my $text = shift || return;
368
369 sub _decode_html_entities {
370 my $data = shift || return;
371 $data = $iconv_l1->convert(decode_entities($data)) || croak "entity decode problem: $data";
372 }
373
374 if ($iconv) {
375 $text = $iconv->convert($text) || $text && carp "convert problem: $text";
376 $text =~ s/(\&\w+;)/_decode_html_entities($1)/ges;
377 }
378
379 return $text;
380 }
381
382 #####################################################################
383
384 =head1 jsFind::Node methods
385
386 Each node has C<k> key-data pairs, with C<B> <= C<k> <= C<2B>, and
387 each has C<k+1> subnodes, which might be null.
388
389 The node is a blessed reference to a list with three elements:
390
391 ($keylist, $datalist, $subnodelist)
392
393 each is a reference to a list list.
394
395 The null node is represented by a blessed reference to an empty list.
396
397 =cut
398
399 package jsFind::Node;
400
401 use warnings;
402 use strict;
403
404 use Carp;
405 use File::Path;
406 use Text::Iconv;
407
408 use base 'jsFind';
409
410 my $KEYS = 0;
411 my $DATA = 1;
412 my $SUBNODES = 2;
413
414 =head2 new
415
416 Create New node
417
418 my $node = new jsFind::Node ($keylist, $datalist, $subnodelist);
419
420 You can also mit argument list to create empty node.
421
422 my $empty_node = new jsFind::Node;
423
424 =cut
425
426 sub new {
427 my $self = shift;
428 my $package = ref $self || $self;
429 croak "Internal error: jsFind::Node::new called with wrong number of arguments."
430 unless @_ == 3 || @_ == 0;
431 bless [@_] => $package;
432 }
433
434 =head2 locate_key
435
436 Locate key in node using linear search. This should probably be replaced
437 by binary search for better performance.
438
439 my ($found, $index) = $node->locate_key($key, $cmp_coderef);
440
441 Argument C<$cmp_coderef> is optional reference to custom comparison
442 operator.
443
444 Returns (1, $index) if $key[$index] eq $key.
445
446 Returns (0, $index) if key could be found in $subnode[$index].
447
448 In scalar context, just returns 1 or 0.
449
450 =cut
451
452 sub locate_key {
453 # Use linear search for testing, replace with binary search.
454 my $self = shift;
455 my $key = shift;
456 my $cmp = shift || \&jsFind::default_cmp;
457 my $i;
458 my $cmp_result;
459 my $N = $self->size;
460 for ($i = 0; $i < $N; $i++) {
461 $cmp_result = &$cmp($key, $self->key($i));
462 last if $cmp_result <= 0;
463 }
464
465 # $i is now the index of the first node-key greater than $key
466 # or $N if there is no such. $cmp_result is 0 iff the key was found.
467 (!$cmp_result, $i);
468 }
469
470
471 =head2 emptynode
472
473 Creates new empty node
474
475 $node = $root->emptynode;
476 $new_node = $node->emptynode;
477
478 =cut
479
480 sub emptynode {
481 new($_[0]); # Pass package name, but not anything else.
482 }
483
484 =head2 is_empty
485
486 Test if node is empty
487
488 if ($node->is_empty) { something }
489
490 =cut
491
492 # undef is empty; so is a blessed empty list.
493 sub is_empty {
494 my $self = shift;
495 !defined($self) || $#$self < 0;
496 }
497
498 =head2 key
499
500 Return C<$i>th key from node
501
502 my $key = $node->key($i);
503
504 =cut
505
506 sub key {
507 # my ($self, $n) = @_;
508 # $self->[$KEYS][$n];
509
510 # speedup
511 $_[0]->[$KEYS][$_[1]];
512 }
513
514 =head2 data
515
516 Return C<$i>th data from node
517
518 my $data = $node->data($i);
519
520 =cut
521
522 sub data {
523 my ($self, $n) = @_;
524 $self->[$DATA][$n];
525 }
526
527 =head2 kdp_replace
528
529 Set key data pair for C<$i>th element in node
530
531 $node->kdp_replace($i, "key value" => {
532 "data key 1" => "data value 1",
533 "data key 2" => "data value 2",
534 };
535
536 =cut
537
538 sub kdp_replace {
539 my ($self, $n, $k => $d) = @_;
540 if (defined $k) {
541 $self->[$KEYS][$n] = $k;
542 $self->[$DATA][$n] = $d;
543 }
544 [$self->[$KEYS][$n],
545 $self->[$DATA][$n]];
546 }
547
548 =head2 kdp_insert
549
550 Insert key/data pair in tree
551
552 $node->kdp_insert("key value" => "data value");
553
554 No return value.
555
556 =cut
557
558 sub kdp_insert {
559 my $self = shift;
560 my ($k => $d) = @_;
561 my ($there, $where) = $self->locate_key($k) unless $self->is_empty;
562
563 if ($there) { croak("Tried to insert `$k => $d' into node where `$k' was already present."); }
564
565 # undef fix
566 $where ||= 0;
567
568 splice(@{$self->[$KEYS]}, $where, 0, $k);
569 splice(@{$self->[$DATA]}, $where, 0, $d);
570 splice(@{$self->[$SUBNODES]}, $where, 0, undef);
571 }
572
573 =head2 kdp_append
574
575 Adds new data keys and values to C<$i>th element in node
576
577 $node->kdp_append($i, "key value" => {
578 "added data key" => "added data value",
579 };
580
581 =cut
582
583 sub kdp_append {
584 my ($self, $n, $k => $d) = @_;
585 if (defined $k) {
586 $self->[$KEYS][$n] = $k;
587 my ($kv,$dv) = %{$d};
588 $self->[$DATA][$n]->{$kv} = $dv;
589 }
590 [$self->[$KEYS][$n],
591 $self->[$DATA][$n]];
592 }
593
594 =head2 subnode
595
596 Set new or return existing subnode
597
598 # return 4th subnode
599 my $my_node = $node->subnode(4);
600
601 # create new subnode 5 from $my_node
602 $node->subnode(5, $my_node);
603
604 =cut
605
606 sub subnode {
607 my ($self, $n, $newnode) = @_;
608 $self->[$SUBNODES][$n] = $newnode if defined $newnode;
609 $self->[$SUBNODES][$n];
610 }
611
612 =head2 is_leaf
613
614 Test if node is leaf
615
616 if ($node->is_leaf) { something }
617
618 =cut
619
620 sub is_leaf {
621 my $self = shift;
622 ! defined $self->[$SUBNODES][0]; # undefined subnode means leaf node.
623 }
624
625 =head2 size
626
627 Return number of keys in the node
628
629 my $nr = $node->size;
630
631 =cut
632
633 sub size {
634 my $self = shift;
635 return scalar(@{$self->[$KEYS]});
636 }
637
638 =head2 halves
639
640 Split node into two halves so that keys C<0 .. $n-1> are in one node
641 and keys C<$n+1 ... $size> are in the other.
642
643 my ($left_node, $right_node, $kdp) = $node->halves($n);
644
645 =cut
646
647 sub halves {
648 my $self = shift;
649 my $n = shift;
650 my $s = $self->size;
651 my @right;
652 my @left;
653
654 $left[$KEYS] = [@{$self->[$KEYS]}[0 .. $n-1]];
655 $left[$DATA] = [@{$self->[$DATA]}[0 .. $n-1]];
656 $left[$SUBNODES] = [@{$self->[$SUBNODES]}[0 .. $n]];
657
658 $right[$KEYS] = [@{$self->[$KEYS]}[$n+1 .. $s-1]];
659 $right[$DATA] = [@{$self->[$DATA]}[$n+1 .. $s-1]];
660 $right[$SUBNODES] = [@{$self->[$SUBNODES]}[$n+1 .. $s]];
661
662 my @middle = ($self->[$KEYS][$n], $self->[$DATA][$n]);
663
664 ($self->new(@left), $self->new(@right), \@middle);
665 }
666
667 =head2 to_string
668
669 Dumps tree as string
670
671 my $str = $root->to_string;
672
673 =cut
674
675 sub to_string {
676 my $self = shift;
677 my $indent = shift || 0;
678 my $I = ' ' x $indent;
679 return '' if $self->is_empty;
680 my ($k, $d, $s) = @$self;
681 my $result = '';
682 $result .= defined($s->[0]) ? $s->[0]->to_string($indent+2) : '';
683 my $N = $self->size;
684 my $i;
685 for ($i = 0; $i < $N; $i++) {
686 # $result .= $I . "$k->[$i] => $d->[$i]\n";
687 $result .= $I . "$k->[$i]\n";
688 $result .= defined($s->[$i+1]) ? $s->[$i+1]->to_string($indent+2) : '';
689 }
690 $result;
691 }
692
693 =begin comment
694
695 use Data::Dumper;
696
697 sub to_string {
698 my $self = shift;
699 my $indent = shift || 0;
700 my $path = shift || '0';
701 return '' if $self->is_empty;
702 my ($k, $d, $s) = @$self;
703 my $result = '';
704 $result .= defined($s->[0]) ? $s->[0]->to_string($indent+1,"$path/0") : '';
705 my $N = $self->size;
706 for (my $i = 0; $i < $N; $i++) {
707 my $dump = Dumper($d->[$i]);
708 $dump =~ s/[\n\r\s]+/ /gs;
709 $dump =~ s/\$VAR1\s*=\s*//;
710 $result .= sprintf("%-5s [%2d] %2s: %s => %s\n", $path, $i, $indent, $k->[$i], $dump);
711 $result .= defined($s->[$i+1]) ? $s->[$i+1]->to_string($indent+1,"$path/$i") : '';
712 }
713 $result;
714 }
715
716 =end comment
717
718 =head2 to_dot
719
720 Recursivly walk nodes of tree
721
722 =cut
723
724 sub to_dot {
725 my $self = shift;
726 my $parent = shift;
727
728 return '' if $self->is_empty;
729
730 my $dot = '';
731
732 my ($k, $d, $s) = @$self;
733 my $N = $self->size;
734
735 my @dot_keys;
736
737 my $node_name = $parent || '_';
738 $node_name =~ s/\W+//g;
739 $node_name .= " [$N]";
740
741 for (my $i = 0; $i <= $N; $i++) {
742 if (my $key = $k->[$i]) {
743 push @dot_keys, qq{<$i>$key};
744 }
745 $dot .= $s->[$i]->to_dot(qq{"$node_name":$i}) if ($s->[$i]);
746 }
747 push @dot_keys, qq{<$N>...} if (! $self->is_leaf);
748
749 my $label = join("|",@dot_keys);
750 $dot .= qq{"$node_name" [ shape=record, label="$label" ];\n};
751
752 $dot .= qq{$parent -> "$node_name";\n} if ($parent);
753
754 $dot;
755 }
756
757 =head2 to_xml
758
759 Escape <, >, & and ", and to produce valid XML
760
761 =cut
762
763 my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
764 my $escape_re = join '|' => keys %escape;
765
766 sub to_xml {
767 my $self = shift || confess "you should call to_xml as object!";
768
769 my $d = shift || return;
770 $d = $self->SUPER::_recode($d);
771 confess "escape_re undefined!" unless ($escape_re);
772 $d =~ s/($escape_re)/$escape{$1}/g;
773 return $d;
774 }
775
776 =head2 to_jsfind
777
778 Create jsFind xml files
779
780 my $nr=$tree->to_jsfind('/path/to/index','0');
781
782 Returns number of elements created
783
784 =cut
785
786
787 sub to_jsfind {
788 my $self = shift;
789 my ($path,$file) = @_;
790
791 return 0 if $self->is_empty;
792
793 confess("path is undefined.") unless ($path);
794 confess("file is undefined. Did you call \$t->root->to_jsfind(..) instead of \$t->to_jsfind(..) ?") unless (defined($file));
795
796 my $nr_keys = 0;
797
798 my ($k, $d, $s) = @$self;
799 my $N = $self->size;
800
801 my ($key_xml, $data_xml) = ("<n>","<d>");
802
803 for (my $i = 0; $i <= $N; $i++) {
804 my $key = lc($k->[$i]);
805
806 if ($key) {
807 $key_xml .= '<k>'.$self->to_xml($key).'</k>';
808 $data_xml .= '<e>';
809 #use Data::Dumper;
810 #print Dumper($d->[$i]);
811 foreach my $path (keys %{$d->[$i]}) {
812 $data_xml .= '<l f="'.($d->[$i]->{$path}->{'f'} || 1).'" t="'.$self->to_xml($d->[$i]->{$path}->{'t'} || 'no title').'">'.$self->to_xml($path).'</l>';
813 $nr_keys++;
814 }
815 $data_xml .= '</e>';
816 }
817
818 $nr_keys += $s->[$i]->to_jsfind("$path/$file","$i") if ($s->[$i]);
819 }
820
821 $key_xml .= '</n>';
822 $data_xml .= '</d>';
823
824 if (! -e $path) {
825 mkpath($path) || croak "can't create dir '$path': $!";
826 }
827
828 open(K, "> ${path}/${file}.xml") || croak "can't open '$path/$file.xml': $!";
829 open(D, "> ${path}/_${file}.xml") || croak "can't open '$path/_$file.xml': $!";
830
831 print K $key_xml;
832 print D $data_xml;
833
834 close(K);
835 close(D);
836
837 return $nr_keys;
838 }
839
840 1;
841 __END__
842
843 =head1 SEE ALSO
844
845 jsFind web site L<http://www.elucidsoft.net/projects/jsfind/>
846
847 B-Trees in perl web site L<http://perl.plover.com/BTree/>
848
849 =head1 AUTHORS
850
851 Mark-Jonson Dominus E<lt>mjd@pobox.comE<gt> wrote C<BTree.pm> which was
852 base for this module
853
854 Shawn P. Garbett E<lt>shawn@elucidsoft.netE<gt> wrote jsFind
855
856 Dobrica Pavlinusic E<lt>dpavlin@rot13.orgE<gt> wrote this module
857
858 =head1 COPYRIGHT AND LICENSE
859
860 Copyright (C) 2004 by Dobrica Pavlinusic
861
862 This program is free software; you can redistribute it and/or modify it
863 under the terms of the GNU General Public License as published by the Free
864 Software Foundation; either version 2 of the License, or (at your option)
865 any later version. This program is distributed in the hope that it will be
866 useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
867 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
868 Public License for more details.
869
870 =cut

  ViewVC Help
Powered by ViewVC 1.1.26