/[jsFind]/trunk/jsFind.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/jsFind.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 14 - (show annotations)
Sat Aug 28 15:19:22 2004 UTC (19 years, 8 months ago) by dpavlin
File size: 17911 byte(s)
final touches before first release to CPAN

1 package jsFind;
2
3 use 5.005;
4 use strict;
5 use warnings;
6 use HTML::Entities;
7
8 our $VERSION = '0.03';
9
10 use Exporter 'import';
11 use Carp;
12
13 our @ISA = qw(Exporter);
14
15 BEGIN {
16 import 'jsFind::Node';
17 }
18
19 =head1 NAME
20
21 jsFind - generate index for jsFind using B-Tree
22
23 =head1 SYNOPSIS
24
25 use jsFind;
26 my $t = new jsFind(B => 4);
27 my $f = 1;
28 foreach my $k (qw{minima ut dolorem sapiente voluptatem}) {
29 $t->B_search(Key => $k,
30 Data => {
31 "path" => {
32 t => "word $k",
33 f => $f },
34 },
35 Insert => 1,
36 Append => 1,
37 );
38 }
39
40 =head1 DESCRIPTION
41
42 This module can be used to create index files for jsFind, powerful tool for
43 adding a search engine to a CDROM archive or catalog without requiring the
44 user to install anything.
45
46 Main difference between this module and scripts delivered with jsFind are:
47
48 =over 5
49
50 =item *
51
52 You don't need to use swish-e to create index
53
54 =item *
55
56 You can programmatically (and incrementally) create index for jsFind
57
58 =back
59
60 You can also examine examples which come as tests with this module,
61 for example C<t/04words.t>.
62
63 =head1 jsFind methods
64
65 C<jsFind> is the module implementing the methods which you, the user, are
66 going to use to create indexes.
67
68 =head2 new
69
70 Create new tree. Arguments are C<B> which is maximum numbers of keys in
71 each node and optional C<Root> node. Each root node may have child nodes.
72
73 All nodes are objects from C<jsFind::Node>.
74
75 my $t = new jsFind(B => 4);
76
77 =cut
78
# When true, split_and_promote re-checks that a promoted key landed at the
# expected position in its parent node.
my $DEBUG = 1;

# Constructor: create a new B-Tree object.
#
# Named arguments:
#   B    - required; maximum number of keys per node.  Must be a positive
#          integer; an odd value is rounded up to the next even number and
#          a warning is issued.
#   Root - optional; root node to use.  Defaults to a fresh empty
#          jsFind::Node.
#
# Returns a hash-based object (keys B and Root) blessed into the invoking
# package.  Croaks if B is missing or is not a positive integer.
sub new {
    my $package = shift;
    my %args    = @_;

    # "${package}::new": the braces stop Perl from reading this as the
    # package variable $package::new.  (The previous "{$package}" printed
    # literal curly braces around the package name.)
    croak "Usage: ${package}::new(B => number [, Root => root node ])"
        unless exists $args{B};

    # A zero, negative or non-numeric B makes every node "overfull" and
    # produces nodes without keys, so reject it up front.
    my $B = $args{B};
    croak "B must be a positive integer"
        unless defined $B && $B =~ /\A\d+\z/ && $B > 0;

    if ($B % 2) {
        $B++;
        carp "B must be an even number. Using $B instead.";
    }

    my $Root = exists($args{Root}) ? $args{Root} : jsFind::Node->emptynode;
    return bless { B => $B, Root => $Root } => $package;
}
96
97 =head2 B_search
98
99 Search, insert, append or replace data in B-Tree
100
101 $t->B_search(
102 Key => 'key value',
103 Data => { "path" => {
104 "t" => "title of document",
105 "f" => 99,
106 },
107 },
108 Insert => 1,
109 Append => 1,
110 );
111
112 Semantics:
113
114 If key not found, insert it iff C<Insert> argument is present.
115
116 If key B<is> found, replace existing data iff C<Replace> argument
117 is present or add new datum to existing iff C<Append> argument is present.
118
119 =cut
120
# Search the tree for a key, optionally inserting, replacing or appending.
#
# Named arguments: Key, Data, and the flags Insert, Replace and Append.
#
# Returns the data stored under the key when it is found (after applying
# Replace or Append), the Data argument when the key was newly inserted,
# and undef when the key is absent and Insert was not requested.
sub B_search {
    my ($self, %opt) = @_;
    my $node = $self->root;
    my $key  = $opt{Key};
    my $data = $opt{Data};
    my @trail;    # [node, subnode index] pairs leading from the root down

    # An empty root has nothing to search: seed it or give up.
    if ($node->is_empty) {
        return undef unless $opt{Insert};
        $node->kdp_insert($key => $data);
        return $data;
    }

    # Walk down one level per iteration until the key or a leaf is reached.
    while (1) {
        my ($found, $pos) = $node->locate_key($key);

        if ($found) {
            if ($opt{Replace}) {
                $node->kdp_replace($pos, $key => $data);
            }
            elsif ($opt{Append}) {
                $node->kdp_append($pos, $key => $data);
            }
            return $node->data($pos);
        }

        # Not in this node.  A leaf has no subtrees left to look in.
        if ($node->is_leaf) {
            return undef unless $opt{Insert};

            $node->kdp_insert($key => $data);
            # The insert may have pushed the leaf over its key limit.
            $self->split_and_promote($node, @trail)
                if $self->node_overfull($node);
            return $data;
        }

        # Remember how we got here, then descend into the matching subtree.
        push @trail, [$node, $pos];
        $node = $node->subnode($pos);
    }

    croak ("How did I get here?");
}
177
178
179
# Earlier variant of split_and_promote that always runs the position sanity
# checks.  Splits $cur_node, pushes its middle key/data pair into the parent
# taken from @path, and repeats upwards while the parent is overfull; when
# the path is exhausted a new root is created.
#
# Arguments: the overfull node, followed by the [node, index] pairs recorded
# while descending from the root.  No meaningful return value.
sub split_and_promote_old {
    my $self = shift;
    my ($cur_node, @path) = @_;

    for (;;) {
        my ($newleft, $newright, $kdp) = $cur_node->halves($self->B / 2);

        # An exhausted path means the root itself is being split.  Popping
        # it would yield undef, and dereferencing that dies under strict
        # refs, so fall back to an empty list instead.
        my ($up, $where) = @path ? @{ pop @path } : ();

        if ($up) {
            $up->kdp_insert(@$kdp);
            my ($tthere, $twhere) = $up->locate_key($kdp->[0]);
            croak "Couldn't find key `$kdp->[0]' in node after just inserting it!"
                unless $tthere;
            croak "`$kdp->[0]' went into node at `$twhere' instead of expected `$where'!"
                unless $twhere == $where;
            $up->subnode($where, $newleft);
            $up->subnode($where+1, $newright);
            return unless $self->node_overfull($up);
            $cur_node = $up;
        }
        else {    # We're at the top; make a new root.
            my $newroot = jsFind::Node->new([$kdp->[0]],
                                            [$kdp->[1]],
                                            [$newleft, $newright]);
            $self->root($newroot);
            return;
        }
    }
}
208
# Split an overfull node and push its middle key/data pair into the parent,
# repeating upwards for as long as the parent becomes overfull in turn.
# When the path is exhausted the split node was the root, and a new root
# holding the promoted pair is installed.
#
# Arguments: the overfull node, followed by the [node, index] pairs recorded
# while descending from the root.  No meaningful return value.
sub split_and_promote {
    my ($self, $cur_node, @path) = @_;

    for (;;) {
        my ($newleft, $newright, $kdp) = $cur_node->halves($self->B / 2);

        # Plain declaration with a ternary: "my ... if COND" leaves the
        # variables in an undefined state whenever COND is false.
        my ($up, $where) = @path ? @{ pop @path } : ();

        if (!$up) {    # We're at the top; make a new root.
            my $newroot = jsFind::Node->new([$kdp->[0]],
                                            [$kdp->[1]],
                                            [$newleft, $newright]);
            $self->root($newroot);
            return;
        }

        $up->kdp_insert(@$kdp);
        if ($DEBUG) {
            my ($tthere, $twhere) = $up->locate_key($kdp->[0]);
            croak "Couldn't find key `$kdp->[0]' in node after just inserting it!"
                unless $tthere;
            croak "`$kdp->[0]' went into node at `$twhere' instead of expected `$where'!"
                unless $twhere == $where;
        }

        # The two halves take the places on either side of the promoted key.
        $up->subnode($where, $newleft);
        $up->subnode($where+1, $newright);

        return unless $self->node_overfull($up);
        $cur_node = $up;
    }
}
238
239 =head2 B
240
241 Return B (maximum number of keys)
242
243 my $max_size = $t->B;
244
245 =cut
246
# Accessor: the B value stored in the tree object.
sub B {
    my ($self) = @_;
    return $self->{B};
}
250
251 =head2 root
252
253 Returns root node
254
255 my $root = $t->root;
256
257 =cut
258
# Accessor for the root node.  A defined argument replaces the stored root;
# the (possibly updated) root is returned either way.
sub root {
    my $self    = shift;
    my $newroot = shift;
    if (defined $newroot) {
        $self->{Root} = $newroot;
    }
    return $self->{Root};
}
264
265 =head2 node_overfull
266
267 Returns if node is overfull
268
269 if ($t->node_overfull($node)) { something }
270
271 =cut
272
# True when the given node holds more keys than this tree's B allows.
sub node_overfull {
    my ($self, $node) = @_;
    return $node->size > $self->B;
}
278
279 =head2 to_string
280
281 Returns your tree as formatted string.
282
283 my $text = $root->to_string;
284
285 Mostly useful for debugging as output leaves much to be desired.
286
287 =cut
288
# Delegate to the root node's to_string.
sub to_string {
    my ($self) = @_;
    return $self->root->to_string;
}
292
293 =head2 to_dot
294
295 Create Graphviz graph of your tree
296
297 my $dot_graph = $root->to_dot;
298
299 =cut
300
# Wrap the root node's dot output in a "digraph" block and return it.
sub to_dot {
    my ($self) = @_;

    return qq/digraph dns {\nrankdir=LR;\n/
         . $self->root->to_dot
         . qq/\n}\n/;
}
310
311 =head2 to_jsfind
312
313 Create xml index files for jsFind. This should be called after
314 your B-Tree has been filled with data.
315
316 $root->to_jsfind('/full/path/to/index/dir/');
317
318 Returns number of nodes in created tree.
319
320 There is also longer version if you want to recode your data charset
321 into different one (probably UTF-8):
322
323 $root->to_jsfind('/full/path/to/index/dir/','ISO-8859-2','UTF-8');
324
325 Destination encoding is UTF-8 by default, so you don't have to specify it.
326
327 $root->to_jsfind('/full/path/to/index/dir/','WINDOWS-1250');
328
329 =cut
330
# Converters shared with _recode: $iconv recodes caller data from the source
# charset (set only when one is given), $iconv_l1 recodes decoded HTML
# entities from ISO-8859-1.
my $iconv;
my $iconv_l1;

# Write the whole tree as jsFind XML index files.
#
# Arguments:
#   $path    - directory for the index (required; confesses if missing)
#   $from_cp - optional source charset of the indexed data
#   $to_cp   - optional destination charset, default 'UTF-8'
#
# Returns whatever the root node's to_jsfind returns.
sub to_jsfind {
    my $self = shift;

    my $path = shift || confess "to_jsfind need path to your index!";

    my ($from_cp, $to_cp) = @_;

    $to_cp ||= 'UTF-8';

    # Always reassign $iconv: it lives at file scope, so without the reset a
    # source charset from an earlier call would keep being applied to later
    # calls that pass none.
    $iconv    = $from_cp ? Text::Iconv->new($from_cp, $to_cp) : undef;
    $iconv_l1 = Text::Iconv->new('ISO-8859-1', $to_cp);

    # The node writer joins directory and file name with "/" itself, so
    # drop redundant trailing slashes (but keep a bare "/").  The previous
    # test was inverted and appended a second slash to paths that already
    # ended in one.
    $path =~ s{/+\z}{} if length($path) > 1;
    #carp "creating directory for index '$path'" if (! -w $path);

    return $self->root->to_jsfind($path, "0");
}
353
354
# private, default comparison function: plain string comparison of its two
# arguments
sub default_cmp {
    my ($left, $right) = @_;
    return $left cmp $right;
}
359
360 =head2 _recode
361
362 This is internal function to recode charset.
363
364 It will also try to decode entities in data using L<HTML::Entities>.
365
366 =cut
367
# Helper for _recode: decode one HTML entity and recode the result from
# ISO-8859-1 using $iconv_l1.  Croaks when the conversion yields nothing.
sub _decode_html_entities {
    my $data = shift || return;
    my $decoded = $iconv_l1->convert(decode_entities($data));
    croak "entity decode problem: $data" unless $decoded;
    return $decoded;
}

# Recode $text with the converter set up by to_jsfind and decode any HTML
# entities in it.  Without a source-charset converter the text is returned
# untouched; false input returns nothing.
sub _recode {
    my $self = shift;
    my $text = shift || return;

    if ($iconv) {
        # Keep the original text when conversion fails.  The former
        # one-liner "convert(...) || $text && carp ..." assigned carp's
        # return value (1) to $text in that case.
        my $converted = $iconv->convert($text);
        if (defined $converted) {
            $text = $converted;
        }
        else {
            carp "convert problem: $text";
        }
        $text =~ s/(\&\w+;)/_decode_html_entities($1)/ges;
    }

    return $text;
}
384
385 #####################################################################
386
387 =head1 jsFind::Node methods
388
389 Each node has C<k> key-data pairs, with at most C<B> of them (a node
390 holding more than C<B> keys is split), and each has C<k+1> subnodes,
    which might be null.
391
392 The node is a blessed reference to a list with three elements:
393
394 ($keylist, $datalist, $subnodelist)
395
396 each is a reference to a list.
397
398 The null node is represented by a blessed reference to an empty list.
399
400 =cut
401
402 package jsFind::Node;
403
404 use warnings;
405 use strict;
406
407 use Carp;
408 use File::Path;
409 use Text::Iconv;
410
411 use base 'jsFind';
412
# Positions of the key list, data list and subnode list inside a node array.
my ($KEYS, $DATA, $SUBNODES) = (0, 1, 2);
416
417 =head2 new
418
419 Create New node
420
421 my $node = new jsFind::Node ($keylist, $datalist, $subnodelist);
422
423 You can also omit the argument list to create an empty node.
424
425 my $empty_node = new jsFind::Node;
426
427 =cut
428
# Node constructor.  Accepts either no arguments (empty node) or exactly
# three: key list, data list and subnode list.  May be called on a class
# name or on an existing node, whose class is then reused.
sub new {
    my ($invocant, @lists) = @_;
    my $class = ref($invocant) || $invocant;

    croak "Internal error: jsFind::Node::new called with wrong number of arguments."
        unless @lists == 0 || @lists == 3;

    return bless [@lists], $class;
}
436
437 =head2 locate_key
438
439 Locate key in node using linear search. This should probably be replaced
440 by binary search for better performance.
441
442 my ($found, $index) = $node->locate_key($key, $cmp_coderef);
443
444 Argument C<$cmp_coderef> is optional reference to custom comparison
445 operator.
446
447 Returns (1, $index) if $key[$index] eq $key.
448
449 Returns (0, $index) if key could be found in $subnode[$index].
450
451 In scalar context, just returns 1 or 0.
452
453 =cut
454
# Find $key among this node's keys by linear search.
#
# Arguments: the key, and an optional comparison coderef (defaults to
# jsFind::default_cmp) called as $cmp->($key, $node_key).
#
# In list context returns ($found, $index): $index is the position of the
# first node key that does not compare lower than $key, or the node size if
# there is none; $found is 1 when the key at $index compares equal, else 0.
# In scalar context returns just $found.
sub locate_key {
    my ($self, $key, $cmp) = @_;
    $cmp ||= \&jsFind::default_cmp;

    my $N     = $self->size;
    my $found = 0;    # stays 0 for a node without keys
    my $i;
    for ($i = 0; $i < $N; $i++) {
        my $cmp_result = $cmp->($key, $self->key($i));
        next if $cmp_result > 0;
        $found = 1 if $cmp_result == 0;
        last;
    }

    # A bare list "(flag, $i)" would evaluate to $i in scalar context, so
    # pick the return value explicitly.
    return wantarray ? ($found, $i) : $found;
}
472
473
474 =head2 emptynode
475
476 Creates new empty node
477
478 $node = $root->emptynode;
479 $new_node = $node->emptynode;
480
481 =cut
482
# Build an empty node of the same class as the invocant.
sub emptynode {
    my ($invocant) = @_;
    return new($invocant);    # class (or object) only, no lists
}
486
487 =head2 is_empty
488
489 Test if node is empty
490
491 if ($node->is_empty) { something }
492
493 =cut
494
# A node counts as empty when it is undef or a blessed array without
# elements.
sub is_empty {
    my ($node) = @_;
    return 1 if !defined($node);
    return @$node == 0;
}
500
501 =head2 key
502
503 Return C<$i>th key from node
504
505 my $key = $node->key($i);
506
507 =cut
508
# Return the key at the given index.  Arguments are read straight from @_
# (node first, index second) instead of being copied, as a speedup.
sub key {
    return $_[0][$KEYS]->[ $_[1] ];
}
516
517 =head2 data
518
519 Return C<$i>th data from node
520
521 my $data = $node->data($i);
522
523 =cut
524
# Return the data element stored at the given index.
sub data {
    my $self  = shift;
    my $index = shift;
    return $self->[$DATA]->[$index];
}
529
530 =head2 kdp_replace
531
532 Set key data pair for C<$i>th element in node
533
534 $node->kdp_replace($i, "key value" => {
535 "data key 1" => "data value 1",
536 "data key 2" => "data value 2",
537 });
538
539 =cut
540
# Overwrite the key/data pair at index $n when a defined key is supplied.
# Always returns a two-element array reference [key, data] for that index.
sub kdp_replace {
    my ($self, $n, $k, $d) = @_;

    if (defined $k) {
        ($self->[$KEYS][$n], $self->[$DATA][$n]) = ($k, $d);
    }

    return [ $self->[$KEYS][$n], $self->[$DATA][$n] ];
}
550
551 =head2 kdp_insert
552
553 Insert key/data pair in tree
554
555 $node->kdp_insert("key value" => "data value");
556
557 No return value.
558
559 =cut
560
# Insert a key/data pair into this node at its sorted position, adding an
# undefined subnode slot at the same index.  Croaks if the key is already
# present.  Returns nothing.
sub kdp_insert {
    my ($self, $k, $d) = @_;

    # Declare first, assign conditionally: "my ... unless COND" leaves the
    # variables in an undefined state.  An empty node has no keys to
    # search, so the pair simply goes to position 0.
    my ($there, $where) = (0, 0);
    ($there, $where) = $self->locate_key($k) unless $self->is_empty;

    if ($there) { croak("Tried to insert `$k => $d' into node where `$k' was already present."); }

    $where ||= 0;

    splice(@{$self->[$KEYS]},     $where, 0, $k);
    splice(@{$self->[$DATA]},     $where, 0, $d);
    splice(@{$self->[$SUBNODES]}, $where, 0, undef);

    return;
}
575
576 =head2 kdp_append
577
578 Adds new data keys and values to C<$i>th element in node
579
580 $node->kdp_append($i, "key value" => {
581 "added data key" => "added data value",
582 });
583
584 =cut
585
# Merge the pairs of hash reference $d into the data hash stored at index
# $n, setting the key at that index to $k.  Nothing is changed when $k is
# undefined.  Always returns a two-element array reference [key, data] for
# that index.
sub kdp_append {
    my ($self, $n, $k, $d) = @_;

    if (defined $k) {
        $self->[$KEYS][$n] = $k;

        # Copy every pair.  Taking just "my ($kv, $dv) = %$d" kept one
        # arbitrary pair of a multi-entry hash and stored a bogus ""
        # entry for an empty one.
        my $extra = $d || {};
        foreach my $name (keys %{$extra}) {
            $self->[$DATA][$n]->{$name} = $extra->{$name};
        }
    }

    return [ $self->[$KEYS][$n], $self->[$DATA][$n] ];
}
596
597 =head2 subnode
598
599 Set new or return existing subnode
600
601 # return 4th subnode
602 my $my_node = $node->subnode(4);
603
604 # create new subnode 5 from $my_node
605 $node->subnode(5, $my_node);
606
607 =cut
608
# Get the subnode at index $n; a defined third argument is stored there
# first.
sub subnode {
    my $self    = shift;
    my $n       = shift;
    my $newnode = shift;

    if (defined $newnode) {
        $self->[$SUBNODES][$n] = $newnode;
    }
    return $self->[$SUBNODES][$n];
}
614
615 =head2 is_leaf
616
617 Test if node is leaf
618
619 if ($node->is_leaf) { something }
620
621 =cut
622
# A node is a leaf when its first subnode slot is undefined.
sub is_leaf {
    my ($self) = @_;
    my $first_child = $self->[$SUBNODES][0];
    return !defined $first_child;
}
627
628 =head2 size
629
630 Return number of keys in the node
631
632 my $nr = $node->size;
633
634 =cut
635
# Number of keys held by this node.
sub size {
    my ($self) = @_;
    return scalar @{ $self->[$KEYS] };
}
640
641 =head2 halves
642
643 Split node into two halves so that keys C<0 .. $n-1> are in one node
644 and keys C<$n+1 ... $size> are in the other.
645
646 my ($left_node, $right_node, $kdp) = $node->halves($n);
647
648 =cut
649
# Split this node around index $n.
#
# Returns three values: a new node with keys/data 0 .. $n-1 and subnodes
# 0 .. $n, a new node with keys/data $n+1 .. size-1 and subnodes
# $n+1 .. size, and an array reference [key, data] for the element at $n.
sub halves {
    my ($self, $n) = @_;
    my $count = $self->size;

    my ($keys, $data, $subs) = @{$self}[$KEYS, $DATA, $SUBNODES];

    my @lower;
    $lower[$KEYS]     = [ @{$keys}[0 .. $n-1] ];
    $lower[$DATA]     = [ @{$data}[0 .. $n-1] ];
    $lower[$SUBNODES] = [ @{$subs}[0 .. $n] ];

    my @upper;
    $upper[$KEYS]     = [ @{$keys}[$n+1 .. $count-1] ];
    $upper[$DATA]     = [ @{$data}[$n+1 .. $count-1] ];
    $upper[$SUBNODES] = [ @{$subs}[$n+1 .. $count] ];

    my @pivot = ($keys->[$n], $data->[$n]);

    return ($self->new(@lower), $self->new(@upper), \@pivot);
}
669
670 =head2 to_string
671
672 Dumps tree as string
673
674 my $str = $root->to_string;
675
676 =cut
677
# Render this node and its subtrees as text, one key per line.  Each level
# of subnodes is indented two more spaces than its parent; subtrees are
# emitted between the keys they sit between.  The optional argument is the
# starting indent (default 0).  An empty node yields ''.
sub to_string {
    my ($self, $indent) = @_;
    $indent ||= 0;

    return '' if $self->is_empty;

    my $pad = ' ' x $indent;
    my ($keys, $subs) = @{$self}[$KEYS, $SUBNODES];

    my $out = defined($subs->[0]) ? $subs->[0]->to_string($indent+2) : '';

    foreach my $i (0 .. $self->size - 1) {
        $out .= $pad . "$keys->[$i]\n";
        $out .= $subs->[$i+1]->to_string($indent+2) if defined($subs->[$i+1]);
    }

    return $out;
}
695
696 =begin comment
697
698 use Data::Dumper;
699
700 sub to_string {
701 my $self = shift;
702 my $indent = shift || 0;
703 my $path = shift || '0';
704 return '' if $self->is_empty;
705 my ($k, $d, $s) = @$self;
706 my $result = '';
707 $result .= defined($s->[0]) ? $s->[0]->to_string($indent+1,"$path/0") : '';
708 my $N = $self->size;
709 for (my $i = 0; $i < $N; $i++) {
710 my $dump = Dumper($d->[$i]);
711 $dump =~ s/[\n\r\s]+/ /gs;
712 $dump =~ s/\$VAR1\s*=\s*//;
713 $result .= sprintf("%-5s [%2d] %2s: %s => %s\n", $path, $i, $indent, $k->[$i], $dump);
714 $result .= defined($s->[$i+1]) ? $s->[$i+1]->to_string($indent+1,"$path/$i") : '';
715 }
716 $result;
717 }
718
719 =end comment
720
721 =head2 to_dot
722
723 Recursively walk nodes of tree
724
725 =cut
726
# Emit Graphviz statements for this node and, recursively, its subnodes.
#
# $parent is the dot identifier of the parent port (absent for the root).
# The node name is derived from it by stripping non-word characters and
# appending the key count.  Returns '' for an empty node.
sub to_dot {
    my ($self, $parent) = @_;

    return '' if $self->is_empty;

    my ($keys, $subs) = @{$self}[$KEYS, $SUBNODES];
    my $count = $self->size;

    (my $name = $parent || '_') =~ s/\W+//g;
    $name .= " [$count]";

    my $out = '';
    my @cells;

    # One slot more than there are keys: the last subnode has no key.
    foreach my $i (0 .. $count) {
        my $key = $keys->[$i];
        push @cells, qq{<$i>$key} if $key;
        $out .= $subs->[$i]->to_dot(qq{"$name":$i}) if ($subs->[$i]);
    }
    push @cells, qq{<$count>...} unless $self->is_leaf;

    $out .= qq{"$name" [ shape=record, label="} . join("|", @cells) . qq{" ];\n};
    $out .= qq{$parent -> "$name";\n} if ($parent);

    return $out;
}
759
760 =head2 to_xml
761
762 Escape <, >, & and " to produce valid XML
763
764 =cut
765
# Characters that must be escaped in XML, and an alternation matching any
# of them.
my %escape = (
    '<' => '&lt;',
    '>' => '&gt;',
    '&' => '&amp;',
    '"' => '&quot;',
);
my $escape_re = join '|', keys %escape;

# Recode the given text through the parent class's _recode, then replace
# <, >, & and " with their XML entities.  Returns nothing for false input.
sub to_xml {
    my $self = shift;
    confess "you should call to_xml as object!" unless $self;

    my $text = shift;
    return unless $text;

    my $recoded = $self->SUPER::_recode($text);
    confess "escape_re undefined!" unless ($escape_re);
    $recoded =~ s/($escape_re)/$escape{$1}/g;

    return $recoded;
}
778
779 =head2 to_jsfind
780
781 Create jsFind xml files
782
783 my $nr=$tree->to_jsfind('/path/to/index','0');
784
785 Returns number of elements created
786
787 =cut
788
789
# Write this node as two jsFind XML files and recurse into its subnodes.
#
# Arguments:
#   $path - directory for this node's files (created if it does not exist)
#   $file - base name; keys go to "$path/$file.xml" and data to
#           "$path/_$file.xml".  Subnode $i is written below "$path/$file"
#           with base name $i.
#
# Returns the number of <l> entries written for this node and everything
# below it; 0 for an empty node.  Confesses on missing arguments, croaks on
# I/O failure.
sub to_jsfind {
    my ($self, $path, $file) = @_;

    return 0 if $self->is_empty;

    confess("path is undefined.") unless ($path);
    confess("file is undefined. Did you call \$t->root->to_jsfind(..) instead of \$t->to_jsfind(..) ?") unless (defined($file));

    my $nr_keys = 0;

    my ($k, $d, $s) = @$self;
    my $N = $self->size;

    my ($key_xml, $data_xml) = ("<n>", "<d>");

    # Slots 0 .. $N: the last slot can hold a subnode but never a key.
    for my $i (0 .. $N) {
        # Guard the lookup so the key-less last slot does not trigger an
        # "uninitialized value in lc" warning on every node.
        my $key = defined $k->[$i] ? lc($k->[$i]) : undef;

        if ($key) {
            $key_xml  .= '<k>' . $self->to_xml($key) . '</k>';
            $data_xml .= '<e>';

            my $entries = $d->[$i];
            # Sorted so that repeated runs produce identical files.
            foreach my $doc (sort keys %{$entries}) {
                my $info = $entries->{$doc} || {};
                $data_xml .= '<l f="' . ($info->{'f'} || 1)
                          . '" t="' . $self->to_xml($info->{'t'} || 'no title')
                          . '">' . $self->to_xml($doc) . '</l>';
                $nr_keys++;
            }
            $data_xml .= '</e>';
        }

        $nr_keys += $s->[$i]->to_jsfind("$path/$file", "$i") if ($s->[$i]);
    }

    $key_xml  .= '</n>';
    $data_xml .= '</d>';

    if (! -e $path) {
        mkpath($path) || croak "can't create dir '$path': $!";
    }

    # Three-argument open with lexical handles: the file name can never be
    # taken for a mode, and the handles cannot clash with other code.
    open(my $key_fh, '>', "${path}/${file}.xml")
        or croak "can't open '$path/$file.xml': $!";
    open(my $data_fh, '>', "${path}/_${file}.xml")
        or croak "can't open '$path/_$file.xml': $!";

    print {$key_fh} $key_xml;
    print {$data_fh} $data_xml;

    # Buffered write errors only surface at close time.
    close($key_fh)  or croak "can't close '$path/$file.xml': $!";
    close($data_fh) or croak "can't close '$path/_$file.xml': $!";

    return $nr_keys;
}
842
843 1;
844 __END__
845
846 =head1 SEE ALSO
847
848 jsFind web site L<http://www.elucidsoft.net/projects/jsfind/>
849
850 B-Trees in perl web site L<http://perl.plover.com/BTree/>
851
852 This module web site L<http://www.rot13.org/~dpavlin/jsFind.html>
853
854 =head1 AUTHORS
855
856 Mark-Jonson Dominus E<lt>mjd@pobox.comE<gt> wrote C<BTree.pm> which was
857 base for this module
858
859 Shawn P. Garbett E<lt>shawn@elucidsoft.netE<gt> wrote jsFind
860
861 Dobrica Pavlinusic E<lt>dpavlin@rot13.orgE<gt> wrote this module
862
863 =head1 COPYRIGHT AND LICENSE
864
865 Copyright (C) 2004 by Dobrica Pavlinusic
866
867 This program is free software; you can redistribute it and/or modify it
868 under the terms of the GNU General Public License as published by the Free
869 Software Foundation; either version 2 of the License, or (at your option)
870 any later version. This program is distributed in the hope that it will be
871 useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
872 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
873 Public License for more details.
874
875 =cut

  ViewVC Help
Powered by ViewVC 1.1.26