/[jsFind]/trunk/jsFind.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/jsFind.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 11 - (show annotations)
Mon Jul 26 20:30:12 2004 UTC (19 years, 9 months ago) by dpavlin
File size: 17354 byte(s)
to_jsfind will try to decode entities from data, and recode then to
target encoding (UTF-8 by default)

1 package jsFind;
2
3 use 5.008004;
4 use strict;
5 use warnings;
6 use HTML::Entities;
7
8 our $VERSION = '0.03';
9
10 =head1 NAME
11
12 jsFind - generate index for jsFind using B-Tree
13
14 =head1 SYNOPSIS
15
16 use jsFind;
17
18
19
20 =head1 DESCRIPTION
21
22 This module can be used to create index files for jsFind, powerful tool for
23 adding a search engine to a CDROM archive or catalog without requiring the
24 user to install anything.
25
26 Main difference between this module and scripts delivered with jsFind are:
27
28 =over 5
29
30 =item *
31
32 You don't need to use swish-e to create index
33
34 =item *
35
36 You can programatically (and incrementaly) create index for jsFind
37
38 =back
39
40 =head1 METHODS
41
42 This module contains two packages C<jsFind> and C<jsFind::Node>.
43
44 =head2 jsFind methods
45
46 =cut
47
48 use Exporter 'import';
49 use Carp;
50
51 our @ISA = qw(Exporter);
52
53 BEGIN {
54 import 'jsFind::Node';
55 }
56
57 =head3 new
58
59 Create new tree. Arguments are C<B> which is maximum numbers of keys in
60 each node and optional C<Root> node. Each root node may have child nodes.
61
62 All nodes are objects from C<jsFind::Node>.
63
64 my $t = new jsFind(B => 4);
65
66 =cut
67
68 my $DEBUG = 1;
69
70 sub new {
71 my $package = shift;
72 my %ARGV = @_;
73 croak "Usage: {$package}::new(B => number [, Root => root node ])"
74 unless exists $ARGV{B};
75 if ($ARGV{B} % 2) {
76 my $B = $ARGV{B} + 1;
77 carp "B must be an even number. Using $B instead.";
78 $ARGV{B} = $B;
79 }
80
81 my $B = $ARGV{B};
82 my $Root = exists($ARGV{Root}) ? $ARGV{Root} : jsFind::Node->emptynode;
83 bless { B => $B, Root => $Root } => $package;
84 }
85
86 =head3 B_search
87
88 Search, insert, append or replace data in B-Tree
89
90 $t->B_search(
91 Key => 'key value',
92 Data => { "path" => {
93 "t" => "title of document",
94 "f" => 99,
95 },
96 },
97 Insert => 1,
98 Append => 1,
99 );
100
101 Semantics:
102
103 If key not found, insert it iff C<Insert> argument is present.
104
105 If key B<is> found, replace existing data iff C<Replace> argument
106 is present or add new datum to existing iff C<Append> argument is present.
107
108 =cut
109
110 sub B_search {
111 my $self = shift;
112 my %args = @_;
113 my $cur_node = $self->root;
114 my $k = $args{Key};
115 my $d = $args{Data};
116 my @path;
117
118 if ($cur_node->is_empty) { # Special case for empty root
119 if ($args{Insert}) {
120 $cur_node->kdp_insert($k => $d);
121 return $d;
122 } else {
123 return undef;
124 }
125 }
126
127 # Descend tree to leaf
128 for (;;) {
129
130 # Didn't hit bottom yet.
131
132 my($there, $where) = $cur_node->locate_key($k);
133 if ($there) { # Found it!
134 if ($args{Replace}) {
135 $cur_node->kdp_replace($where, $k => $d);
136 } elsif ($args{Append}) {
137 $cur_node->kdp_append($where, $k => $d);
138 }
139 return $cur_node->data($where);
140 }
141
142 # Not here---must be in a subtree.
143
144 if ($cur_node->is_leaf) { # But there are no subtrees
145 return undef unless $args{Insert}; # Search failed
146 # Stuff it in
147 $cur_node->kdp_insert($k => $d);
148 if ($self->node_overfull($cur_node)) { # Oops--there was no room.
149 $self->split_and_promote($cur_node, @path);
150 }
151 return $d;
152 }
153
154 # There are subtrees, and the key is in one of them.
155
156 push @path, [$cur_node, $where]; # Record path from root.
157
158 # Move down to search the subtree
159 $cur_node = $cur_node->subnode($where);
160
161 # and start over.
162 } # for (;;) ...
163
164 croak ("How did I get here?");
165 }
166
167
168
169 sub split_and_promote_old {
170 my $self = shift;
171 my ($cur_node, @path) = @_;
172
173 for (;;) {
174 my ($newleft, $newright, $kdp) = $cur_node->halves($self->B / 2);
175 my ($up, $where) = @{pop @path};
176 if ($up) {
177 $up->kdp_insert(@$kdp);
178 my ($tthere, $twhere) = $up->locate_key($kdp->[0]);
179 croak "Couldn't find key `$kdp->[0]' in node after just inserting it!"
180 unless $tthere;
181 croak "`$kdp->[0]' went into node at `$twhere' instead of expected `$where'!"
182 unless $twhere == $where;
183 $up->subnode($where, $newleft);
184 $up->subnode($where+1, $newright);
185 return unless $self->node_overfull($up);
186 $cur_node = $up;
187 } else { # We're at the top; make a new root.
188 my $newroot = new jsFind::Node ([$kdp->[0]],
189 [$kdp->[1]],
190 [$newleft, $newright]);
191 $self->root($newroot);
192 return;
193 }
194 }
195
196 }
197
198 sub split_and_promote {
199 my $self = shift;
200 my ($cur_node, @path) = @_;
201
202 for (;;) {
203 my ($newleft, $newright, $kdp) = $cur_node->halves($self->B / 2);
204 my ($up, $where) = @{pop @path} if (@path);
205 if ($up) {
206 $up->kdp_insert(@$kdp);
207 if ($DEBUG) {
208 my ($tthere, $twhere) = $up->locate_key($kdp->[0]);
209 croak "Couldn't find key `$kdp->[0]' in node after just inserting it!"
210 unless $tthere;
211 croak "`$kdp->[0]' went into node at `$twhere' instead of expected `$where'!"
212 unless $twhere == $where;
213 }
214 $up->subnode($where, $newleft);
215 $up->subnode($where+1, $newright);
216 return unless $self->node_overfull($up);
217 $cur_node = $up;
218 } else { # We're at the top; make a new root.
219 my $newroot = new jsFind::Node([$kdp->[0]],
220 [$kdp->[1]],
221 [$newleft, $newright]);
222 $self->root($newroot);
223 return;
224 }
225 }
226 }
227
228 =head3 B
229
230 Return B (maximum number of keys)
231
232 my $max_size = $t->B;
233
234 =cut
235
236 sub B {
237 $_[0]{B};
238 }
239
240 =head3 root
241
242 Returns root node
243
244 my $root = $t->root;
245
246 =cut
247
248 sub root {
249 my ($self, $newroot) = @_;
250 $self->{Root} = $newroot if defined $newroot;
251 $self->{Root};
252 }
253
254 =head3 node_overfull
255
256 Returns if node is overfull
257
258 if ($node->node_overfull) { something }
259
260 =cut
261
262 sub node_overfull {
263 my $self = shift;
264 my $node = shift;
265 $node->size > $self->B;
266 }
267
268 =head3 to_string
269
270 Returns your tree as formatted string.
271
272 my $text = $root->to_string;
273
274 Mostly usefull for debugging as output leaves much to be desired.
275
276 =cut
277
278 sub to_string {
279 $_[0]->root->to_string;
280 }
281
282 =head3 to_dot
283
284 Create Graphviz graph of your tree
285
286 my $dot_graph = $root->to_dot;
287
288 =cut
289
290 sub to_dot {
291 my $self = shift;
292
293 my $dot = qq/digraph dns {\nrankdir=LR;\n/;
294 $dot .= $self->root->to_dot;
295 $dot .= qq/\n}\n/;
296
297 return $dot;
298 }
299
300 =head3 to_jsfind
301
302 Create xml index files for jsFind. This should be called after
303 your B-Tree has been filled with data.
304
305 $root->to_jsfind('/full/path/to/index/dir/');
306
307 Returns number of nodes in created tree.
308
309 There is also longer version if you want to recode your data charset
310 into different one (probably UTF-8):
311
312 $root->to_jsfind('/full/path/to/index/dir/','ISO-8859-2','UTF-8');
313
314 Destination encoding is UTF-8 by default, so you don't have to specify it.
315
316 $root->to_jsfind('/full/path/to/index/dir/','WINDOWS-1250');
317
318 =cut
319
320 my $iconv;
321 my $iconv_l1;
322
323 sub to_jsfind {
324 my $self = shift;
325
326 my $path = shift || confess "to_jsfind need path to your index!";
327
328 my ($from_cp,$to_cp) = @_;
329
330 $to_cp ||= 'UTF-8';
331
332 if ($from_cp && $to_cp) {
333 $iconv = Text::Iconv->new($from_cp,$to_cp);
334 }
335 $iconv_l1 = Text::Iconv->new('ISO-8859-1',$to_cp);
336
337 $path .= "/" if ($path =~ /\/$/);
338 #carp "creating directory for index '$path'" if (! -w $path);
339
340 return $self->root->to_jsfind($path,"0");
341 }
342
343
344 # private, default cmd function
345 sub default_cmp {
346 $_[0] cmp $_[1];
347 }
348
349 =head3 _recode
350
351 This is internal function to recode charset.
352
353 It will also try to decode HTML::Entities in data.
354
355 =cut
356
357 sub _recode {
358 my $self = shift;
359 my $text = shift || return;
360
361 sub _decode_html_entities {
362 my $data = shift || return;
363 $data = $iconv_l1->convert(decode_entities($data)) || croak "entity decode problem: $data";
364 }
365
366 if ($iconv) {
367 $text = $iconv->convert($text) || $text && carp "convert problem: $text";
368 $text =~ s/(\&\w+;)/_decode_html_entities($1)/ges;
369 }
370
371 return $text;
372 }
373
374 #####################################################################
375
376 =head2 jsFind::Node methods
377
378 Each node has C<k> key-data pairs, with C<B> <= C<k> <= C<2B>, and
379 each has C<k+1> subnodes, which might be null.
380
381 The node is a blessed reference to a list with three elements:
382
383 ($keylist, $datalist, $subnodelist)
384
385 each is a reference to a list list.
386
387 The null node is represented by a blessed reference to an empty list.
388
389 =cut
390
391 package jsFind::Node;
392
393 use warnings;
394 use strict;
395
396 use Carp;
397 use File::Path;
398 use Text::Iconv;
399
400 use base 'jsFind';
401
402 my $KEYS = 0;
403 my $DATA = 1;
404 my $SUBNODES = 2;
405
406 =head3 new
407
408 Create New node
409
410 my $node = new jsFind::Node ($keylist, $datalist, $subnodelist);
411
412 You can also mit argument list to create empty node.
413
414 my $empty_node = new jsFind::Node;
415
416 =cut
417
418 sub new {
419 my $self = shift;
420 my $package = ref $self || $self;
421 croak "Internal error: jsFind::Node::new called with wrong number of arguments."
422 unless @_ == 3 || @_ == 0;
423 bless [@_] => $package;
424 }
425
426 =head3 locate_key
427
428 Locate key in node using linear search. This should probably be replaced
429 by binary search for better performance.
430
431 my ($found, $index) = $node->locate_key($key, $cmp_coderef);
432
433 Argument C<$cmp_coderef> is optional reference to custom comparison
434 operator.
435
436 Returns (1, $index) if $key[$index] eq $key.
437
438 Returns (0, $index) if key could be found in $subnode[$index].
439
440 In scalar context, just returns 1 or 0.
441
442 =cut
443
444 sub locate_key {
445 # Use linear search for testing, replace with binary search.
446 my $self = shift;
447 my $key = shift;
448 my $cmp = shift || \&jsFind::default_cmp;
449 my $i;
450 my $cmp_result;
451 my $N = $self->size;
452 for ($i = 0; $i < $N; $i++) {
453 $cmp_result = &$cmp($key, $self->key($i));
454 last if $cmp_result <= 0;
455 }
456
457 # $i is now the index of the first node-key greater than $key
458 # or $N if there is no such. $cmp_result is 0 iff the key was found.
459 (!$cmp_result, $i);
460 }
461
462
463 =head3 emptynode
464
465 Creates new empty node
466
467 $node = $root->emptynode;
468 $new_node = $node->emptynode;
469
470 =cut
471
472 sub emptynode {
473 new($_[0]); # Pass package name, but not anything else.
474 }
475
476 =head3 is_empty
477
478 Test if node is empty
479
480 if ($node->is_empty) { something }
481
482 =cut
483
484 # undef is empty; so is a blessed empty list.
485 sub is_empty {
486 my $self = shift;
487 !defined($self) || $#$self < 0;
488 }
489
490 =head3 key
491
492 Return C<$i>th key from node
493
494 my $key = $node->key($i);
495
496 =cut
497
498 sub key {
499 # my ($self, $n) = @_;
500 # $self->[$KEYS][$n];
501
502 # speedup
503 $_[0]->[$KEYS][$_[1]];
504 }
505
506 =head3 data
507
508 Return C<$i>th data from node
509
510 my $data = $node->data($i);
511
512 =cut
513
514 sub data {
515 my ($self, $n) = @_;
516 $self->[$DATA][$n];
517 }
518
519 =head3 kdp_replace
520
521 Set key data pair for C<$i>th element in node
522
523 $node->kdp_replace($i, "key value" => {
524 "data key 1" => "data value 1",
525 "data key 2" => "data value 2",
526 };
527
528 =cut
529
530 sub kdp_replace {
531 my ($self, $n, $k => $d) = @_;
532 if (defined $k) {
533 $self->[$KEYS][$n] = $k;
534 $self->[$DATA][$n] = $d;
535 }
536 [$self->[$KEYS][$n],
537 $self->[$DATA][$n]];
538 }
539
540 =head3 kdp_insert
541
542 # No return value.
543
544 =cut
545
546 sub kdp_insert {
547 my $self = shift;
548 my ($k => $d) = @_;
549 my ($there, $where) = $self->locate_key($k) unless $self->is_empty;
550
551 if ($there) { croak("Tried to insert `$k => $d' into node where `$k' was already present."); }
552
553 # undef fix
554 $where ||= 0;
555
556 splice(@{$self->[$KEYS]}, $where, 0, $k);
557 splice(@{$self->[$DATA]}, $where, 0, $d);
558 splice(@{$self->[$SUBNODES]}, $where, 0, undef);
559 }
560
561 =head3 kdp_append
562
563 Adds new data keys and values to C<$i>th element in node
564
565 $node->kdp_append($i, "key value" => {
566 "added data key" => "added data value",
567 };
568
569 =cut
570
571 sub kdp_append {
572 my ($self, $n, $k => $d) = @_;
573 if (defined $k) {
574 $self->[$KEYS][$n] = $k;
575 my ($kv,$dv) = %{$d};
576 $self->[$DATA][$n]->{$kv} = $dv;
577 }
578 [$self->[$KEYS][$n],
579 $self->[$DATA][$n]];
580 }
581
582 =head3 subnode
583
584 Set new or return existing subnode
585
586 # return 4th subnode
587 my $my_node = $node->subnode(4);
588
589 # create new subnode 5 from $my_node
590 $node->subnode(5, $my_node);
591
592 =cut
593
594 sub subnode {
595 my ($self, $n, $newnode) = @_;
596 $self->[$SUBNODES][$n] = $newnode if defined $newnode;
597 $self->[$SUBNODES][$n];
598 }
599
600 =head3 is_leaf
601
602 Test if node is leaf
603
604 if ($node->is_leaf) { something }
605
606 =cut
607
608 sub is_leaf {
609 my $self = shift;
610 ! defined $self->[$SUBNODES][0]; # undefined subnode means leaf node.
611 }
612
613 =head3 size
614
615 Return number of keys in the node
616
617 my $nr = $node->size;
618
619 =cut
620
621 sub size {
622 my $self = shift;
623 return scalar(@{$self->[$KEYS]});
624 }
625
626 =head3 halves
627
628 # Accept an index $n
629 # Divide into two nodes so that keys 0 .. $n-1 are in one node
630 # and keys $n+1 ... $size are in the other.
631
632 =cut
633
634 sub halves {
635 my $self = shift;
636 my $n = shift;
637 my $s = $self->size;
638 my @right;
639 my @left;
640
641 $left[$KEYS] = [@{$self->[$KEYS]}[0 .. $n-1]];
642 $left[$DATA] = [@{$self->[$DATA]}[0 .. $n-1]];
643 $left[$SUBNODES] = [@{$self->[$SUBNODES]}[0 .. $n]];
644
645 $right[$KEYS] = [@{$self->[$KEYS]}[$n+1 .. $s-1]];
646 $right[$DATA] = [@{$self->[$DATA]}[$n+1 .. $s-1]];
647 $right[$SUBNODES] = [@{$self->[$SUBNODES]}[$n+1 .. $s]];
648
649 my @middle = ($self->[$KEYS][$n], $self->[$DATA][$n]);
650
651 ($self->new(@left), $self->new(@right), \@middle);
652 }
653
654 =head3 to_string
655
656 Dumps tree as string
657
658 my $str = $root->to_string;
659
660 =cut
661
662 sub to_string {
663 my $self = shift;
664 my $indent = shift || 0;
665 my $I = ' ' x $indent;
666 return '' if $self->is_empty;
667 my ($k, $d, $s) = @$self;
668 my $result = '';
669 $result .= defined($s->[0]) ? $s->[0]->to_string($indent+2) : '';
670 my $N = $self->size;
671 my $i;
672 for ($i = 0; $i < $N; $i++) {
673 # $result .= $I . "$k->[$i] => $d->[$i]\n";
674 $result .= $I . "$k->[$i]\n";
675 $result .= defined($s->[$i+1]) ? $s->[$i+1]->to_string($indent+2) : '';
676 }
677 $result;
678 }
679
680 =begin comment
681
682 use Data::Dumper;
683
684 sub to_string {
685 my $self = shift;
686 my $indent = shift || 0;
687 my $path = shift || '0';
688 return '' if $self->is_empty;
689 my ($k, $d, $s) = @$self;
690 my $result = '';
691 $result .= defined($s->[0]) ? $s->[0]->to_string($indent+1,"$path/0") : '';
692 my $N = $self->size;
693 for (my $i = 0; $i < $N; $i++) {
694 my $dump = Dumper($d->[$i]);
695 $dump =~ s/[\n\r\s]+/ /gs;
696 $dump =~ s/\$VAR1\s*=\s*//;
697 $result .= sprintf("%-5s [%2d] %2s: %s => %s\n", $path, $i, $indent, $k->[$i], $dump);
698 $result .= defined($s->[$i+1]) ? $s->[$i+1]->to_string($indent+1,"$path/$i") : '';
699 }
700 $result;
701 }
702
703 =end comment
704
705 =head3 to_dot
706
707 Recursivly walk nodes of tree
708
709 =cut
710
711 sub to_dot {
712 my $self = shift;
713 my $parent = shift;
714
715 return '' if $self->is_empty;
716
717 my $dot = '';
718
719 my ($k, $d, $s) = @$self;
720 my $N = $self->size;
721
722 my @dot_keys;
723
724 my $node_name = $parent || '_';
725 $node_name =~ s/\W+//g;
726 $node_name .= " [$N]";
727
728 for (my $i = 0; $i <= $N; $i++) {
729 if (my $key = $k->[$i]) {
730 push @dot_keys, qq{<$i>$key};
731 }
732 $dot .= $s->[$i]->to_dot(qq{"$node_name":$i}) if ($s->[$i]);
733 }
734 push @dot_keys, qq{<$N>...} if (! $self->is_leaf);
735
736 my $label = join("|",@dot_keys);
737 $dot .= qq{"$node_name" [ shape=record, label="$label" ];\n};
738
739 $dot .= qq{$parent -> "$node_name";\n} if ($parent);
740
741 $dot;
742 }
743
744 =head3 to_xml
745
746 Escape <, >, & and ", and to produce valid XML
747
748 =cut
749
750 my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
751 my $escape_re = join '|' => keys %escape;
752
753 sub to_xml {
754 my $self = shift || confess "you should call to_xml as object!";
755
756 my $d = shift || return;
757 $d = $self->SUPER::_recode($d);
758 confess "escape_re undefined!" unless ($escape_re);
759 $d =~ s/($escape_re)/$escape{$1}/g;
760 return $d;
761 }
762
763 =head3 to_jsfind
764
765 Create jsFind xml files
766
767 my $nr=$tree->to_jsfind('/path/to/index','0');
768
769 Returns number of elements created
770
771 =cut
772
773
774 sub to_jsfind {
775 my $self = shift;
776 my ($path,$file) = @_;
777
778 return 0 if $self->is_empty;
779
780 confess("path is undefined.") unless ($path);
781 confess("file is undefined. Did you call \$t->root->to_jsfind(..) instead of \$t->to_jsfind(..) ?") unless (defined($file));
782
783 my $nr_keys = 0;
784
785 my ($k, $d, $s) = @$self;
786 my $N = $self->size;
787
788 my ($key_xml, $data_xml) = ("<n>","<d>");
789
790 for (my $i = 0; $i <= $N; $i++) {
791 my $key = lc($k->[$i]);
792
793 if ($key) {
794 $key_xml .= '<k>'.$self->to_xml($key).'</k>';
795 $data_xml .= '<e>';
796 #use Data::Dumper;
797 #print Dumper($d->[$i]);
798 foreach my $path (keys %{$d->[$i]}) {
799 $data_xml .= '<l f="'.($d->[$i]->{$path}->{'f'} || 1).'" t="'.$self->to_xml($d->[$i]->{$path}->{'t'} || 'no title').'">'.$self->to_xml($path).'</l>';
800 $nr_keys++;
801 }
802 $data_xml .= '</e>';
803 }
804
805 $nr_keys += $s->[$i]->to_jsfind("$path/$file","$i") if ($s->[$i]);
806 }
807
808 $key_xml .= '</n>';
809 $data_xml .= '</d>';
810
811 if (! -e $path) {
812 mkpath($path) || croak "can't create dir '$path': $!";
813 }
814
815 open(K, "> ${path}/${file}.xml") || croak "can't open '$path/$file.xml': $!";
816 open(D, "> ${path}/_${file}.xml") || croak "can't open '$path/_$file.xml': $!";
817
818 print K $key_xml;
819 print D $data_xml;
820
821 close(K);
822 close(D);
823
824 return $nr_keys;
825 }
826
827 1;
828 __END__
829
830 =head1 SEE ALSO
831
832 jsFind web site L<http://www.elucidsoft.net/projects/jsfind/>
833
834 B-Trees in perl web site L<http://perl.plover.com/BTree/>
835
836 =head1 AUTHORS
837
838 Mark-Jonson Dominus E<lt>mjd@pobox.comE<gt> wrote C<BTree.pm> which was
839 base for this module
840
841 Shawn P. Garbett E<lt>shawn@elucidsoft.netE<gt> wrote jsFind
842
843 Dobrica Pavlinusic E<lt>dpavlin@rot13.orgE<gt> wrote this module
844
845 =head1 COPYRIGHT AND LICENSE
846
847 Copyright (C) 2004 by Dobrica Pavlinusic
848
849 This program is free software; you can redistribute it and/or modify it
850 under the terms of the GNU General Public License as published by the Free
851 Software Foundation; either version 2 of the License, or (at your option)
852 any later version. This program is distributed in the hope that it will be
853 useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
854 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
855 Public License for more details.
856
857 =cut

  ViewVC Help
Powered by ViewVC 1.1.26