--- trunk/jsFind.pm 2004/07/11 20:18:25 1 +++ trunk/jsFind.pm 2004/08/28 14:31:58 12 @@ -3,8 +3,18 @@ use 5.008004; use strict; use warnings; +use HTML::Entities; -our $VERSION = '0.01'; +our $VERSION = '0.03'; + +use Exporter 'import'; +use Carp; + +our @ISA = qw(Exporter); + +BEGIN { + import 'jsFind::Node'; +} =head1 NAME @@ -13,8 +23,19 @@ =head1 SYNOPSIS use jsFind; - - + my $t = new jsFind(B => 4); + my $f = 1; + foreach my $k (qw{minima ut dolorem sapiente voluptatem}) { + $t->B_search(Key => $k, + Data => { + "path" => { + t => "word $k", + f => $f }, + }, + Insert => 1, + Append => 1, + ); + } =head1 DESCRIPTION @@ -36,24 +57,12 @@ =back -=head1 METHODS - -This module contains two packages C and C. - -=head2 jsFind methods - -=cut - -use Exporter 'import'; -use Carp; +=head1 jsFind methods -our @ISA = qw(Exporter); +C is mode implementing methods which you, the user, are going to +use to create indexes. -BEGIN { - import 'jsFind::Node'; -} - -=head3 new +=head2 new Create new tree. Arguments are C which is maximum numbers of keys in each node and optional C node. Each root node may have child nodes. @@ -82,11 +91,20 @@ bless { B => $B, Root => $Root } => $package; } -=head3 B_search +=head2 B_search Search, insert, append or replace data in B-Tree - + $t->B_search( + Key => 'key value', + Data => { "path" => { + "t" => "title of document", + "f" => 99, + }, + }, + Insert => 1, + Append => 1, + ); Semantics: @@ -215,7 +233,7 @@ } } -=head3 B +=head2 B Return B (maximum number of keys) @@ -227,7 +245,7 @@ $_[0]{B}; } -=head3 root +=head2 root Returns root node @@ -241,7 +259,7 @@ $self->{Root}; } -=head3 node_overfull +=head2 node_overfull Returns if node is overfull @@ -255,7 +273,7 @@ $node->size > $self->B; } -=head3 to_string +=head2 to_string Returns your tree as formatted string. @@ -269,7 +287,7 @@ $_[0]->root->to_string; } -=head3 to_dot +=head2 to_dot Create Graphviz graph of your tree @@ -287,7 +305,7 @@ return $dot; } -=head3 to_jsfind +=head2 to_jsfind Create xml index files for jsFind. This should be called after your B-Tree has been filled with data. @@ -296,15 +314,36 @@ Returns number of nodes in created tree. +There is also longer version if you want to recode your data charset +into different one (probably UTF-8): + + $root->to_jsfind('/full/path/to/index/dir/','ISO-8859-2','UTF-8'); + +Destination encoding is UTF-8 by default, so you don't have to specify it. + + $root->to_jsfind('/full/path/to/index/dir/','WINDOWS-1250'); + =cut +my $iconv; +my $iconv_l1; + sub to_jsfind { my $self = shift; my $path = shift || confess "to_jsfind need path to your index!"; + my ($from_cp,$to_cp) = @_; + + $to_cp ||= 'UTF-8'; + + if ($from_cp && $to_cp) { + $iconv = Text::Iconv->new($from_cp,$to_cp); + } + $iconv_l1 = Text::Iconv->new('ISO-8859-1',$to_cp); + $path .= "/" if ($path =~ /\/$/); - carp "can't create index in '$path': $!" if (! -w $path); + #carp "creating directory for index '$path'" if (! -w $path); return $self->root->to_jsfind($path,"0"); } @@ -315,9 +354,34 @@ $_[0] cmp $_[1]; } +=head2 _recode + +This is internal function to recode charset. + +It will also try to decode entities in data using L. + +=cut + +sub _recode { + my $self = shift; + my $text = shift || return; + + sub _decode_html_entities { + my $data = shift || return; + $data = $iconv_l1->convert(decode_entities($data)) || croak "entity decode problem: $data"; + } + + if ($iconv) { + $text = $iconv->convert($text) || $text && carp "convert problem: $text"; + $text =~ s/(\&\w+;)/_decode_html_entities($1)/ges; + } + + return $text; +} + ##################################################################### -=head2 jsFind::Node methods +=head1 jsFind::Node methods Each node has C key-data pairs, with C <= C <= C<2B>, and each has C subnodes, which might be null. @@ -339,12 +403,15 @@ use Carp; use File::Path; +use Text::Iconv; + +use base 'jsFind'; my $KEYS = 0; my $DATA = 1; my $SUBNODES = 2; -=head3 new +=head2 new Create New node @@ -364,7 +431,7 @@ bless [@_] => $package; } -=head3 locate_key +=head2 locate_key Locate key in node using linear search. This should probably be replaced by binary search for better performance. @@ -401,7 +468,7 @@ } -=head3 emptynode +=head2 emptynode Creates new empty node @@ -414,7 +481,7 @@ new($_[0]); # Pass package name, but not anything else. } -=head3 is_empty +=head2 is_empty Test if node is empty @@ -428,7 +495,7 @@ !defined($self) || $#$self < 0; } -=head3 key +=head2 key Return C<$i>th key from node @@ -444,7 +511,7 @@ $_[0]->[$KEYS][$_[1]]; } -=head3 data +=head2 data Return C<$i>th data from node @@ -457,7 +524,7 @@ $self->[$DATA][$n]; } -=head3 kdp_replace +=head2 kdp_replace Set key data pair for C<$i>th element in node @@ -478,9 +545,13 @@ $self->[$DATA][$n]]; } -=head3 kdp_insert +=head2 kdp_insert + +Insert key/data pair in tree + + $node->kdp_insert("key value" => "data value"); - # No return value. +No return value. =cut @@ -499,7 +570,7 @@ splice(@{$self->[$SUBNODES]}, $where, 0, undef); } -=head3 kdp_append +=head2 kdp_append Adds new data keys and values to C<$i>th element in node @@ -520,7 +591,7 @@ $self->[$DATA][$n]]; } -=head3 subnode +=head2 subnode Set new or return existing subnode @@ -538,7 +609,7 @@ $self->[$SUBNODES][$n]; } -=head3 is_leaf +=head2 is_leaf Test if node is leaf @@ -551,7 +622,7 @@ ! defined $self->[$SUBNODES][0]; # undefined subnode means leaf node. } -=head3 size +=head2 size Return number of keys in the node @@ -564,11 +635,12 @@ return scalar(@{$self->[$KEYS]}); } -=head3 halves +=head2 halves - # Accept an index $n - # Divide into two nodes so that keys 0 .. $n-1 are in one node - # and keys $n+1 ... $size are in the other. +Split node into two halves so that keys C<0 .. $n-1> are in one node +and keys C<$n+1 ... $size> are in the other. + + my ($left_node, $right_node, $kdp) = $node->halves($n); =cut @@ -592,7 +664,7 @@ ($self->new(@left), $self->new(@right), \@middle); } -=head3 to_string +=head2 to_string Dumps tree as string @@ -643,7 +715,7 @@ =end comment -=head3 to_dot +=head2 to_dot Recursivly walk nodes of tree @@ -682,22 +754,45 @@ $dot; } -=head3 to_jsfind +=head2 to_xml + +Escape <, >, & and ", and to produce valid XML + +=cut + +my %escape = ('<'=>'<', '>'=>'>', '&'=>'&', '"'=>'"'); +my $escape_re = join '|' => keys %escape; + +sub to_xml { + my $self = shift || confess "you should call to_xml as object!"; + + my $d = shift || return; + $d = $self->SUPER::_recode($d); + confess "escape_re undefined!" unless ($escape_re); + $d =~ s/($escape_re)/$escape{$1}/g; + return $d; +} + +=head2 to_jsfind Create jsFind xml files - my $nr=$tree->to_dot('/path/to/index','0'); + my $nr=$tree->to_jsfind('/path/to/index','0'); Returns number of elements created =cut + sub to_jsfind { my $self = shift; my ($path,$file) = @_; return 0 if $self->is_empty; + confess("path is undefined.") unless ($path); + confess("file is undefined. Did you call \$t->root->to_jsfind(..) instead of \$t->to_jsfind(..) ?") unless (defined($file)); + my $nr_keys = 0; my ($k, $d, $s) = @$self; @@ -709,22 +804,22 @@ my $key = lc($k->[$i]); if ($key) { - $key_xml .= qq{$key}; - $data_xml .= qq{}; + $key_xml .= ''.$self->to_xml($key).''; + $data_xml .= ''; #use Data::Dumper; #print Dumper($d->[$i]); foreach my $path (keys %{$d->[$i]}) { - $data_xml .= ''.$path.''; + $data_xml .= ''.$self->to_xml($path).''; $nr_keys++; } - $data_xml .= qq{}; + $data_xml .= ''; } $nr_keys += $s->[$i]->to_jsfind("$path/$file","$i") if ($s->[$i]); } - $key_xml .= ""; - $data_xml .= ""; + $key_xml .= ''; + $data_xml .= ''; if (! -e $path) { mkpath($path) || croak "can't create dir '$path': $!";