jsFind/trunk/jsFind.pm

package jsFind;

use 5.008004;
use strict;
use warnings;
use HTML::Entities;

our $VERSION = '0.03';

=head1 NAME

jsFind - generate index for jsFind using B-Tree

=head1 SYNOPSIS

  use jsFind;


=head1 DESCRIPTION

This module can be used to create index files for jsFind, powerful tool for
adding a search engine to a CDROM archive or catalog without requiring the
user to install anything. 

Main difference between this module and scripts delivered with jsFind are:

=over 5

=item *

You don't need to use swish-e to create index

=item *

You can programatically (and incrementaly) create index for jsFind

=back

=head1 METHODS

This module contains two packages C<jsFind> and C<jsFind::Node>.

=head2 jsFind methods

=cut

use Exporter 'import';
use Carp;

our @ISA = qw(Exporter);

BEGIN {
        import 'jsFind::Node';
}

=head3 new

Create new tree. Arguments are C<B> which is maximum numbers of keys in
each node and optional C<Root> node. Each root node may have child nodes.

All nodes are objects from C<jsFind::Node>.

 my $t = new jsFind(B => 4);

=cut

my $DEBUG = 1;

sub new {
  my $package = shift;
  my %ARGV = @_;
  croak "Usage: {$package}::new(B => number [, Root => root node ])"
      unless exists $ARGV{B};
  if ($ARGV{B} % 2) {
    my $B = $ARGV{B} + 1;
    carp "B must be an even number.  Using $B instead.";
    $ARGV{B} = $B;
  }
    
  my $B = $ARGV{B};
  my $Root = exists($ARGV{Root}) ? $ARGV{Root} : jsFind::Node->emptynode;
  bless { B => $B, Root => $Root } => $package;
}

=head3 B_search

Search, insert, append or replace data in B-Tree

 $t->B_search(
        Key => 'key value',
        Data => { "path" => {
                        "t" => "title of document",
                        "f" => 99,
                        },
                },
        Insert => 1,
        Append => 1,
 );

Semantics:

If key not found, insert it iff C<Insert> argument is present.

If key B<is> found, replace existing data iff C<Replace> argument
is present or add new datum to existing iff C<Append> argument is present.

=cut

sub B_search {
  my $self = shift;
  my %args = @_;
  my $cur_node = $self->root;
  my $k = $args{Key};
  my $d = $args{Data};
  my @path;

  if ($cur_node->is_empty) {    # Special case for empty root
    if ($args{Insert}) {
      $cur_node->kdp_insert($k => $d);
      return $d;
    } else {
      return undef;
    }
  }

  # Descend tree to leaf
  for (;;) {

    # Didn't hit bottom yet.

    my($there, $where) = $cur_node->locate_key($k);
    if ($there) {               # Found it!
      if ($args{Replace}) {
        $cur_node->kdp_replace($where, $k => $d);
      } elsif ($args{Append}) {
        $cur_node->kdp_append($where, $k => $d);
      }
      return $cur_node->data($where);
    }
    
    # Not here---must be in a subtree.
    
    if ($cur_node->is_leaf) {   # But there are no subtrees
      return undef unless $args{Insert}; # Search failed
      # Stuff it in
      $cur_node->kdp_insert($k => $d);
      if ($self->node_overfull($cur_node)) { # Oops--there was no room.
        $self->split_and_promote($cur_node, @path);
      } 
      return $d;
    }

    # There are subtrees, and the key is in one of them.

    push @path, [$cur_node, $where];    # Record path from root.

    # Move down to search the subtree
    $cur_node = $cur_node->subnode($where);

    # and start over.
  }                             # for (;;) ...

  croak ("How did I get here?");
}


sub split_and_promote_old {
  my $self = shift;
  my ($cur_node, @path) = @_;
  
  for (;;) {
    my ($newleft, $newright, $kdp) = $cur_node->halves($self->B / 2);
    my ($up, $where) = @{pop @path};
    if ($up) {
      $up->kdp_insert(@$kdp);
      my ($tthere, $twhere) = $up->locate_key($kdp->[0]);
      croak "Couldn't find key `$kdp->[0]' in node after just inserting it!"
          unless $tthere;
      croak "`$kdp->[0]' went into node at `$twhere' instead of expected `$where'!"
          unless $twhere == $where;
      $up->subnode($where,   $newleft);
      $up->subnode($where+1, $newright);
      return unless $self->node_overfull($up);
      $cur_node = $up;
    } else { # We're at the top; make a new root.
      my $newroot = new jsFind::Node ([$kdp->[0]], 
                                     [$kdp->[1]], 
                                     [$newleft, $newright]);
      $self->root($newroot);
      return;
    }
  }
  
}

sub split_and_promote {
  my $self = shift;
  my ($cur_node, @path) = @_;
  
  for (;;) {
    my ($newleft, $newright, $kdp) = $cur_node->halves($self->B / 2);
    my ($up, $where) = @{pop @path} if (@path);
    if ($up) {
      $up->kdp_insert(@$kdp);
      if ($DEBUG) {
        my ($tthere, $twhere) = $up->locate_key($kdp->[0]);
        croak "Couldn't find key `$kdp->[0]' in node after just inserting it!"
          unless $tthere;
        croak "`$kdp->[0]' went into node at `$twhere' instead of expected `$where'!"
          unless $twhere == $where;
      }
      $up->subnode($where,   $newleft);
      $up->subnode($where+1, $newright);
      return unless $self->node_overfull($up);
      $cur_node = $up;
    } else { # We're at the top; make a new root.
      my $newroot = new jsFind::Node([$kdp->[0]], 
                                     [$kdp->[1]], 
                                     [$newleft, $newright]);
      $self->root($newroot);
      return;
    }
  }
}

=head3 B

Return B (maximum number of keys)

 my $max_size = $t->B;

=cut

sub B {
  $_[0]{B};
}

=head3 root

Returns root node

 my $root = $t->root;

=cut

sub root {
  my ($self, $newroot) = @_;
  $self->{Root} = $newroot if defined $newroot;
  $self->{Root};
}

=head3 node_overfull

Returns if node is overfull

 if ($node->node_overfull) { something }

=cut

sub node_overfull {
  my $self = shift;
  my $node = shift;
  $node->size > $self->B;
}

=head3 to_string

Returns your tree as formatted string.

 my $text = $root->to_string;

Mostly usefull for debugging as output leaves much to be desired.

=cut

sub to_string {
  $_[0]->root->to_string;
}

=head3 to_dot

Create Graphviz graph of your tree

 my $dot_graph = $root->to_dot;

=cut

sub to_dot {
        my $self = shift;

        my $dot = qq/digraph dns {\nrankdir=LR;\n/;
        $dot .= $self->root->to_dot;
        $dot .= qq/\n}\n/;

        return $dot;
}

=head3 to_jsfind

Create xml index files for jsFind. This should be called after
your B-Tree has been filled with data.

 $root->to_jsfind('/full/path/to/index/dir/');

Returns number of nodes in created tree.

There is also longer version if you want to recode your data charset
into different one (probably UTF-8):

 $root->to_jsfind('/full/path/to/index/dir/','ISO-8859-2','UTF-8');

Destination encoding is UTF-8 by default, so you don't have to specify it.

 $root->to_jsfind('/full/path/to/index/dir/','WINDOWS-1250');

=cut

my $iconv;
my $iconv_l1;

sub to_jsfind {
        my $self = shift;

        my $path = shift || confess "to_jsfind need path to your index!";

        my ($from_cp,$to_cp) = @_;

        $to_cp ||= 'UTF-8';

        if ($from_cp && $to_cp) {
                $iconv = Text::Iconv->new($from_cp,$to_cp);
        }
        $iconv_l1 = Text::Iconv->new('ISO-8859-1',$to_cp);

        $path .= "/" if ($path =~ /\/$/);
        #carp "creating directory for index '$path'" if (! -w $path);

        return $self->root->to_jsfind($path,"0");
}


# private, default cmd function
sub default_cmp {
  $_[0] cmp $_[1];
}

=head3 _recode

This is internal function to recode charset.

It will also try to decode HTML::Entities in data.

=cut

sub _recode {
        my $self = shift;
        my $text = shift || return;

        sub _decode_html_entities {
                my $data = shift || return;
                $data = $iconv_l1->convert(decode_entities($data)) || croak "entity decode problem: $data";
        }

        if ($iconv) {
                $text = $iconv->convert($text) || $text && carp "convert problem: $text";
                $text =~ s/(\&\w+;)/_decode_html_entities($1)/ges;
        }

        return $text;
}

#####################################################################

=head2 jsFind::Node methods

Each node has C<k> key-data pairs, with C<B> <= C<k> <= C<2B>, and 
each has C<k+1> subnodes, which might be null.

The node is a blessed reference to a list with three elements:

  ($keylist, $datalist, $subnodelist)

each is a reference to a list list.

The null node is represented by a blessed reference to an empty list.

=cut

package jsFind::Node;

use warnings;
use strict;

use Carp;
use File::Path;
use Text::Iconv;

use base 'jsFind';

my $KEYS = 0;
my $DATA = 1;
my $SUBNODES = 2;

=head3 new

Create New node

 my $node = new jsFind::Node ($keylist, $datalist, $subnodelist);

You can also mit argument list to create empty node.

 my $empty_node = new jsFind::Node;

=cut

sub new {
  my $self = shift;
  my $package = ref $self || $self;
  croak "Internal error:  jsFind::Node::new called with wrong number of arguments."
      unless @_ == 3 || @_ == 0;
  bless [@_] => $package;
}

=head3 locate_key

Locate key in node using linear search. This should probably be replaced
by binary search for better performance.

 my ($found, $index) = $node->locate_key($key, $cmp_coderef);

Argument C<$cmp_coderef> is optional reference to custom comparison
operator.

Returns (1, $index) if $key[$index] eq $key.

Returns (0, $index) if key could be found in $subnode[$index].

In scalar context, just returns 1 or 0.

=cut

sub locate_key {
  # Use linear search for testing, replace with binary search.
  my $self = shift;
  my $key = shift;
  my $cmp = shift || \&jsFind::default_cmp;
  my $i;
  my $cmp_result;
  my $N = $self->size;
  for ($i = 0; $i < $N; $i++) {
    $cmp_result = &$cmp($key, $self->key($i));
    last if $cmp_result <= 0;
  }
  
  # $i is now the index of the first node-key greater than $key
  # or $N if there is no such.  $cmp_result is 0 iff the key was found.
  (!$cmp_result, $i);
}


=head3 emptynode

Creates new empty node

 $node = $root->emptynode;
 $new_node = $node->emptynode;

=cut

sub emptynode {
  new($_[0]);                   # Pass package name, but not anything else.
}

=head3 is_empty

Test if node is empty

 if ($node->is_empty) { something }

=cut

# undef is empty; so is a blessed empty list.
sub is_empty {
  my $self = shift;
  !defined($self) || $#$self < 0;
}

=head3 key

Return C<$i>th key from node

 my $key = $node->key($i);

=cut

sub key {
#  my ($self, $n) = @_;
#  $self->[$KEYS][$n];

#       speedup
   $_[0]->[$KEYS][$_[1]];
}

=head3 data

Return C<$i>th data from node

 my $data = $node->data($i);

=cut

sub data {
  my ($self, $n) = @_;
  $self->[$DATA][$n];
}

=head3 kdp_replace

Set key data pair for C<$i>th element in node

 $node->kdp_replace($i, "key value" => {
        "data key 1" => "data value 1",
        "data key 2" => "data value 2",
 };

=cut

sub kdp_replace {
  my ($self, $n, $k => $d) = @_;
  if (defined $k) {
    $self->[$KEYS][$n] = $k;
    $self->[$DATA][$n] = $d;
  }
  [$self->[$KEYS][$n], 
   $self->[$DATA][$n]];
}

=head3 kdp_insert

 # No return value.

=cut

sub kdp_insert {
  my $self = shift;
  my ($k => $d) = @_;
  my ($there, $where) = $self->locate_key($k) unless $self->is_empty;

  if ($there) { croak("Tried to insert `$k => $d' into node where `$k' was already present."); }

  # undef fix
  $where ||= 0;

  splice(@{$self->[$KEYS]}, $where, 0, $k);
  splice(@{$self->[$DATA]}, $where, 0, $d);
  splice(@{$self->[$SUBNODES]}, $where, 0, undef);
}

=head3 kdp_append

Adds new data keys and values to C<$i>th element in node

 $node->kdp_append($i, "key value" => {
        "added data key" => "added data value",
 };

=cut

sub kdp_append {
  my ($self, $n, $k => $d) = @_;
  if (defined $k) {
    $self->[$KEYS][$n] = $k;
    my ($kv,$dv) = %{$d};
    $self->[$DATA][$n]->{$kv} = $dv;
  }
  [$self->[$KEYS][$n], 
   $self->[$DATA][$n]];
}

=head3 subnode

Set new or return existing subnode

 # return 4th subnode
 my $my_node = $node->subnode(4);

 # create new subnode 5 from $my_node
 $node->subnode(5, $my_node);

=cut

sub subnode {
  my ($self, $n, $newnode) = @_;
  $self->[$SUBNODES][$n] = $newnode if defined $newnode;
  $self->[$SUBNODES][$n];
}

=head3 is_leaf

Test if node is leaf

 if ($node->is_leaf) { something }

=cut

sub is_leaf {
  my $self = shift;
  ! defined $self->[$SUBNODES][0]; # undefined subnode means leaf node.
}

=head3 size

Return number of keys in the node

 my $nr = $node->size;

=cut

sub size {
  my $self = shift;
  return scalar(@{$self->[$KEYS]});
}

=head3 halves

 # Accept an index $n
 # Divide into two nodes so that keys 0 .. $n-1 are in one node
 # and keys $n+1 ... $size are in the other.

=cut

sub halves {
  my $self = shift;
  my $n = shift;
  my $s = $self->size;
  my @right;
  my @left;

  $left[$KEYS] = [@{$self->[$KEYS]}[0 .. $n-1]];
  $left[$DATA] = [@{$self->[$DATA]}[0 .. $n-1]];
  $left[$SUBNODES] = [@{$self->[$SUBNODES]}[0 .. $n]];

  $right[$KEYS] = [@{$self->[$KEYS]}[$n+1 .. $s-1]];
  $right[$DATA] = [@{$self->[$DATA]}[$n+1 .. $s-1]];
  $right[$SUBNODES] = [@{$self->[$SUBNODES]}[$n+1 .. $s]];

  my @middle = ($self->[$KEYS][$n], $self->[$DATA][$n]);

  ($self->new(@left), $self->new(@right), \@middle);
}

=head3 to_string

Dumps tree as string

 my $str = $root->to_string;

=cut

sub to_string {
  my $self = shift;
  my $indent = shift || 0;
  my $I = ' ' x $indent;
  return '' if $self->is_empty;
  my ($k, $d, $s) = @$self;
  my $result = '';
  $result .= defined($s->[0]) ? $s->[0]->to_string($indent+2) : '';
  my $N = $self->size;
  my $i;
  for ($i = 0; $i < $N; $i++) {
#    $result .= $I . "$k->[$i] => $d->[$i]\n";
    $result .= $I . "$k->[$i]\n";
    $result .= defined($s->[$i+1]) ? $s->[$i+1]->to_string($indent+2) : '';
  }
  $result;
}

=begin comment

use Data::Dumper;

sub to_string {
  my $self = shift;
  my $indent = shift || 0;
  my $path = shift || '0';
  return '' if $self->is_empty;
  my ($k, $d, $s) = @$self;
  my $result = '';
  $result .= defined($s->[0]) ? $s->[0]->to_string($indent+1,"$path/0") : '';
  my $N = $self->size;
  for (my $i = 0; $i < $N; $i++) {
        my $dump = Dumper($d->[$i]);
        $dump =~ s/[\n\r\s]+/ /gs;
        $dump =~ s/\$VAR1\s*=\s*//;
    $result .= sprintf("%-5s [%2d] %2s: %s => %s\n", $path, $i, $indent, $k->[$i], $dump);
    $result .= defined($s->[$i+1]) ? $s->[$i+1]->to_string($indent+1,"$path/$i") : '';
  }
  $result;
}

=end comment

=head3 to_dot

Recursivly walk nodes of tree

=cut

sub to_dot {
        my $self = shift;
        my $parent = shift;

        return '' if $self->is_empty;

        my $dot = '';

        my ($k, $d, $s) = @$self;
        my $N = $self->size;

        my @dot_keys;

        my $node_name = $parent || '_';
        $node_name =~ s/\W+//g;
        $node_name .= " [$N]";

        for (my $i = 0; $i <= $N; $i++) {
                if (my $key = $k->[$i]) {
                        push @dot_keys, qq{<$i>$key};
                }
                $dot .= $s->[$i]->to_dot(qq{"$node_name":$i}) if ($s->[$i]);
        }
        push @dot_keys, qq{<$N>...} if (! $self->is_leaf);

        my $label = join("|",@dot_keys);
        $dot .= qq{"$node_name" [ shape=record, label="$label" ];\n};

        $dot .= qq{$parent -> "$node_name";\n} if ($parent);

        $dot;
}

=head3 to_xml

Escape <, >, & and ", and to produce valid XML

=cut

my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
my $escape_re  = join '|' => keys %escape;

sub to_xml {
        my $self = shift || confess "you should call to_xml as object!";

        my $d = shift || return;
        $d = $self->SUPER::_recode($d);
        confess "escape_re undefined!" unless ($escape_re);
        $d =~ s/($escape_re)/$escape{$1}/g;
        return $d;
}

=head3 to_jsfind

Create jsFind xml files

 my $nr=$tree->to_jsfind('/path/to/index','0');

Returns number of elements created

=cut


sub to_jsfind {
        my $self = shift;
        my ($path,$file) = @_;

        return 0 if $self->is_empty;

        confess("path is undefined.") unless ($path);
        confess("file is undefined. Did you call \$t->root->to_jsfind(..) instead of \$t->to_jsfind(..) ?") unless (defined($file));

        my $nr_keys = 0;

        my ($k, $d, $s) = @$self;
        my $N = $self->size;

        my ($key_xml, $data_xml) = ("<n>","<d>");

        for (my $i = 0; $i <= $N; $i++) {
                my $key = lc($k->[$i]);

                if ($key) {
                        $key_xml .= '<k>'.$self->to_xml($key).'</k>';
                        $data_xml .= '<e>';
        #use Data::Dumper;
        #print Dumper($d->[$i]);
                        foreach my $path (keys %{$d->[$i]}) {
                                $data_xml .= '<l f="'.($d->[$i]->{$path}->{'f'} || 1).'" t="'.$self->to_xml($d->[$i]->{$path}->{'t'} || 'no title').'">'.$self->to_xml($path).'</l>';
                                $nr_keys++;
                        }
                        $data_xml .= '</e>';
                }

                $nr_keys += $s->[$i]->to_jsfind("$path/$file","$i") if ($s->[$i]);
        }

        $key_xml .= '</n>';
        $data_xml .= '</d>';

        if (! -e $path) {
                mkpath($path) || croak "can't create dir '$path': $!";
        }

        open(K, "> ${path}/${file}.xml") || croak "can't open '$path/$file.xml': $!";
        open(D, "> ${path}/_${file}.xml") || croak "can't open '$path/_$file.xml': $!";

        print K $key_xml;
        print D $data_xml;

        close(K);
        close(D);

        return $nr_keys;
}

1;
__END__

=head1 SEE ALSO

jsFind web site L<http://www.elucidsoft.net/projects/jsfind/>

B-Trees in perl web site L<http://perl.plover.com/BTree/>

=head1 AUTHORS

Mark-Jonson Dominus E<lt>mjd@pobox.comE<gt> wrote C<BTree.pm> which was
base for this module

Shawn P. Garbett E<lt>shawn@elucidsoft.netE<gt> wrote jsFind

Dobrica Pavlinusic E<lt>dpavlin@rot13.orgE<gt> wrote this module

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2004 by Dobrica Pavlinusic

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your option)
any later version. This program is distributed in the hope that it will be
useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License for more details.

=cut
1	package jsFind;
2
3	use 5.008004;
4	use strict;
5	use warnings;
6	use HTML::Entities;
7
8	our $VERSION = '0.03';
9
10	=head1 NAME
11
12	jsFind - generate index for jsFind using B-Tree
13
14	=head1 SYNOPSIS
15
16	use jsFind;
17
18
19
20	=head1 DESCRIPTION
21
22	This module can be used to create index files for jsFind, powerful tool for
23	adding a search engine to a CDROM archive or catalog without requiring the
24	user to install anything.
25
26	Main difference between this module and scripts delivered with jsFind are:
27
28	=over 5
29
30	=item *
31
32	You don't need to use swish-e to create index
33
34	=item *
35
36	You can programatically (and incrementaly) create index for jsFind
37
38	=back
39
40	=head1 METHODS
41
42	This module contains two packages C<jsFind> and C<jsFind::Node>.
43
44	=head2 jsFind methods
45
46	=cut
47
48	use Exporter 'import';
49	use Carp;
50
51	our @ISA = qw(Exporter);
52
53	BEGIN {
54	import 'jsFind::Node';
55	}
56
57	=head3 new
58
59	Create new tree. Arguments are C<B> which is maximum numbers of keys in
60	each node and optional C<Root> node. Each root node may have child nodes.
61
62	All nodes are objects from C<jsFind::Node>.
63
64	my $t = new jsFind(B => 4);
65
66	=cut
67
68	my $DEBUG = 1;
69
70	sub new {
71	my $package = shift;
72	my %ARGV = @_;
73	croak "Usage: {$package}::new(B => number [, Root => root node ])"
74	unless exists $ARGV{B};
75	if ($ARGV{B} % 2) {
76	my $B = $ARGV{B} + 1;
77	carp "B must be an even number. Using $B instead.";
78	$ARGV{B} = $B;
79	}
80
81	my $B = $ARGV{B};
82	my $Root = exists($ARGV{Root}) ? $ARGV{Root} : jsFind::Node->emptynode;
83	bless { B => $B, Root => $Root } => $package;
84	}
85
86	=head3 B_search
87
88	Search, insert, append or replace data in B-Tree
89
90	$t->B_search(
91	Key => 'key value',
92	Data => { "path" => {
93	"t" => "title of document",
94	"f" => 99,
95	},
96	},
97	Insert => 1,
98	Append => 1,
99	);
100
101	Semantics:
102
103	If key not found, insert it iff C<Insert> argument is present.
104
105	If key B<is> found, replace existing data iff C<Replace> argument
106	is present or add new datum to existing iff C<Append> argument is present.
107
108	=cut
109
110	sub B_search {
111	my $self = shift;
112	my %args = @_;
113	my $cur_node = $self->root;
114	my $k = $args{Key};
115	my $d = $args{Data};
116	my @path;
117
118	if ($cur_node->is_empty) { # Special case for empty root
119	if ($args{Insert}) {
120	$cur_node->kdp_insert($k => $d);
121	return $d;
122	} else {
123	return undef;
124	}
125	}
126
127	# Descend tree to leaf
128	for (;;) {
129
130	# Didn't hit bottom yet.
131
132	my($there, $where) = $cur_node->locate_key($k);
133	if ($there) { # Found it!
134	if ($args{Replace}) {
135	$cur_node->kdp_replace($where, $k => $d);
136	} elsif ($args{Append}) {
137	$cur_node->kdp_append($where, $k => $d);
138	}
139	return $cur_node->data($where);
140	}
141
142	# Not here---must be in a subtree.
143
144	if ($cur_node->is_leaf) { # But there are no subtrees
145	return undef unless $args{Insert}; # Search failed
146	# Stuff it in
147	$cur_node->kdp_insert($k => $d);
148	if ($self->node_overfull($cur_node)) { # Oops--there was no room.
149	$self->split_and_promote($cur_node, @path);
150	}
151	return $d;
152	}
153
154	# There are subtrees, and the key is in one of them.
155
156	push @path, [$cur_node, $where]; # Record path from root.
157
158	# Move down to search the subtree
159	$cur_node = $cur_node->subnode($where);
160
161	# and start over.
162	} # for (;;) ...
163
164	croak ("How did I get here?");
165	}
166
167
168
169	sub split_and_promote_old {
170	my $self = shift;
171	my ($cur_node, @path) = @_;
172
173	for (;;) {
174	my ($newleft, $newright, $kdp) = $cur_node->halves($self->B / 2);
175	my ($up, $where) = @{pop @path};
176	if ($up) {
177	$up->kdp_insert(@$kdp);
178	my ($tthere, $twhere) = $up->locate_key($kdp->[0]);
179	croak "Couldn't find key `$kdp->[0]' in node after just inserting it!"
180	unless $tthere;
181	croak "`$kdp->[0]' went into node at `$twhere' instead of expected `$where'!"
182	unless $twhere == $where;
183	$up->subnode($where, $newleft);
184	$up->subnode($where+1, $newright);
185	return unless $self->node_overfull($up);
186	$cur_node = $up;
187	} else { # We're at the top; make a new root.
188	my $newroot = new jsFind::Node ([$kdp->[0]],
189	[$kdp->[1]],
190	[$newleft, $newright]);
191	$self->root($newroot);
192	return;
193	}
194	}
195
196	}
197
198	sub split_and_promote {
199	my $self = shift;
200	my ($cur_node, @path) = @_;
201
202	for (;;) {
203	my ($newleft, $newright, $kdp) = $cur_node->halves($self->B / 2);
204	my ($up, $where) = @{pop @path} if (@path);
205	if ($up) {
206	$up->kdp_insert(@$kdp);
207	if ($DEBUG) {
208	my ($tthere, $twhere) = $up->locate_key($kdp->[0]);
209	croak "Couldn't find key `$kdp->[0]' in node after just inserting it!"
210	unless $tthere;
211	croak "`$kdp->[0]' went into node at `$twhere' instead of expected `$where'!"
212	unless $twhere == $where;
213	}
214	$up->subnode($where, $newleft);
215	$up->subnode($where+1, $newright);
216	return unless $self->node_overfull($up);
217	$cur_node = $up;
218	} else { # We're at the top; make a new root.
219	my $newroot = new jsFind::Node([$kdp->[0]],
220	[$kdp->[1]],
221	[$newleft, $newright]);
222	$self->root($newroot);
223	return;
224	}
225	}
226	}
227
228	=head3 B
229
230	Return B (maximum number of keys)
231
232	my $max_size = $t->B;
233
234	=cut
235
236	sub B {
237	$_[0]{B};
238	}
239
240	=head3 root
241
242	Returns root node
243
244	my $root = $t->root;
245
246	=cut
247
248	sub root {
249	my ($self, $newroot) = @_;
250	$self->{Root} = $newroot if defined $newroot;
251	$self->{Root};
252	}
253
254	=head3 node_overfull
255
256	Returns if node is overfull
257
258	if ($node->node_overfull) { something }
259
260	=cut
261
262	sub node_overfull {
263	my $self = shift;
264	my $node = shift;
265	$node->size > $self->B;
266	}
267
268	=head3 to_string
269
270	Returns your tree as formatted string.
271
272	my $text = $root->to_string;
273
274	Mostly usefull for debugging as output leaves much to be desired.
275
276	=cut
277
278	sub to_string {
279	$_[0]->root->to_string;
280	}
281
282	=head3 to_dot
283
284	Create Graphviz graph of your tree
285
286	my $dot_graph = $root->to_dot;
287
288	=cut
289
290	sub to_dot {
291	my $self = shift;
292
293	my $dot = qq/digraph dns {\nrankdir=LR;\n/;
294	$dot .= $self->root->to_dot;
295	$dot .= qq/\n}\n/;
296
297	return $dot;
298	}
299
300	=head3 to_jsfind
301
302	Create xml index files for jsFind. This should be called after
303	your B-Tree has been filled with data.
304
305	$root->to_jsfind('/full/path/to/index/dir/');
306
307	Returns number of nodes in created tree.
308
309	There is also longer version if you want to recode your data charset
310	into different one (probably UTF-8):
311
312	$root->to_jsfind('/full/path/to/index/dir/','ISO-8859-2','UTF-8');
313
314	Destination encoding is UTF-8 by default, so you don't have to specify it.
315
316	$root->to_jsfind('/full/path/to/index/dir/','WINDOWS-1250');
317
318	=cut
319
320	my $iconv;
321	my $iconv_l1;
322
323	sub to_jsfind {
324	my $self = shift;
325
326	my $path = shift \|\| confess "to_jsfind need path to your index!";
327
328	my ($from_cp,$to_cp) = @_;
329
330	$to_cp \|\|= 'UTF-8';
331
332	if ($from_cp && $to_cp) {
333	$iconv = Text::Iconv->new($from_cp,$to_cp);
334	}
335	$iconv_l1 = Text::Iconv->new('ISO-8859-1',$to_cp);
336
337	$path .= "/" if ($path =~ /\/$/);
338	#carp "creating directory for index '$path'" if (! -w $path);
339
340	return $self->root->to_jsfind($path,"0");
341	}
342
343
344	# private, default cmd function
345	sub default_cmp {
346	$_[0] cmp $_[1];
347	}
348
349	=head3 _recode
350
351	This is internal function to recode charset.
352
353	It will also try to decode HTML::Entities in data.
354
355	=cut
356
357	sub _recode {
358	my $self = shift;
359	my $text = shift \|\| return;
360
361	sub _decode_html_entities {
362	my $data = shift \|\| return;
363	$data = $iconv_l1->convert(decode_entities($data)) \|\| croak "entity decode problem: $data";
364	}
365
366	if ($iconv) {
367	$text = $iconv->convert($text) \|\| $text && carp "convert problem: $text";
368	$text =~ s/(\&\w+;)/_decode_html_entities($1)/ges;
369	}
370
371	return $text;
372	}
373
374	#####################################################################
375
376	=head2 jsFind::Node methods
377
378	Each node has C<k> key-data pairs, with C<B> <= C<k> <= C<2B>, and
379	each has C<k+1> subnodes, which might be null.
380
381	The node is a blessed reference to a list with three elements:
382
383	($keylist, $datalist, $subnodelist)
384
385	each is a reference to a list list.
386
387	The null node is represented by a blessed reference to an empty list.
388
389	=cut
390
391	package jsFind::Node;
392
393	use warnings;
394	use strict;
395
396	use Carp;
397	use File::Path;
398	use Text::Iconv;
399
400	use base 'jsFind';
401
402	my $KEYS = 0;
403	my $DATA = 1;
404	my $SUBNODES = 2;
405
406	=head3 new
407
408	Create New node
409
410	my $node = new jsFind::Node ($keylist, $datalist, $subnodelist);
411
412	You can also mit argument list to create empty node.
413
414	my $empty_node = new jsFind::Node;
415
416	=cut
417
418	sub new {
419	my $self = shift;
420	my $package = ref $self \|\| $self;
421	croak "Internal error: jsFind::Node::new called with wrong number of arguments."
422	unless @_ == 3 \|\| @_ == 0;
423	bless [@_] => $package;
424	}
425
426	=head3 locate_key
427
428	Locate key in node using linear search. This should probably be replaced
429	by binary search for better performance.
430
431	my ($found, $index) = $node->locate_key($key, $cmp_coderef);
432
433	Argument C<$cmp_coderef> is optional reference to custom comparison
434	operator.
435
436	Returns (1, $index) if $key[$index] eq $key.
437
438	Returns (0, $index) if key could be found in $subnode[$index].
439
440	In scalar context, just returns 1 or 0.
441
442	=cut
443
444	sub locate_key {
445	# Use linear search for testing, replace with binary search.
446	my $self = shift;
447	my $key = shift;
448	my $cmp = shift \|\| \&jsFind::default_cmp;
449	my $i;
450	my $cmp_result;
451	my $N = $self->size;
452	for ($i = 0; $i < $N; $i++) {
453	$cmp_result = &$cmp($key, $self->key($i));
454	last if $cmp_result <= 0;
455	}
456
457	# $i is now the index of the first node-key greater than $key
458	# or $N if there is no such. $cmp_result is 0 iff the key was found.
459	(!$cmp_result, $i);
460	}
461
462
463	=head3 emptynode
464
465	Creates new empty node
466
467	$node = $root->emptynode;
468	$new_node = $node->emptynode;
469
470	=cut
471
472	sub emptynode {
473	new($_[0]); # Pass package name, but not anything else.
474	}
475
476	=head3 is_empty
477
478	Test if node is empty
479
480	if ($node->is_empty) { something }
481
482	=cut
483
484	# undef is empty; so is a blessed empty list.
485	sub is_empty {
486	my $self = shift;
487	!defined($self) \|\| $#$self < 0;
488	}
489
490	=head3 key
491
492	Return C<$i>th key from node
493
494	my $key = $node->key($i);
495
496	=cut
497
498	sub key {
499	# my ($self, $n) = @_;
500	# $self->[$KEYS][$n];
501
502	# speedup
503	$_[0]->[$KEYS][$_[1]];
504	}
505
506	=head3 data
507
508	Return C<$i>th data from node
509
510	my $data = $node->data($i);
511
512	=cut
513
514	sub data {
515	my ($self, $n) = @_;
516	$self->[$DATA][$n];
517	}
518
519	=head3 kdp_replace
520
521	Set key data pair for C<$i>th element in node
522
523	$node->kdp_replace($i, "key value" => {
524	"data key 1" => "data value 1",
525	"data key 2" => "data value 2",
526	};
527
528	=cut
529
530	sub kdp_replace {
531	my ($self, $n, $k => $d) = @_;
532	if (defined $k) {
533	$self->[$KEYS][$n] = $k;
534	$self->[$DATA][$n] = $d;
535	}
536	[$self->[$KEYS][$n],
537	$self->[$DATA][$n]];
538	}
539
540	=head3 kdp_insert
541
542	# No return value.
543
544	=cut
545
546	sub kdp_insert {
547	my $self = shift;
548	my ($k => $d) = @_;
549	my ($there, $where) = $self->locate_key($k) unless $self->is_empty;
550
551	if ($there) { croak("Tried to insert `$k => $d' into node where `$k' was already present."); }
552
553	# undef fix
554	$where \|\|= 0;
555
556	splice(@{$self->[$KEYS]}, $where, 0, $k);
557	splice(@{$self->[$DATA]}, $where, 0, $d);
558	splice(@{$self->[$SUBNODES]}, $where, 0, undef);
559	}
560
561	=head3 kdp_append
562
563	Adds new data keys and values to C<$i>th element in node
564
565	$node->kdp_append($i, "key value" => {
566	"added data key" => "added data value",
567	};
568
569	=cut
570
571	sub kdp_append {
572	my ($self, $n, $k => $d) = @_;
573	if (defined $k) {
574	$self->[$KEYS][$n] = $k;
575	my ($kv,$dv) = %{$d};
576	$self->[$DATA][$n]->{$kv} = $dv;
577	}
578	[$self->[$KEYS][$n],
579	$self->[$DATA][$n]];
580	}
581
582	=head3 subnode
583
584	Set new or return existing subnode
585
586	# return 4th subnode
587	my $my_node = $node->subnode(4);
588
589	# create new subnode 5 from $my_node
590	$node->subnode(5, $my_node);
591
592	=cut
593
594	sub subnode {
595	my ($self, $n, $newnode) = @_;
596	$self->[$SUBNODES][$n] = $newnode if defined $newnode;
597	$self->[$SUBNODES][$n];
598	}
599
600	=head3 is_leaf
601
602	Test if node is leaf
603
604	if ($node->is_leaf) { something }
605
606	=cut
607
608	sub is_leaf {
609	my $self = shift;
610	! defined $self->[$SUBNODES][0]; # undefined subnode means leaf node.
611	}
612
613	=head3 size
614
615	Return number of keys in the node
616
617	my $nr = $node->size;
618
619	=cut
620
621	sub size {
622	my $self = shift;
623	return scalar(@{$self->[$KEYS]});
624	}
625
626	=head3 halves
627
628	# Accept an index $n
629	# Divide into two nodes so that keys 0 .. $n-1 are in one node
630	# and keys $n+1 ... $size are in the other.
631
632	=cut
633
634	sub halves {
635	my $self = shift;
636	my $n = shift;
637	my $s = $self->size;
638	my @right;
639	my @left;
640
641	$left[$KEYS] = [@{$self->[$KEYS]}[0 .. $n-1]];
642	$left[$DATA] = [@{$self->[$DATA]}[0 .. $n-1]];
643	$left[$SUBNODES] = [@{$self->[$SUBNODES]}[0 .. $n]];
644
645	$right[$KEYS] = [@{$self->[$KEYS]}[$n+1 .. $s-1]];
646	$right[$DATA] = [@{$self->[$DATA]}[$n+1 .. $s-1]];
647	$right[$SUBNODES] = [@{$self->[$SUBNODES]}[$n+1 .. $s]];
648
649	my @middle = ($self->[$KEYS][$n], $self->[$DATA][$n]);
650
651	($self->new(@left), $self->new(@right), \@middle);
652	}
653
654	=head3 to_string
655
656	Dumps tree as string
657
658	my $str = $root->to_string;
659
660	=cut
661
662	sub to_string {
663	my $self = shift;
664	my $indent = shift \|\| 0;
665	my $I = ' ' x $indent;
666	return '' if $self->is_empty;
667	my ($k, $d, $s) = @$self;
668	my $result = '';
669	$result .= defined($s->[0]) ? $s->[0]->to_string($indent+2) : '';
670	my $N = $self->size;
671	my $i;
672	for ($i = 0; $i < $N; $i++) {
673	# $result .= $I . "$k->[$i] => $d->[$i]\n";
674	$result .= $I . "$k->[$i]\n";
675	$result .= defined($s->[$i+1]) ? $s->[$i+1]->to_string($indent+2) : '';
676	}
677	$result;
678	}
679
680	=begin comment
681
682	use Data::Dumper;
683
684	sub to_string {
685	my $self = shift;
686	my $indent = shift \|\| 0;
687	my $path = shift \|\| '0';
688	return '' if $self->is_empty;
689	my ($k, $d, $s) = @$self;
690	my $result = '';
691	$result .= defined($s->[0]) ? $s->[0]->to_string($indent+1,"$path/0") : '';
692	my $N = $self->size;
693	for (my $i = 0; $i < $N; $i++) {
694	my $dump = Dumper($d->[$i]);
695	$dump =~ s/[\n\r\s]+/ /gs;
696	$dump =~ s/\$VAR1\s=\s//;
697	$result .= sprintf("%-5s [%2d] %2s: %s => %s\n", $path, $i, $indent, $k->[$i], $dump);
698	$result .= defined($s->[$i+1]) ? $s->[$i+1]->to_string($indent+1,"$path/$i") : '';
699	}
700	$result;
701	}
702
703	=end comment
704
705	=head3 to_dot
706
707	Recursivly walk nodes of tree
708
709	=cut
710
711	sub to_dot {
712	my $self = shift;
713	my $parent = shift;
714
715	return '' if $self->is_empty;
716
717	my $dot = '';
718
719	my ($k, $d, $s) = @$self;
720	my $N = $self->size;
721
722	my @dot_keys;
723
724	my $node_name = $parent \|\| '_';
725	$node_name =~ s/\W+//g;
726	$node_name .= " [$N]";
727
728	for (my $i = 0; $i <= $N; $i++) {
729	if (my $key = $k->[$i]) {
730	push @dot_keys, qq{<$i>$key};
731	}
732	$dot .= $s->[$i]->to_dot(qq{"$node_name":$i}) if ($s->[$i]);
733	}
734	push @dot_keys, qq{<$N>...} if (! $self->is_leaf);
735
736	my $label = join("\|",@dot_keys);
737	$dot .= qq{"$node_name" [ shape=record, label="$label" ];\n};
738
739	$dot .= qq{$parent -> "$node_name";\n} if ($parent);
740
741	$dot;
742	}
743
744	=head3 to_xml
745
746	Escape <, >, & and ", and to produce valid XML
747
748	=cut
749
750	my %escape = ('<'=>'<', '>'=>'>', '&'=>'&', '"'=>'"');
751	my $escape_re = join '\|' => keys %escape;
752
753	sub to_xml {
754	my $self = shift \|\| confess "you should call to_xml as object!";
755
756	my $d = shift \|\| return;
757	$d = $self->SUPER::_recode($d);
758	confess "escape_re undefined!" unless ($escape_re);
759	$d =~ s/($escape_re)/$escape{$1}/g;
760	return $d;
761	}
762
763	=head3 to_jsfind
764
765	Create jsFind xml files
766
767	my $nr=$tree->to_jsfind('/path/to/index','0');
768
769	Returns number of elements created
770
771	=cut
772
773
774	sub to_jsfind {
775	my $self = shift;
776	my ($path,$file) = @_;
777
778	return 0 if $self->is_empty;
779
780	confess("path is undefined.") unless ($path);
781	confess("file is undefined. Did you call \$t->root->to_jsfind(..) instead of \$t->to_jsfind(..) ?") unless (defined($file));
782
783	my $nr_keys = 0;
784
785	my ($k, $d, $s) = @$self;
786	my $N = $self->size;
787
788	my ($key_xml, $data_xml) = ("<n>","<d>");
789
790	for (my $i = 0; $i <= $N; $i++) {
791	my $key = lc($k->[$i]);
792
793	if ($key) {
794	$key_xml .= '<k>'.$self->to_xml($key).'</k>';
795	$data_xml .= '<e>';
796	#use Data::Dumper;
797	#print Dumper($d->[$i]);
798	foreach my $path (keys %{$d->[$i]}) {
799	$data_xml .= '<l f="'.($d->[$i]->{$path}->{'f'} \|\| 1).'" t="'.$self->to_xml($d->[$i]->{$path}->{'t'} \|\| 'no title').'">'.$self->to_xml($path).'</l>';
800	$nr_keys++;
801	}
802	$data_xml .= '</e>';
803	}
804
805	$nr_keys += $s->[$i]->to_jsfind("$path/$file","$i") if ($s->[$i]);
806	}
807
808	$key_xml .= '</n>';
809	$data_xml .= '</d>';
810
811	if (! -e $path) {
812	mkpath($path) \|\| croak "can't create dir '$path': $!";
813	}
814
815	open(K, "> ${path}/${file}.xml") \|\| croak "can't open '$path/$file.xml': $!";
816	open(D, "> ${path}/_${file}.xml") \|\| croak "can't open '$path/_$file.xml': $!";
817
818	print K $key_xml;
819	print D $data_xml;
820
821	close(K);
822	close(D);
823
824	return $nr_keys;
825	}
826
827	1;
828	__END__
829
830	=head1 SEE ALSO
831
832	jsFind web site L<http://www.elucidsoft.net/projects/jsfind/>
833
834	B-Trees in perl web site L<http://perl.plover.com/BTree/>
835
836	=head1 AUTHORS
837
838	Mark-Jonson Dominus E<lt>mjd@pobox.comE<gt> wrote C<BTree.pm> which was
839	base for this module
840
841	Shawn P. Garbett E<lt>shawn@elucidsoft.netE<gt> wrote jsFind
842
843	Dobrica Pavlinusic E<lt>dpavlin@rot13.orgE<gt> wrote this module
844
845	=head1 COPYRIGHT AND LICENSE
846
847	Copyright (C) 2004 by Dobrica Pavlinusic
848
849	This program is free software; you can redistribute it and/or modify it
850	under the terms of the GNU General Public License as published by the Free
851	Software Foundation; either version 2 of the License, or (at your option)
852	any later version. This program is distributed in the hope that it will be
853	useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
854	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
855	Public License for more details.
856
857	=cut