--- trunk/Estraier.pm 2006/01/04 21:51:01 14 +++ trunk/Estraier.pm 2006/05/08 21:33:37 132 @@ -4,30 +4,73 @@ use strict; use warnings; -require Exporter; +our $VERSION = '0.06_1'; -our @ISA = qw(Exporter); +=head1 NAME -our %EXPORT_TAGS = ( 'all' => [ qw( -) ] ); +Search::Estraier - pure perl module to use Hyper Estraier search engine -our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); +=head1 SYNOPSIS -our @EXPORT = qw( -); +=head2 Simple indexer -our $VERSION = '0.00'; + use Search::Estraier; -use Carp; + # create and configure node + my $node = new Search::Estraier::Node( + url => 'http://localhost:1978/node/test', + user => 'admin', + passwd => 'admin' + ); -=head1 NAME + # create document + my $doc = new Search::Estraier::Document; -Search::Estraier - pure perl module to use Hyper Estraier search engine + # add attributes + $doc->add_attr('@uri', "http://estraier.gov/example.txt"); + $doc->add_attr('@title', "Over the Rainbow"); -=head1 SYNOPSIS + # add body text to document + $doc->add_text("Somewhere over the rainbow. Way up high."); + $doc->add_text("There's a land that I heard of once in a lullaby."); + + die "error: ", $node->status,"\n" unless (eval { $node->put_doc($doc) }); + +=head2 Simple searcher + + use Search::Estraier; + + # create and configure node + my $node = new Search::Estraier::Node( + url => 'http://localhost:1978/node/test', + user => 'admin', + passwd => 'admin', + croak_on_error => 1, + ); + + # create condition + my $cond = new Search::Estraier::Condition; + + # set search phrase + $cond->set_phrase("rainbow AND lullaby"); - use Search::Estraier; - my $est = new Search::Estraier(); + my $nres = $node->search($cond, 0); + + if (defined($nres)) { + print "Got ", $nres->hits, " results\n"; + + # for each document in results + for my $i ( 0 ... 
$nres->doc_num - 1 ) { + # get result document + my $rdoc = $nres->get_doc($i); + # display attributes + print "URI: ", $rdoc->attr('@uri'),"\n"; + print "Title: ", $rdoc->attr('@title'),"\n"; + print $rdoc->snippet,"\n"; + } + } else { + die "error: ", $node->status,"\n"; + } =head1 DESCRIPTION @@ -39,18 +82,44 @@ It is implemented as multiple packages which closely resemble Ruby implementation. It also includes methods to manage nodes. +There are few examples in C directory of this distribution. + =cut +=head1 Inheritable common methods + +These methods should really move somewhere else. + +=head2 _s + +Remove multiple whitespaces from string, as well as whitespaces at beginning or end + + my $text = $self->_s(" this is a text "); + $text = 'this is a text'; + +=cut + +sub _s { + my $text = $_[1]; + return unless defined($text); + $text =~ s/\s\s+/ /gs; + $text =~ s/^\s+//; + $text =~ s/\s+$//; + return $text; +} + package Search::Estraier::Document; use Carp qw/croak confess/; +use Search::Estraier; +our @ISA = qw/Search::Estraier/; + =head1 Search::Estraier::Document This class implements Document which is collection of attributes (key=value), vectors (also key value) display text and hidden text. -Document for HyperEstraier =head2 new @@ -97,12 +166,12 @@ } elsif ($line =~ m/^$/) { $in_text = 1; next; - } elsif ($line =~ m/^(.+)=(.+)$/) { + } elsif ($line =~ m/^(.+)=(.*)$/) { $self->{attrs}->{ $1 } = $2; next; } - warn "draft ignored: $line\n"; + warn "draft ignored: '$line'\n"; } } @@ -128,9 +197,9 @@ while (my ($name, $value) = each %{ $attrs }) { if (! 
defined($value)) { - delete( $self->{attrs}->{_s($name)} ); + delete( $self->{attrs}->{ $self->_s($name) } ); } else { - $self->{attrs}->{_s($name)} = _s($value); + $self->{attrs}->{ $self->_s($name) } = $self->_s($value); } } @@ -151,7 +220,7 @@ my $text = shift; return unless defined($text); - push @{ $self->{dtexts} }, _s($text); + push @{ $self->{dtexts} }, $self->_s($text); } @@ -168,9 +237,10 @@ my $text = shift; return unless defined($text); - push @{ $self->{htexts} }, _s($text); + push @{ $self->{htexts} }, $self->_s($text); } + =head2 id Get the ID number of document. If the object has never been registered, C<-1> is returned. @@ -184,6 +254,7 @@ return $self->{id}; } + =head2 attr_names Returns array with attribute names from document object. @@ -194,7 +265,8 @@ sub attr_names { my $self = shift; - croak "attr_names return array, not scalar" if (! wantarray); + return unless ($self->{attrs}); + #croak "attr_names return array, not scalar" if (! wantarray); return sort keys %{ $self->{attrs} }; } @@ -210,8 +282,8 @@ sub attr { my $self = shift; my $name = shift; - - return $self->{'attrs'}->{ $name }; + return unless (defined($name) && $self->{attrs}); + return $self->{attrs}->{ $name }; } @@ -225,10 +297,11 @@ sub texts { my $self = shift; - confess "texts return array, not scalar" if (! wantarray); - return @{ $self->{dtexts} }; + #confess "texts return array, not scalar" if (! wantarray); + return @{ $self->{dtexts} } if ($self->{dtexts}); } + =head2 cat_texts Return whole text as single scalar. @@ -239,9 +312,10 @@ sub cat_texts { my $self = shift; - return join(' ',@{ $self->{dtexts} }); + return join(' ',@{ $self->{dtexts} }) if ($self->{dtexts}); } + =head2 dump_draft Dump draft data from document object. @@ -255,7 +329,8 @@ my $draft; foreach my $attr_name (sort keys %{ $self->{attrs} }) { - $draft .= $attr_name . '=' . $self->{attrs}->{$attr_name} . "\n"; + next unless defined(my $v = $self->{attrs}->{$attr_name}); + $draft .= $attr_name . '=' . 
$v . "\n"; } if ($self->{kwords}) { @@ -268,18 +343,23 @@ $draft .= "\n"; - $draft .= join("\n", @{ $self->{dtexts} }) . "\n"; - $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n"; + $draft .= join("\n", @{ $self->{dtexts} }) . "\n" if ($self->{dtexts}); + $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n" if ($self->{htexts}); return $draft; } + =head2 delete Empty document object $doc->delete; +This function is an addition to the original Ruby API, and since it was included in C wrappers it's here as a +convenience. Document objects which go out of scope will be destroyed +automatically. + =cut sub delete { @@ -295,78 +375,1385 @@ } -=head2 _s -Remove multiple whitespaces from string, as well as whitespaces at beginning or end +package Search::Estraier::Condition; - my $text = _s(" this is a text "); - $text = 'this is a text'; +use Carp qw/carp confess croak/; + +use Search::Estraier; +our @ISA = qw/Search::Estraier/; + +=head1 Search::Estraier::Condition + +=head2 new + + my $cond = new Search::Estraier::Condition; =cut -sub _s { - my $text = shift || return; - $text =~ s/\s\s+/ /gs; - $text =~ s/^\s+//; - $text =~ s/\s+$//; - return $text; +sub new { + my $class = shift; + my $self = {}; + bless($self, $class); + + $self->{max} = -1; + $self->{options} = 0; + + $self ? 
return $self : return undef;
+}
+
+
+=head2 set_phrase
+
+  $cond->set_phrase('search phrase');
+
+Store the search phrase. The phrase is passed through C<_s> before it is
+stored.
+
+=cut
+
+sub set_phrase {
+    my $self = shift;
+    $self->{phrase} = $self->_s( shift );
+}
+
+
+=head2 add_attr
+
+  $cond->add_attr('@URI STRINC /~dpavlin/');
+
+Append one attribute condition. The expression is passed through C<_s>
+before it is stored. A false argument (undef, empty string or C<0>) is
+ignored.
+
+=cut
+
+sub add_attr {
+    my $self = shift;
+    my $attr = shift || return;
+    push @{ $self->{attrs} }, $self->_s( $attr );
+}
+
+
+=head2 set_order
+
+  $cond->set_order('@mdate NUMD');
+
+Store the order expression exactly as given.
+
+=cut
+
+sub set_order {
+    my $self = shift;
+    $self->{order} = shift;
 }

+=head2 set_max

-package Search::Estraier::Master;
+  $cond->set_max(42);

-use Carp;
+Store the maximum number of results. Croaks unless the argument is defined
+and consists of digits only.
+
+=cut
+
+sub set_max {
+    my $self = shift;
+    my $max = shift;
+    # test definedness first: an omitted argument must croak cleanly instead
+    # of raising "uninitialized value" warnings in the match and the message
+    my $shown = defined($max) ? $max : '';
+    croak "set_max needs number, not '$shown'" unless (defined($max) && $max =~ m/^\d+$/);
+    $self->{max} = $max;
+}
+
+
+=head2 set_options
+
+  $cond->set_options( 'SURE' );
+
+  $cond->set_options( qw/AGITO NOIDF SIMPLE/ );
+
+Possible options are:
+
+=over 8
+
+=item SURE
+
+check every N-gram
+
+=item USUAL
+
+check every second N-gram
+
+=item FAST
+
+check every third N-gram
+
+=item AGITO
+
+check every fourth N-gram
+
+=item NOIDF
+
+don't perform TF-IDF tuning
+
+=item SIMPLE
+
+use simplified query phrase
+
+=back

-=head1 Search::Estraier::Master
+Skipping N-grams will speed up search, but reduce accuracy. Every call to C<set_options> will reset previous
+options.

-Controll node master. This requires user with administration priviledges.
+This option changed in version C<0.04> of this module. It's backwards compatible.
=cut

-{
-    package RequestAgent;
-    @ISA = qw(LWP::UserAgent);
+# Bit mask for every option name accepted by set_options.
+my $options = {
+    SURE => 1 << 0,
+    USUAL => 1 << 1,
+    FAST => 1 << 2,
+    AGITO => 1 << 3,
+    NOIDF => 1 << 4,
+    SIMPLE => 1 << 10,
+};

-    sub new {
-        my $self = LWP::UserAgent::new(@_);
-        $self->agent("Search-Estraier/$Search::Estraer::VERSION");
-        $self;
+# Replace the stored option mask with the combination of all named options.
+# Croaks on an unknown option name.
+sub set_options {
+    my $self = shift;
+    my $opt = 0;
+    foreach my $option (@_) {
+        my $mask;
+        unless ($mask = $options->{$option}) {
+            if ($option eq '1') {
+                # a literal 1 is skipped instead of being rejected
+                next;
+            } else {
+                croak "unknown option $option";
+            }
+        }
+        # combine with bitwise OR: adding would let a repeated option carry
+        # into the bit of a different option (SURE twice would become USUAL)
+        $opt |= $mask;
     }
+    $self->{options} = $opt;
+}
+
+
+=head2 phrase
+
+Return search phrase.
+
+  print $cond->phrase;
+
+=cut
+
+sub phrase {
+    my $self = shift;
+    return $self->{phrase};
+}
+
+
+=head2 order
+
+Return search result order.
+
+  print $cond->order;
+
+=cut
+
+sub order {
+    my $self = shift;
+    return $self->{order};
+}
+
+
+=head2 attrs
+
+Return search result attrs.
+
+  my @cond_attrs = $cond->attrs;
+
+=cut
+
+sub attrs {
+    my $self = shift;
+    #croak "attrs return array, not scalar" if (! wantarray);
+    return @{ $self->{attrs} } if ($self->{attrs});
+}
+
+
+=head2 max
+
+Return maximum number of results.
+
+  print $cond->max;
+
+C<-1> is returned for uninitialized value, C<0> is unlimited.
+
+=cut
+
+sub max {
+    my $self = shift;
+    return $self->{max};
+}
+
+
+=head2 options
+
+Return options for this condition.
+
+  print $cond->options;
+
+Options are returned in numerical form.
+
+=cut
+
+sub options {
+    my $self = shift;
+    return $self->{options};
+}
+
+
+=head2 set_skip
+
+Set number of skipped documents from beginning of results
+
+  $cond->set_skip(42);
+
+Similar to C<OFFSET> in RDBMS.
+
+=cut
+
+sub set_skip {
+    my $self = shift;
+    $self->{skip} = shift;
+}
+
+=head2 skip
+
+Return skip for this condition.
+
+  print $cond->skip;
+
+=cut
-    sub get_basic_credentials {
-        my($self, $realm, $uri) = @_;
-#        return ($user, $password);
+sub skip {
+    my $self = shift;
+    return $self->{skip};
+}
+
+
+package Search::Estraier::ResultDocument;
+
+use Carp qw/croak/;
+
+#use Search::Estraier;
+#our @ISA = qw/Search::Estraier/;
+
+=head1 Search::Estraier::ResultDocument
+
+=head2 new
+
+  my $rdoc = new Search::Estraier::ResultDocument(
+    uri => 'http://localhost/document/uri/42',
+    attrs => {
+        foo => 1,
+        bar => 2,
+    },
+    snippet => 'this is a text of snippet',
+    keywords => 'this\tare\tkeywords'
+  );
+
+The key/value arguments become the fields of the object. Croaks if C<uri>
+is not defined.
+
+=cut
+
+sub new {
+    my $class = shift;
+    my $self = {@_};
+    bless($self, $class);
+
+    croak "missing uri for ResultDocument" unless defined($self->{uri});
+
+    # a blessed reference is always true, so it is returned directly
+    return $self;
+}
+
+
+=head2 uri
+
+Return URI of result document
+
+  print $rdoc->uri;
+
+=cut
+
+sub uri {
+    my $self = shift;
+    return $self->{uri};
+}
+
+
+=head2 attr_names
+
+Returns array with attribute names from result document object.
+
+  my @attrs = $rdoc->attr_names;
+
+Names are returned sorted. Croaks when not called in list context.
+
+=cut
+
+sub attr_names {
+    my $self = shift;
+    croak "attr_names return array, not scalar" if (! wantarray);
+    return sort keys %{ $self->{attrs} };
+}
+
+
+=head2 attr
+
+Returns value of an attribute.
+
+  my $value = $rdoc->attr( 'attribute' );
+
+Returns nothing when no name is given or the document has no attributes.
+
+=cut
+
+sub attr {
+    my $self = shift;
+    my $name = shift;
+    # same guard as Search::Estraier::Document::attr: test definedness so an
+    # attribute named "0" can be read, and do not autovivify a missing attrs hash
+    return unless (defined($name) && $self->{attrs});
+    return $self->{attrs}->{ $name };
+}
+
+
+=head2 snippet
+
+Return snippet from result document
+
+  print $rdoc->snippet;
+
+=cut
+
+sub snippet {
+    my $self = shift;
+    return $self->{snippet};
+}
+
+
+=head2 keywords
+
+Return keywords from result document
+
+  print $rdoc->keywords;
+
+=cut
+
+sub keywords {
+    my $self = shift;
+    return $self->{keywords};
+}
+
+
+package Search::Estraier::NodeResult;
+
+use Carp qw/croak/;
+
+#use Search::Estraier;
+#our @ISA = qw/Search::Estraier/;
+
+=head1 Search::Estraier::NodeResult
+
+=head2 new
+
+  my $res = new Search::Estraier::NodeResult(
+    docs => \@array_of_rdocs,
+    hints => \%hash_with_hints,
+  );
+
+Both values have to be passed as references, because the arguments are
+stored as a flat key/value list. Croaks if C<docs> or C<hints> is not defined.
+
+=cut
+
+sub new {
+    my $class = shift;
+    my $self = {@_};
+    bless($self, $class);
+
+    foreach my $f (qw/docs hints/) {
+        croak "missing $f for NodeResult" unless defined($self->{$f});
     }
+
+    # a blessed reference is always true, so it is returned directly
+    return $self;
+}
+
+
+=head2 doc_num
+
+Return number of documents
+
+  print $res->doc_num;
+
+This will return real number of documents (limited by C<max>).
+If you want to get total number of hits, see C<hits>.
+
+=cut
+
+sub doc_num {
+    my $self = shift;
+    return scalar @{ $self->{docs} };
+}
+
+
+=head2 get_doc
+
+Return single document
+
+  my $doc = $res->get_doc( 42 );
+
+Returns undef if document doesn't exist. Croaks unless the argument is
+defined and consists of digits only.
+
+=cut
+
+sub get_doc {
+    my $self = shift;
+    my $num = shift;
+    # test definedness first so an omitted argument croaks without
+    # "uninitialized value" warnings
+    my $shown = defined($num) ? $num : '';
+    croak "expect number as argument, not '$shown'" unless (defined($num) && $num =~ m/^\d+$/);
+    # compare against the last valid index; the argument is digits only, so
+    # it cannot be negative
+    return undef if ($num > $#{ $self->{docs} });
+    return $self->{docs}->[$num];
+}
+
+
+=head2 hint
+
+Return specific hint from results.
+
+  print $res->hint( 'VERSION' );
+
+Possible hints are: C, C, C, C, C, C,
+C