--- trunk/Estraier.pm 2006/01/04 22:48:29 18 +++ trunk/Estraier.pm 2006/11/05 15:53:13 186 @@ -4,7 +4,7 @@ use strict; use warnings; -our $VERSION = '0.00'; +our $VERSION = '0.08_1'; =head1 NAME @@ -12,8 +12,68 @@ =head1 SYNOPSIS - use Search::Estraier; - my $est = new Search::Estraier(); +=head2 Simple indexer + + use Search::Estraier; + + # create and configure node + my $node = new Search::Estraier::Node( + url => 'http://localhost:1978/node/test', + user => 'admin', + passwd => 'admin', + create => 1, + label => 'Label for node', + croak_on_error => 1, + ); + + # create document + my $doc = new Search::Estraier::Document; + + # add attributes + $doc->add_attr('@uri', "http://estraier.gov/example.txt"); + $doc->add_attr('@title', "Over the Rainbow"); + + # add body text to document + $doc->add_text("Somewhere over the rainbow. Way up high."); + $doc->add_text("There's a land that I heard of once in a lullaby."); + + die "error: ", $node->status,"\n" unless (eval { $node->put_doc($doc) }); + +=head2 Simple searcher + + use Search::Estraier; + + # create and configure node + my $node = new Search::Estraier::Node( + url => 'http://localhost:1978/node/test', + user => 'admin', + passwd => 'admin', + croak_on_error => 1, + ); + + # create condition + my $cond = new Search::Estraier::Condition; + + # set search phrase + $cond->set_phrase("rainbow AND lullaby"); + + my $nres = $node->search($cond, 0); + + if (defined($nres)) { + print "Got ", $nres->hits, " results\n"; + + # for each document in results + for my $i ( 0 ... $nres->doc_num - 1 ) { + # get result document + my $rdoc = $nres->get_doc($i); + # display attribte + print "URI: ", $rdoc->attr('@uri'),"\n"; + print "Title: ", $rdoc->attr('@title'),"\n"; + print $rdoc->snippet,"\n"; + } + } else { + die "error: ", $node->status,"\n"; + } =head1 DESCRIPTION @@ -25,8 +85,14 @@ It is implemented as multiple packages which closly resamble Ruby implementation. It also includes methods to manage nodes. +There are few examples in C directory of this distribution. + =cut +=head1 Inheritable common methods + +This methods should really move somewhere else. + =head2 _s Remove multiple whitespaces from string, as well as whitespaces at beginning or end @@ -37,7 +103,8 @@ =cut sub _s { - my $text = $_[1] || return; + my $text = $_[1]; + return unless defined($text); $text =~ s/\s\s+/ /gs; $text =~ s/^\s+//; $text =~ s/\s+$//; @@ -53,8 +120,34 @@ =head1 Search::Estraier::Document -This class implements Document which is collection of attributes -(key=value), vectors (also key value) display text and hidden text. +This class implements Document which is single item in Hyper Estraier. + +It's is collection of: + +=over 4 + +=item attributes + +C<< 'key' => 'value' >> pairs which can later be used for filtering of results + +You can add common filters to C in estmaster's C<_conf> +file for better performance. See C in +L. + +=item vectors + +also C<< 'key' => 'value' >> pairs + +=item display text + +Text which will be used to create searchable corpus of your index and +included in snippet output. + +=item hidden text + +Text which will be searchable, but will not be included in snippet. + +=back =head2 new @@ -89,11 +182,15 @@ if ($line =~ m/^%VECTOR\t(.+)$/) { my @fields = split(/\t/, $1); - for my $i ( 0 .. ($#fields - 1) ) { - $self->{kwords}->{ $fields[ $i ] } = $fields[ $i + 1 ]; - $i++; + if ($#fields % 2 == 1) { + $self->{kwords} = { @fields }; + } else { + warn "can't decode $line\n"; } next; + } elsif ($line =~ m/^%SCORE\t(.+)$/) { + $self->{score} = $1; + next; } elsif ($line =~ m/^%/) { # What is this? comment? #warn "$line\n"; @@ -101,12 +198,12 @@ } elsif ($line =~ m/^$/) { $in_text = 1; next; - } elsif ($line =~ m/^(.+)=(.+)$/) { + } elsif ($line =~ m/^(.+)=(.*)$/) { $self->{attrs}->{ $1 } = $2; next; } - warn "draft ignored: $line\n"; + warn "draft ignored: '$line'\n"; } } @@ -175,6 +272,54 @@ push @{ $self->{htexts} }, $self->_s($text); } +=head2 add_vectors + +Add a vectors + + $doc->add_vector( + 'vector_name' => 42, + 'another' => 12345, + ); + +=cut + +sub add_vectors { + my $self = shift; + return unless (@_); + + # this is ugly, but works + die "add_vector needs HASH as argument" unless ($#_ % 2 == 1); + + $self->{kwords} = {@_}; +} + +=head2 set_score + +Set the substitute score + + $doc->set_score(12345); + +=cut + +sub set_score { + my $self = shift; + my $score = shift; + return unless (defined($score)); + $self->{score} = $score; +} + +=head2 score + +Get the substitute score + +=cut + +sub score { + my $self = shift; + return -1 unless (defined($self->{score})); + return $self->{score}; +} + =head2 id Get the ID number of document. If the object has never been registred, C<-1> is returned. @@ -188,6 +333,7 @@ return $self->{id}; } + =head2 attr_names Returns array with attribute names from document object. @@ -198,7 +344,8 @@ sub attr_names { my $self = shift; - croak "attr_names return array, not scalar" if (! wantarray); + return unless ($self->{attrs}); + #croak "attr_names return array, not scalar" if (! wantarray); return sort keys %{ $self->{attrs} }; } @@ -214,8 +361,8 @@ sub attr { my $self = shift; my $name = shift; - - return $self->{'attrs'}->{ $name }; + return unless (defined($name) && $self->{attrs}); + return $self->{attrs}->{ $name }; } @@ -229,10 +376,11 @@ sub texts { my $self = shift; - confess "texts return array, not scalar" if (! wantarray); - return @{ $self->{dtexts} }; + #confess "texts return array, not scalar" if (! wantarray); + return @{ $self->{dtexts} } if ($self->{dtexts}); } + =head2 cat_texts Return whole text as single scalar. @@ -243,9 +391,10 @@ sub cat_texts { my $self = shift; - return join(' ',@{ $self->{dtexts} }); + return join(' ',@{ $self->{dtexts} }) if ($self->{dtexts}); } + =head2 dump_draft Dump draft data from document object. @@ -259,25 +408,31 @@ my $draft; foreach my $attr_name (sort keys %{ $self->{attrs} }) { - $draft .= $attr_name . '=' . $self->{attrs}->{$attr_name} . "\n"; + next unless defined(my $v = $self->{attrs}->{$attr_name}); + $draft .= $attr_name . '=' . $v . "\n"; } if ($self->{kwords}) { - $draft .= '%%VECTOR'; + $draft .= '%VECTOR'; while (my ($key, $value) = each %{ $self->{kwords} }) { $draft .= "\t$key\t$value"; } $draft .= "\n"; } + if (defined($self->{score}) && $self->{score} >= 0) { + $draft .= "%SCORE\t" . $self->{score} . "\n"; + } + $draft .= "\n"; - $draft .= join("\n", @{ $self->{dtexts} }) . "\n"; - $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n"; + $draft .= join("\n", @{ $self->{dtexts} }) . "\n" if ($self->{dtexts}); + $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n" if ($self->{htexts}); return $draft; } + =head2 delete Empty document object @@ -306,7 +461,7 @@ package Search::Estraier::Condition; -use Carp qw/confess croak/; +use Carp qw/carp confess croak/; use Search::Estraier; our @ISA = qw/Search::Estraier/; @@ -324,9 +479,13 @@ my $self = {}; bless($self, $class); + $self->{max} = -1; + $self->{options} = 0; + $self ? return $self : return undef; } + =head2 set_phrase $cond->set_phrase('search phrase'); @@ -338,6 +497,7 @@ $self->{phrase} = $self->_s( shift ); } + =head2 add_attr $cond->add_attr('@URI STRINC /~dpavlin/'); @@ -350,6 +510,7 @@ push @{ $self->{attrs} }, $self->_s( $attr ); } + =head2 set_order $cond->set_order('@mdate NUMD'); @@ -361,6 +522,7 @@ $self->{order} = shift; } + =head2 set_max $cond->set_max(42); @@ -370,38 +532,81 @@ sub set_max { my $self = shift; my $max = shift; - croak "set_max needs number" unless ($max =~ m/^\d+$/); + croak "set_max needs number, not '$max'" unless ($max =~ m/^\d+$/); $self->{max} = $max; } + =head2 set_options - $cond->set_options( SURE => 1 ); + $cond->set_options( 'SURE' ); + + $cond->set_options( qw/AGITO NOIDF SIMPLE/ ); + +Possible options are: + +=over 8 + +=item SURE + +check every N-gram + +=item USUAL + +check every second N-gram + +=item FAST + +check every third N-gram + +=item AGITO + +check every fourth N-gram + +=item NOIDF + +don't perform TF-IDF tuning + +=item SIMPLE + +use simplified query phrase + +=back + +Skipping N-grams will speed up search, but reduce accuracy. Every call to C will reset previous +options; + +This option changed in version C<0.04> of this module. It's backwards compatibile. =cut my $options = { - # check N-gram keys skipping by three SURE => 1 << 0, - # check N-gram keys skipping by two USUAL => 1 << 1, - # without TF-IDF tuning FAST => 1 << 2, - # with the simplified phrase AGITO => 1 << 3, - # check every N-gram key NOIDF => 1 << 4, - # check N-gram keys skipping by one SIMPLE => 1 << 10, }; sub set_options { my $self = shift; - my $option = shift; - confess "unknown option" unless ($options->{$option}); - $self->{options} ||= $options->{$option}; + my $opt = 0; + foreach my $option (@_) { + my $mask; + unless ($mask = $options->{$option}) { + if ($option eq '1') { + next; + } else { + croak "unknown option $option"; + } + } + $opt += $mask; + } + $self->{options} = $opt; } + =head2 phrase Return search phrase. @@ -416,59 +621,1491 @@ } -package Search::Estraier::Master; +=head2 order + +Return search result order. + + print $cond->order; -use Carp; +=cut + +sub order { + my $self = shift; + return $self->{order}; +} + + +=head2 attrs -=head1 Search::Estraier::Master +Return search result attrs. -Controll node master. This requires user with administration priviledges. + my @cond_attrs = $cond->attrs; =cut -{ - package RequestAgent; - our @ISA = qw(LWP::UserAgent); +sub attrs { + my $self = shift; + #croak "attrs return array, not scalar" if (! wantarray); + return @{ $self->{attrs} } if ($self->{attrs}); +} - sub new { - my $self = LWP::UserAgent::new(@_); - $self->agent("Search-Estraier/$Search::Estraer::VERSION"); - $self; - } - sub get_basic_credentials { - my($self, $realm, $uri) = @_; -# return ($user, $password); +=head2 max + +Return maximum number of results. + + print $cond->max; + +C<-1> is returned for unitialized value, C<0> is unlimited. + +=cut + +sub max { + my $self = shift; + return $self->{max}; +} + + +=head2 options + +Return options for this condition. + + print $cond->options; + +Options are returned in numerical form. + +=cut + +sub options { + my $self = shift; + return $self->{options}; +} + + +=head2 set_skip + +Set number of skipped documents from beginning of results + + $cond->set_skip(42); + +Similar to C in RDBMS. + +=cut + +sub set_skip { + my $self = shift; + $self->{skip} = shift; +} + +=head2 skip + +Return skip for this condition. + + print $cond->skip; + +=cut + +sub skip { + my $self = shift; + return $self->{skip}; +} + + +=head2 set_distinct + + $cond->set_distinct('@author'); + +=cut + +sub set_distinct { + my $self = shift; + $self->{distinct} = shift; +} + +=head2 distinct + +Return distinct attribute + + print $cond->distinct; + +=cut + +sub distinct { + my $self = shift; + return $self->{distinct}; +} + +=head2 set_mask + +Filter out some links when searching. + +Argument array of link numbers, starting with 0 (current node). + + $cond->set_mask(qw/0 1 4/); + +=cut + +sub set_mask { + my $self = shift; + return unless (@_); + $self->{mask} = \@_; +} + + +package Search::Estraier::ResultDocument; + +use Carp qw/croak/; + +#use Search::Estraier; +#our @ISA = qw/Search::Estraier/; + +=head1 Search::Estraier::ResultDocument + +=head2 new + + my $rdoc = new Search::HyperEstraier::ResultDocument( + uri => 'http://localhost/document/uri/42', + attrs => { + foo => 1, + bar => 2, + }, + snippet => 'this is a text of snippet' + keywords => 'this\tare\tkeywords' + ); + +=cut + +sub new { + my $class = shift; + my $self = {@_}; + bless($self, $class); + + croak "missing uri for ResultDocument" unless defined($self->{uri}); + + $self ? return $self : return undef; +} + + +=head2 uri + +Return URI of result document + + print $rdoc->uri; + +=cut + +sub uri { + my $self = shift; + return $self->{uri}; +} + + +=head2 attr_names + +Returns array with attribute names from result document object. + + my @attrs = $rdoc->attr_names; + +=cut + +sub attr_names { + my $self = shift; + croak "attr_names return array, not scalar" if (! wantarray); + return sort keys %{ $self->{attrs} }; +} + + +=head2 attr + +Returns value of an attribute. + + my $value = $rdoc->attr( 'attribute' ); + +=cut + +sub attr { + my $self = shift; + my $name = shift || return; + return $self->{attrs}->{ $name }; +} + + +=head2 snippet + +Return snippet from result document + + print $rdoc->snippet; + +=cut + +sub snippet { + my $self = shift; + return $self->{snippet}; +} + + +=head2 keywords + +Return keywords from result document + + print $rdoc->keywords; + +=cut + +sub keywords { + my $self = shift; + return $self->{keywords}; +} + + +package Search::Estraier::NodeResult; + +use Carp qw/croak/; + +#use Search::Estraier; +#our @ISA = qw/Search::Estraier/; + +=head1 Search::Estraier::NodeResult + +=head2 new + + my $res = new Search::HyperEstraier::NodeResult( + docs => @array_of_rdocs, + hits => %hash_with_hints, + ); + +=cut + +sub new { + my $class = shift; + my $self = {@_}; + bless($self, $class); + + foreach my $f (qw/docs hints/) { + croak "missing $f for ResultDocument" unless defined($self->{$f}); } + + $self ? return $self : return undef; +} + + +=head2 doc_num + +Return number of documents + + print $res->doc_num; + +This will return real number of documents (limited by C). +If you want to get total number of hits, see C. + +=cut + +sub doc_num { + my $self = shift; + return $#{$self->{docs}} + 1; +} + + +=head2 get_doc + +Return single document + + my $doc = $res->get_doc( 42 ); + +Returns undef if document doesn't exist. + +=cut + +sub get_doc { + my $self = shift; + my $num = shift; + croak "expect number as argument, not '$num'" unless ($num =~ m/^\d+$/); + return undef if ($num < 0 || $num > $self->{docs}); + return $self->{docs}->[$num]; +} + + +=head2 hint + +Return specific hint from results. + + print $res->hint( 'VERSION' ); + +Possible hints are: C, C, C, C, C, C, +C