--- trunk/Estraier.pm 2006/01/04 14:48:11 6 +++ trunk/Estraier.pm 2006/01/06 12:48:14 50 @@ -4,22 +4,8 @@ use strict; use warnings; -require Exporter; - -our @ISA = qw(Exporter); - -our %EXPORT_TAGS = ( 'all' => [ qw( -) ] ); - -our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); - -our @EXPORT = qw( -); - our $VERSION = '0.00'; -use Carp; - =head1 NAME Search::Estraier - pure perl module to use Hyper Estraier search engine @@ -41,25 +27,94 @@ =cut +=head1 Inheritable common methods + +This methods should really move somewhere else. + +=head2 _s + +Remove multiple whitespaces from string, as well as whitespaces at beginning or end + + my $text = $self->_s(" this is a text "); + $text = 'this is a text'; + +=cut + +sub _s { + my $text = $_[1] || return; + $text =~ s/\s\s+/ /gs; + $text =~ s/^\s+//; + $text =~ s/\s+$//; + return $text; +} + package Search::Estraier::Document; +use Carp qw/croak confess/; + +use Search::Estraier; +our @ISA = qw/Search::Estraier/; + =head1 Search::Estraier::Document -Document for HyperEstraier +This class implements Document which is collection of attributes +(key=value), vectors (also key value) display text and hidden text. + =head2 new +Create new document, empty or from draft. + my $doc = new Search::HyperEstraier::Document; + my $doc2 = new Search::HyperEstraier::Document( $draft ); =cut sub new { my $class = shift; - my $self = {@_}; + my $self = {}; bless($self, $class); $self->{id} = -1; + my $draft = shift; + + if ($draft) { + my $in_text = 0; + foreach my $line (split(/\n/, $draft)) { + + if ($in_text) { + if ($line =~ /^\t/) { + push @{ $self->{htexts} }, substr($line, 1); + } else { + push @{ $self->{dtexts} }, $line; + } + next; + } + + if ($line =~ m/^%VECTOR\t(.+)$/) { + my @fields = split(/\t/, $1); + for my $i ( 0 .. ($#fields - 1) ) { + $self->{kwords}->{ $fields[ $i ] } = $fields[ $i + 1 ]; + $i++; + } + next; + } elsif ($line =~ m/^%/) { + # What is this? comment? + #warn "$line\n"; + next; + } elsif ($line =~ m/^$/) { + $in_text = 1; + next; + } elsif ($line =~ m/^(.+)=(.+)$/) { + $self->{attrs}->{ $1 } = $2; + next; + } + + warn "draft ignored: $line\n"; + } + } + $self ? return $self : return undef; } @@ -70,7 +125,7 @@ $doc->add_attr( name => 'value' ); -B: delete attribute using +Delete attribute using $doc->add_attr( name => undef ); @@ -81,8 +136,14 @@ my $attrs = {@_}; while (my ($name, $value) = each %{ $attrs }) { - push @{ $self->{attrs}->{_s($name)} }, _s($value); + if (! defined($value)) { + delete( $self->{attrs}->{ $self->_s($name) } ); + } else { + $self->{attrs}->{ $self->_s($name) } = $self->_s($value); + } } + + return 1; } @@ -99,7 +160,7 @@ my $text = shift; return unless defined($text); - push @{ $self->{dtexts} }, _s($text); + push @{ $self->{dtexts} }, $self->_s($text); } @@ -116,9 +177,10 @@ my $text = shift; return unless defined($text); - push @{ $self->{htexts} }, _s($text); + push @{ $self->{htexts} }, $self->_s($text); } + =head2 id Get the ID number of document. If the object has never been registred, C<-1> is returned. @@ -132,89 +194,430 @@ return $self->{id}; } + +=head2 attr_names + +Returns array with attribute names from document object. + + my @attrs = $doc->attr_names; + +=cut + +sub attr_names { + my $self = shift; + croak "attr_names return array, not scalar" if (! wantarray); + return sort keys %{ $self->{attrs} }; +} + + +=head2 attr + +Returns value of an attribute. + + my $value = $doc->attr( 'attribute' ); + +=cut + +sub attr { + my $self = shift; + my $name = shift; + + return $self->{'attrs'}->{ $name }; +} + + +=head2 texts + +Returns array with text sentences. + + my @texts = $doc->texts; + +=cut + +sub texts { + my $self = shift; + confess "texts return array, not scalar" if (! wantarray); + return @{ $self->{dtexts} }; +} + + +=head2 cat_texts + +Return whole text as single scalar. + + my $text = $doc->cat_texts; + +=cut + +sub cat_texts { + my $self = shift; + return join(' ',@{ $self->{dtexts} }); +} + + =head2 dump_draft +Dump draft data from document object. + print $doc->dump_draft; =cut sub dump_draft { + my $self = shift; + my $draft; + + foreach my $attr_name (sort keys %{ $self->{attrs} }) { + $draft .= $attr_name . '=' . $self->{attrs}->{$attr_name} . "\n"; + } + + if ($self->{kwords}) { + $draft .= '%%VECTOR'; + while (my ($key, $value) = each %{ $self->{kwords} }) { + $draft .= "\t$key\t$value"; + } + $draft .= "\n"; + } + + $draft .= "\n"; + + $draft .= join("\n", @{ $self->{dtexts} }) . "\n" if ($self->{dtexts}); + $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n" if ($self->{htexts}); + + return $draft; } + =head2 delete Empty document object $doc->delete; +This function is addition to original Ruby API, and since it was included in C wrappers it's here as a +convinience. Document objects which go out of scope will be destroyed +automatically. + =cut sub delete { my $self = shift; - foreach my $data (qw/attrs dtexts stexts/) { + foreach my $data (qw/attrs dtexts stexts kwords/) { delete($self->{$data}); } + $self->{id} = -1; + return 1; } -=head2 _s -Remove multiple whitespaces from string, as well as whitespaces at beginning or end +package Search::Estraier::Condition; - my $text = _s(" this is a text "); - $text = 'this is a text'; +use Carp qw/confess croak/; + +use Search::Estraier; +our @ISA = qw/Search::Estraier/; + +=head1 Search::Estraier::Condition + +=head2 new + + my $cond = new Search::HyperEstraier::Condition; =cut -sub _s { - my $text = shift || return; - $text =~ s/\s\s+/ /gs; - $text =~ s/^\s+//; - $text =~ s/\s+$//; - return $text; +sub new { + my $class = shift; + my $self = {}; + bless($self, $class); + + $self->{max} = -1; + $self->{options} = 0; + + $self ? return $self : return undef; } +=head2 set_phrase -package Search::Estraier::Master; + $cond->set_phrase('search phrase'); -use Carp; +=cut -=head1 Search::Estraier::Master +sub set_phrase { + my $self = shift; + $self->{phrase} = $self->_s( shift ); +} -Controll node master. This requires user with administration priviledges. + +=head2 add_attr + + $cond->add_attr('@URI STRINC /~dpavlin/'); =cut -{ - package RequestAgent; - @ISA = qw(LWP::UserAgent); +sub add_attr { + my $self = shift; + my $attr = shift || return; + push @{ $self->{attrs} }, $self->_s( $attr ); +} - sub new { - my $self = LWP::UserAgent::new(@_); - $self->agent("Search-Estraier/$Search::Estraer::VERSION"); - $self; - } - sub get_basic_credentials { - my($self, $realm, $uri) = @_; -# return ($user, $password); - } +=head2 set_order + + $cond->set_order('@mdate NUMD'); + +=cut + +sub set_order { + my $self = shift; + $self->{order} = shift; +} + + +=head2 set_max + + $cond->set_max(42); + +=cut + +sub set_max { + my $self = shift; + my $max = shift; + croak "set_max needs number, not '$max'" unless ($max =~ m/^\d+$/); + $self->{max} = $max; +} + + +=head2 set_options + + $cond->set_options( SURE => 1 ); + +=cut + +my $options = { + # check N-gram keys skipping by three + SURE => 1 << 0, + # check N-gram keys skipping by two + USUAL => 1 << 1, + # without TF-IDF tuning + FAST => 1 << 2, + # with the simplified phrase + AGITO => 1 << 3, + # check every N-gram key + NOIDF => 1 << 4, + # check N-gram keys skipping by one + SIMPLE => 1 << 10, +}; + +sub set_options { + my $self = shift; + my $option = shift; + confess "unknown option" unless ($options->{$option}); + $self->{options} ||= $options->{$option}; +} + + +=head2 phrase + +Return search phrase. + + print $cond->phrase; + +=cut + +sub phrase { + my $self = shift; + return $self->{phrase}; +} + + +=head2 order + +Return search result order. + + print $cond->order; + +=cut + +sub order { + my $self = shift; + return $self->{order}; +} + + +=head2 attrs + +Return search result attrs. + + my @cond_attrs = $cond->attrs; + +=cut + +sub attrs { + my $self = shift; + #croak "attrs return array, not scalar" if (! wantarray); + return @{ $self->{attrs} }; +} + + +=head2 max + +Return maximum number of results. + + print $cond->max; + +C<-1> is returned for unitialized value, C<0> is unlimited. + +=cut + +sub max { + my $self = shift; + return $self->{max}; +} + + +=head2 options + +Return options for this condition. + + print $cond->options; + +Options are returned in numerical form. + +=cut + +sub options { + my $self = shift; + return $self->{options}; } +package Search::Estraier::ResultDocument; + +use Carp qw/croak/; + +#use Search::Estraier; +#our @ISA = qw/Search::Estraier/; + +=head1 Search::Estraier::ResultDocument =head2 new -Create new connection to node master. + my $rdoc = new Search::HyperEstraier::ResultDocument( + uri => 'http://localhost/document/uri/42', + attrs => { + foo => 1, + bar => 2, + }, + snippet => 'this is a text of snippet' + keywords => 'this\tare\tkeywords' + ); + +=cut - my $master = new Search::Estraier::Master( - url => 'http://localhost:1978', - user => 'admin', - passwd => 'admin', +sub new { + my $class = shift; + my $self = {@_}; + bless($self, $class); + + foreach my $f (qw/uri attrs snippet keywords/) { + croak "missing $f for ResultDocument" unless defined($self->{$f}); + } + + $self ? return $self : return undef; +} + + +=head2 uri + +Return URI of result document + + print $rdoc->uri; + +=cut + +sub uri { + my $self = shift; + return $self->{uri}; +} + + +=head2 attr_names + +Returns array with attribute names from result document object. + + my @attrs = $rdoc->attr_names; + +=cut + +sub attr_names { + my $self = shift; + croak "attr_names return array, not scalar" if (! wantarray); + return sort keys %{ $self->{attrs} }; +} + + +=head2 attr + +Returns value of an attribute. + + my $value = $rdoc->attr( 'attribute' ); + +=cut + +sub attr { + my $self = shift; + my $name = shift || return; + return $self->{attrs}->{ $name }; +} + + +=head2 snippet + +Return snippet from result document + + print $rdoc->snippet; + +=cut + +sub snippet { + my $self = shift; + return $self->{snippet}; +} + + +=head2 keywords + +Return keywords from result document + + print $rdoc->keywords; + +=cut + +sub keywords { + my $self = shift; + return $self->{keywords}; +} + + +package Search::Estraier::NodeResult; + +use Carp qw/croak/; + +#use Search::Estraier; +#our @ISA = qw/Search::Estraier/; + +=head1 Search::Estraier::NodeResult + +=head2 new + + my $res = new Search::HyperEstraier::NodeResult( + docs => @array_of_rdocs, + hits => %hash_with_hints, ); =cut @@ -224,14 +627,673 @@ my $self = {@_}; bless($self, $class); - foreach my $p (qw/url user passwd/) { - croak "need $p" unless ($self->{$p}); + foreach my $f (qw/docs hints/) { + croak "missing $f for ResultDocument" unless defined($self->{$f}); } $self ? return $self : return undef; } +=head2 doc_num + +Return number of documents + + print $res->doc_num; + +=cut + +sub doc_num { + my $self = shift; + return $#{$self->{docs}}; +} + + +=head2 get_doc + +Return single document + + my $doc = $res->get_doc( 42 ); + +Returns undef if document doesn't exist. + +=cut + +sub get_doc { + my $self = shift; + my $num = shift; + croak "expect number as argument, not '$num'" unless ($num =~ m/^\d+$/); + return undef if ($num < 0 || $num > $self->{docs}); + return $self->{docs}->[$num]; +} + + +=head2 hint + +Return specific hint from results. + + print $rec->hint( 'VERSION' ); + +Possible hints are: C, C, C, C, C, C, +C