--- trunk/Estraier.pm 2006/01/04 15:28:39 9 +++ trunk/Estraier.pm 2006/01/05 15:33:48 30 @@ -4,22 +4,8 @@ use strict; use warnings; -require Exporter; - -our @ISA = qw(Exporter); - -our %EXPORT_TAGS = ( 'all' => [ qw( -) ] ); - -our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); - -our @EXPORT = qw( -); - our $VERSION = '0.00'; -use Carp; - =head1 NAME Search::Estraier - pure perl module to use Hyper Estraier search engine @@ -41,27 +27,89 @@ =cut +=head2 _s + +Remove multiple whitespaces from string, as well as whitespaces at beginning or end + + my $text = $self->_s(" this is a text "); + $text = 'this is a text'; + +=cut + +sub _s { + my $text = $_[1] || return; + $text =~ s/\s\s+/ /gs; + $text =~ s/^\s+//; + $text =~ s/\s+$//; + return $text; +} + package Search::Estraier::Document; use Carp qw/croak confess/; +use Search::Estraier; +our @ISA = qw/Search::Estraier/; + =head1 Search::Estraier::Document -Document for HyperEstraier +This class implements Document which is collection of attributes +(key=value), vectors (also key value) display text and hidden text. =head2 new +Create new document, empty or from draft. + my $doc = new Search::HyperEstraier::Document; + my $doc2 = new Search::HyperEstraier::Document( $draft ); =cut sub new { my $class = shift; - my $self = {@_}; + my $self = {}; bless($self, $class); $self->{id} = -1; + my $draft = shift; + + if ($draft) { + my $in_text = 0; + foreach my $line (split(/\n/, $draft)) { + + if ($in_text) { + if ($line =~ /^\t/) { + push @{ $self->{htexts} }, substr($line, 1); + } else { + push @{ $self->{dtexts} }, $line; + } + next; + } + + if ($line =~ m/^%VECTOR\t(.+)$/) { + my @fields = split(/\t/, $1); + for my $i ( 0 .. ($#fields - 1) ) { + $self->{kwords}->{ $fields[ $i ] } = $fields[ $i + 1 ]; + $i++; + } + next; + } elsif ($line =~ m/^%/) { + # What is this? comment? 
+ #warn "$line\n"; + next; + } elsif ($line =~ m/^$/) { + $in_text = 1; + next; + } elsif ($line =~ m/^(.+)=(.+)$/) { + $self->{attrs}->{ $1 } = $2; + next; + } + + warn "draft ignored: $line\n"; + } + } + $self ? return $self : return undef; } @@ -84,9 +132,9 @@ while (my ($name, $value) = each %{ $attrs }) { if (! defined($value)) { - delete( $self->{attrs}->{_s($name)} ); + delete( $self->{attrs}->{ $self->_s($name) } ); } else { - $self->{attrs}->{_s($name)} = _s($value); + $self->{attrs}->{ $self->_s($name) } = $self->_s($value); } } @@ -107,7 +155,7 @@ my $text = shift; return unless defined($text); - push @{ $self->{dtexts} }, _s($text); + push @{ $self->{dtexts} }, $self->_s($text); } @@ -124,7 +172,7 @@ my $text = shift; return unless defined($text); - push @{ $self->{htexts} }, _s($text); + push @{ $self->{htexts} }, $self->_s($text); } =head2 id @@ -181,18 +229,53 @@ sub texts { my $self = shift; - confess "attr_names return array, not scalar" if (! wantarray); - return $self->{dtexts}; + confess "texts return array, not scalar" if (! wantarray); + return @{ $self->{dtexts} }; +} + +=head2 cat_texts + +Return whole text as single scalar. + + my $text = $doc->cat_texts; + +=cut + +sub cat_texts { + my $self = shift; + return join(' ',@{ $self->{dtexts} }); } =head2 dump_draft +Dump draft data from document object. + print $doc->dump_draft; =cut sub dump_draft { - return 'FIXME'; + my $self = shift; + my $draft; + + foreach my $attr_name (sort keys %{ $self->{attrs} }) { + $draft .= $attr_name . '=' . $self->{attrs}->{$attr_name} . "\n"; + } + + if ($self->{kwords}) { + $draft .= '%%VECTOR'; + while (my ($key, $value) = each %{ $self->{kwords} }) { + $draft .= "\t$key\t$value"; + } + $draft .= "\n"; + } + + $draft .= "\n"; + + $draft .= join("\n", @{ $self->{dtexts} }) . "\n"; + $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . 
"\n"; + + return $draft; } =head2 delete @@ -201,37 +284,454 @@ $doc->delete; +This function is addition to original Ruby API, and since it was included in C wrappers it's here as a +convinience. Document objects which go out of scope will be destroyed +automatically. + =cut sub delete { my $self = shift; - foreach my $data (qw/attrs dtexts stexts/) { + foreach my $data (qw/attrs dtexts stexts kwords/) { delete($self->{$data}); } + $self->{id} = -1; + return 1; } -=head2 _s -Remove multiple whitespaces from string, as well as whitespaces at beginning or end +package Search::Estraier::Condition; - my $text = _s(" this is a text "); - $text = 'this is a text'; +use Carp qw/confess croak/; + +use Search::Estraier; +our @ISA = qw/Search::Estraier/; + +=head1 Search::Estraier::Condition + +=head2 new + + my $cond = new Search::HyperEstraier::Condition; =cut -sub _s { - my $text = shift || return; - $text =~ s/\s\s+/ /gs; - $text =~ s/^\s+//; - $text =~ s/\s+$//; - return $text; +sub new { + my $class = shift; + my $self = {}; + bless($self, $class); + + $self->{max} = -1; + $self->{options} = 0; + + $self ? 
return $self : return undef; +} + +=head2 set_phrase + + $cond->set_phrase('search phrase'); + +=cut + +sub set_phrase { + my $self = shift; + $self->{phrase} = $self->_s( shift ); +} + +=head2 add_attr + + $cond->add_attr('@URI STRINC /~dpavlin/'); + +=cut + +sub add_attr { + my $self = shift; + my $attr = shift || return; + push @{ $self->{attrs} }, $self->_s( $attr ); +} + +=head2 set_order + + $cond->set_order('@mdate NUMD'); + +=cut + +sub set_order { + my $self = shift; + $self->{order} = shift; +} + +=head2 set_max + + $cond->set_max(42); + +=cut + +sub set_max { + my $self = shift; + my $max = shift; + croak "set_max needs number" unless ($max =~ m/^\d+$/); + $self->{max} = $max; +} + +=head2 set_options + + $cond->set_options( SURE => 1 ); + +=cut + +my $options = { + # check N-gram keys skipping by three + SURE => 1 << 0, + # check N-gram keys skipping by two + USUAL => 1 << 1, + # without TF-IDF tuning + FAST => 1 << 2, + # with the simplified phrase + AGITO => 1 << 3, + # check every N-gram key + NOIDF => 1 << 4, + # check N-gram keys skipping by one + SIMPLE => 1 << 10, +}; + +sub set_options { + my $self = shift; + my $option = shift; + confess "unknown option" unless ($options->{$option}); + $self->{options} ||= $options->{$option}; +} + +=head2 phrase + +Return search phrase. + + print $cond->phrase; + +=cut + +sub phrase { + my $self = shift; + return $self->{phrase}; +} + +=head2 order + +Return search result order. + + print $cond->order; + +=cut + +sub order { + my $self = shift; + return $self->{order}; +} + +=head2 attrs + +Return search result attrs. + + my @cond_attrs = $cond->attrs; + +=cut + +sub attrs { + my $self = shift; + #croak "attrs return array, not scalar" if (! wantarray); + return @{ $self->{attrs} }; +} + +=head2 max + +Return maximum number of results. + + print $cond->max; + +C<-1> is returned for unitialized value, C<0> is unlimited. 

=cut

sub max {
    my ($self) = @_;
    return $self->{max};
}

=head2 options

Return options for this condition.

 print $cond->options;

Options are returned in numerical form.

=cut

sub options {
    my ($self) = @_;
    return $self->{options};
}


package Search::Estraier::ResultDocument;

use Carp qw/croak/;

#use Search::Estraier;
#our @ISA = qw/Search::Estraier/;

=head1 Search::Estraier::ResultDocument

=head2 new

Create a result document from named arguments. C<uri>, C<attrs>, C<snippet>
and C<keywords> must all be defined, otherwise the constructor croaks.

  my $rdoc = Search::Estraier::ResultDocument->new(
      uri => 'http://localhost/document/uri/42',
      attrs => {
          foo => 1,
          bar => 2,
      },
      snippet => 'this is a text of snippet',
      keywords => "this\tare\tkeywords",
  );

=cut

sub new {
    my $class = shift;
    my $self = bless {@_}, $class;

    # All four fields are mandatory; fail loudly in the caller's frame.
    foreach my $f (qw/uri attrs snippet keywords/) {
        croak "missing $f for ResultDocument" unless defined($self->{$f});
    }

    # A blessed reference is always true, so no conditional is needed here.
    return $self;
}

=head2 uri

Return URI of result document

 print $rdoc->uri;

=cut

sub uri {
    my ($self) = @_;
    return $self->{uri};
}


=head2 attr_names

Returns array with attribute names from result document object.

 my @attrs = $rdoc->attr_names;

The names are sorted. Croaks when called in scalar or void context.

=cut

sub attr_names {
    my ($self) = @_;
    croak "attr_names return array, not scalar" if (! wantarray);
    return sort keys %{ $self->{attrs} };
}

=head2 attr

Returns value of an attribute.

 my $value = $rdoc->attr( 'attribute' );

Returns nothing when no attribute name is given.

=cut

sub attr {
    my ($self, $name) = @_;

    # Test definedness, not truth, so an attribute named "0" can be read.
    return unless defined $name;
    return $self->{attrs}->{ $name };
}

=head2 snippet

Return snippet from result document

 print $rdoc->snippet;

=cut

sub snippet {
    my ($self) = @_;
    return $self->{snippet};
}

=head2 keywords

Return keywords from result document

 print $rdoc->keywords;

=cut

sub keywords {
    my ($self) = @_;
    return $self->{keywords};
}


package Search::Estraier::NodeResult;

use Carp qw/croak/;

#use Search::Estraier;
#our @ISA = qw/Search::Estraier/;

=head1 Search::Estraier::NodeResult

=head2 new

Create a node result. Both C<docs> (array reference) and C<hints> must be
defined, otherwise the constructor croaks.

  my $res = Search::Estraier::NodeResult->new(
      docs => \@array_of_rdocs,
      hints => \%hash_with_hints,
  );

=cut

sub new {
    my $class = shift;
    my $self = bless {@_}, $class;

    foreach my $f (qw/docs hints/) {
        croak "missing $f for NodeResult" unless defined($self->{$f});
    }

    # A blessed reference is always true, so no conditional is needed here.
    return $self;
}

=head2 doc_num

Return number of documents

 print $res->doc_num;

=cut

sub doc_num {
    my ($self) = @_;

    # Element count, not the last index ($#array), which is one less.
    return scalar @{ $self->{docs} };
}

=head2 get_doc

Return single document

 my $doc = $res->get_doc( 42 );

The index is zero-based. Croaks unless the argument is a non-negative
integer. Returns undef if document doesn't exist.

=cut

sub get_doc {
    my ($self, $num) = @_;

    # Check definedness first so a missing argument croaks without an
    # "uninitialized value" warning.
    croak "expect number as argument" unless (defined($num) && $num =~ m/^\d+$/);

    # Compare against the last valid index, not the array reference itself.
    # The explicit undef is kept so list-context callers still get one element.
    return undef if ($num > $#{ $self->{docs} });
    return $self->{docs}->[$num];
}

=head2 hint

Return specific hint from results.

 print $rec->hint( 'VERSION' );

Possible hints are: C, C, C, C, C, C,
C