--- trunk/Estraier.pm 2006/01/05 15:33:48 30 +++ trunk/Estraier.pm 2006/01/07 16:19:31 63 @@ -4,7 +4,7 @@ use strict; use warnings; -our $VERSION = '0.00'; +our $VERSION = '0.01'; =head1 NAME @@ -27,6 +27,10 @@ =cut +=head1 Inheritable common methods + +This methods should really move somewhere else. + =head2 _s Remove multiple whitespaces from string, as well as whitespaces at beginning or end @@ -56,6 +60,7 @@ This class implements Document which is collection of attributes (key=value), vectors (also key value) display text and hidden text. + =head2 new Create new document, empty or from draft. @@ -175,6 +180,7 @@ push @{ $self->{htexts} }, $self->_s($text); } + =head2 id Get the ID number of document. If the object has never been registred, C<-1> is returned. @@ -188,6 +194,7 @@ return $self->{id}; } + =head2 attr_names Returns array with attribute names from document object. @@ -198,7 +205,8 @@ sub attr_names { my $self = shift; - croak "attr_names return array, not scalar" if (! wantarray); + return unless ($self->{attrs}); + #croak "attr_names return array, not scalar" if (! wantarray); return sort keys %{ $self->{attrs} }; } @@ -214,8 +222,8 @@ sub attr { my $self = shift; my $name = shift; - - return $self->{'attrs'}->{ $name }; + return unless (defined($name) && $self->{attrs}); + return $self->{attrs}->{ $name }; } @@ -229,10 +237,11 @@ sub texts { my $self = shift; - confess "texts return array, not scalar" if (! wantarray); - return @{ $self->{dtexts} }; + #confess "texts return array, not scalar" if (! wantarray); + return @{ $self->{dtexts} } if ($self->{dtexts}); } + =head2 cat_texts Return whole text as single scalar. @@ -243,9 +252,10 @@ sub cat_texts { my $self = shift; - return join(' ',@{ $self->{dtexts} }); + return join(' ',@{ $self->{dtexts} }) if ($self->{dtexts}); } + =head2 dump_draft Dump draft data from document object. @@ -272,12 +282,13 @@ $draft .= "\n"; - $draft .= join("\n", @{ $self->{dtexts} }) . "\n"; - $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n"; + $draft .= join("\n", @{ $self->{dtexts} }) . "\n" if ($self->{dtexts}); + $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n" if ($self->{htexts}); return $draft; } + =head2 delete Empty document object @@ -330,6 +341,7 @@ $self ? return $self : return undef; } + =head2 set_phrase $cond->set_phrase('search phrase'); @@ -341,6 +353,7 @@ $self->{phrase} = $self->_s( shift ); } + =head2 add_attr $cond->add_attr('@URI STRINC /~dpavlin/'); @@ -353,6 +366,7 @@ push @{ $self->{attrs} }, $self->_s( $attr ); } + =head2 set_order $cond->set_order('@mdate NUMD'); @@ -364,6 +378,7 @@ $self->{order} = shift; } + =head2 set_max $cond->set_max(42); @@ -373,10 +388,11 @@ sub set_max { my $self = shift; my $max = shift; - croak "set_max needs number" unless ($max =~ m/^\d+$/); + croak "set_max needs number, not '$max'" unless ($max =~ m/^\d+$/); $self->{max} = $max; } + =head2 set_options $cond->set_options( SURE => 1 ); @@ -405,6 +421,7 @@ $self->{options} ||= $options->{$option}; } + =head2 phrase Return search phrase. @@ -418,6 +435,7 @@ return $self->{phrase}; } + =head2 order Return search result order. @@ -431,6 +449,7 @@ return $self->{order}; } + =head2 attrs Return search result attrs. @@ -442,9 +461,10 @@ sub attrs { my $self = shift; #croak "attrs return array, not scalar" if (! wantarray); - return @{ $self->{attrs} }; + return @{ $self->{attrs} } if ($self->{attrs}); } + =head2 max Return maximum number of results. @@ -460,6 +480,7 @@ return $self->{max}; } + =head2 options Return options for this condition. @@ -504,13 +525,12 @@ my $self = {@_}; bless($self, $class); - foreach my $f (qw/uri attrs snippet keywords/) { - croak "missing $f for ResultDocument" unless defined($self->{$f}); - } + croak "missing uri for ResultDocument" unless defined($self->{uri}); $self ? return $self : return undef; } + =head2 uri Return URI of result document @@ -539,6 +559,7 @@ return sort keys %{ $self->{attrs} }; } + =head2 attr Returns value of an attribute. @@ -553,6 +574,7 @@ return $self->{attrs}->{ $name }; } + =head2 snippet Return snippet from result document @@ -566,6 +588,7 @@ return $self->{snippet}; } + =head2 keywords Return keywords from result document @@ -610,6 +633,7 @@ $self ? return $self : return undef; } + =head2 doc_num Return number of documents @@ -620,9 +644,10 @@ sub doc_num { my $self = shift; - return $#{$self->{docs}}; + return $#{$self->{docs}} + 1; } + =head2 get_doc Return single document @@ -636,11 +661,12 @@ sub get_doc { my $self = shift; my $num = shift; - croak "expect number as argument" unless ($num =~ m/^\d+$/); + croak "expect number as argument, not '$num'" unless ($num =~ m/^\d+$/); return undef if ($num < 0 || $num > $self->{docs}); return $self->{docs}->[$num]; } + =head2 hint Return specific hint from results. @@ -661,7 +687,11 @@ package Search::Estraier::Node; -use Carp qw/croak/; +use Carp qw/carp croak confess/; +use URI; +use MIME::Base64; +use IO::Socket::INET; +use URI::Escape qw/uri_escape/; =head1 Search::Estraier::Node @@ -675,7 +705,7 @@ my $class = shift; my $self = { pxport => -1, - timeout => -1, + timeout => 0, # this used to be -1 dnum => -1, wnum => -1, size => -1.0, @@ -686,9 +716,15 @@ }; bless($self, $class); + my $args = {@_}; + + $self->{debug} = $args->{debug}; + warn "## Node debug on\n" if ($self->{debug}); + $self ? return $self : return undef; } + =head2 set_url Specify URL to node server @@ -702,6 +738,7 @@ $self->{url} = shift; } + =head2 set_proxy Specify proxy server to connect to node server @@ -713,11 +750,12 @@ sub set_proxy { my $self = shift; my ($host,$port) = @_; - croak "proxy port must be number" unless ($port =~ m/^\d+$/); + croak "proxy port must be number, not '$port'" unless ($port =~ m/^\d+$/); $self->{pxhost} = $host; $self->{pxport} = $port; } + =head2 set_timeout Specify timeout of connection in seconds @@ -729,63 +767,756 @@ sub set_timeout { my $self = shift; my $sec = shift; - croak "timeout must be number" unless ($sec =~ m/^\d+$/); + croak "timeout must be number, not '$sec'" unless ($sec =~ m/^\d+$/); $self->{timeout} = $sec; } -package Search::Estraier::Master; -use Carp; +=head2 set_auth + +Specify name and password for authentication to node server. + + $node->set_auth('clint','eastwood'); + +=cut + +sub set_auth { + my $self = shift; + my ($login,$passwd) = @_; + my $basic_auth = encode_base64( "$login:$passwd" ); + chomp($basic_auth); + $self->{auth} = $basic_auth; +} + + +=head2 status + +Return status code of last request. + + print $node->status; + +C<-1> means connection failure. + +=cut + +sub status { + my $self = shift; + return $self->{status}; +} + + +=head2 put_doc + +Add a document + + $node->put_doc( $document_draft ) or die "can't add document"; + +Return true on success or false on failture. + +=cut + +sub put_doc { + my $self = shift; + my $doc = shift || return; + return unless ($self->{url} && $doc->isa('Search::Estraier::Document')); + $self->shuttle_url( $self->{url} . '/put_doc', + 'text/x-estraier-draft', + $doc->dump_draft, + undef + ) == 200; +} + + +=head2 out_doc + +Remove a document + + $node->out_doc( document_id ) or "can't remove document"; + +Return true on success or false on failture. + +=cut + +sub out_doc { + my $self = shift; + my $id = shift || return; + return unless ($self->{url}); + croak "id must be number, not '$id'" unless ($id =~ m/^\d+$/); + $self->shuttle_url( $self->{url} . '/out_doc', + 'application/x-www-form-urlencoded', + "id=$id", + undef + ) == 200; +} + + +=head2 out_doc_by_uri + +Remove a registrated document using it's uri + + $node->out_doc_by_uri( 'file:///document/uri/42' ) or "can't remove document"; + +Return true on success or false on failture. + +=cut + +sub out_doc_by_uri { + my $self = shift; + my $uri = shift || return; + return unless ($self->{url}); + $self->shuttle_url( $self->{url} . '/out_doc', + 'application/x-www-form-urlencoded', + "uri=" . uri_escape($uri), + undef + ) == 200; +} + + +=head2 edit_doc + +Edit attributes of a document + + $node->edit_doc( $document_draft ) or die "can't edit document"; + +Return true on success or false on failture. + +=cut + +sub edit_doc { + my $self = shift; + my $doc = shift || return; + return unless ($self->{url} && $doc->isa('Search::Estraier::Document')); + $self->shuttle_url( $self->{url} . '/edit_doc', + 'text/x-estraier-draft', + $doc->dump_draft, + undef + ) == 200; +} + + +=head2 get_doc + +Retreive document -=head1 Search::Estraier::Master + my $doc = $node->get_doc( document_id ) or die "can't get document"; -Controll node master. This requires user with administration priviledges. +Return true on success or false on failture. =cut -{ - package RequestAgent; - our @ISA = qw(LWP::UserAgent); +sub get_doc { + my $self = shift; + my $id = shift || return; + return $self->_fetch_doc( id => $id ); +} + + +=head2 get_doc_by_uri + +Retreive document + + my $doc = $node->get_doc_by_uri( 'file:///document/uri/42' ) or die "can't get document"; + +Return true on success or false on failture. + +=cut + +sub get_doc_by_uri { + my $self = shift; + my $uri = shift || return; + return $self->_fetch_doc( uri => $uri ); +} + + +=head2 get_doc_attr + +Retrieve the value of an atribute from object + + my $val = $node->get_doc_attr( document_id, 'attribute_name' ) or + die "can't get document attribute"; + +=cut + +sub get_doc_attr { + my $self = shift; + my ($id,$name) = @_; + return unless ($id && $name); + return $self->_fetch_doc( id => $id, attr => $name ); +} + + +=head2 get_doc_attr_by_uri + +Retrieve the value of an atribute from object + + my $val = $node->get_doc_attr_by_uri( document_id, 'attribute_name' ) or + die "can't get document attribute"; + +=cut + +sub get_doc_attr_by_uri { + my $self = shift; + my ($uri,$name) = @_; + return unless ($uri && $name); + return $self->_fetch_doc( uri => $uri, attr => $name ); +} + + +=head2 etch_doc + +Exctract document keywords + + my $keywords = $node->etch_doc( document_id ) or die "can't etch document"; + +=cut - sub new { - my $self = LWP::UserAgent::new(@_); - $self->agent("Search-Estraier/$Search::Estraer::VERSION"); - $self; +sub etch_doc { + my $self = shift; + my $id = shift || return; + return $self->_fetch_doc( id => $id, etch => 1 ); +} + +=head2 etch_doc_by_uri + +Retreive document + + my $keywords = $node->etch_doc_by_uri( 'file:///document/uri/42' ) or die "can't etch document"; + +Return true on success or false on failture. + +=cut + +sub etch_doc_by_uri { + my $self = shift; + my $uri = shift || return; + return $self->_fetch_doc( uri => $uri, etch => 1 ); +} + + +=head2 uri_to_id + +Get ID of document specified by URI + + my $id = $node->uri_to_id( 'file:///document/uri/42' ); + +=cut + +sub uri_to_id { + my $self = shift; + my $uri = shift || return; + return $self->_fetch_doc( uri => $uri, path => '/uri_to_id', chomp_resbody => 1 ); +} + + +=head2 _fetch_doc + +Private function used for implementing of C, C, +C, C. + + # this will decode received draft into Search::Estraier::Document object + my $doc = $node->_fetch_doc( id => 42 ); + my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42' ); + + # to extract keywords, add etch + my $doc = $node->_fetch_doc( id => 42, etch => 1 ); + my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', etch => 1 ); + + # to get document attrubute add attr + my $doc = $node->_fetch_doc( id => 42, attr => '@mdate' ); + my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', attr => '@mdate' ); + + # more general form which allows implementation of + # uri_to_id + my $id = $node->_fetch_doc( + uri => 'file:///document/uri/42', + path => '/uri_to_id', + chomp_resbody => 1 + ); + +=cut + +sub _fetch_doc { + my $self = shift; + my $a = {@_}; + return unless ( ($a->{id} || $a->{uri}) && $self->{url} ); + + my ($arg, $resbody); + + my $path = $a->{path} || '/get_doc'; + $path = '/etch_doc' if ($a->{etch}); + + if ($a->{id}) { + croak "id must be numberm not '$a->{id}'" unless ($a->{id} =~ m/^\d+$/); + $arg = 'id=' . $a->{id}; + } elsif ($a->{uri}) { + $arg = 'uri=' . uri_escape($a->{uri}); + } else { + confess "unhandled argument. Need id or uri."; + } + + if ($a->{attr}) { + $path = '/get_doc_attr'; + $arg .= '&attr=' . uri_escape($a->{attr}); + $a->{chomp_resbody} = 1; } - sub get_basic_credentials { - my($self, $realm, $uri) = @_; -# return ($user, $password); + my $rv = $self->shuttle_url( $self->{url} . $path, + 'application/x-www-form-urlencoded', + $arg, + \$resbody, + ); + + return if ($rv != 200); + + if ($a->{etch}) { + $self->{kwords} = {}; + return +{} unless ($resbody); + foreach my $l (split(/\n/, $resbody)) { + my ($k,$v) = split(/\t/, $l, 2); + $self->{kwords}->{$k} = $v if ($v); + } + return $self->{kwords}; + } elsif ($a->{chomp_resbody}) { + return unless (defined($resbody)); + chomp($resbody); + return $resbody; + } else { + return new Search::Estraier::Document($resbody); } } +=head2 name -=head2 new + my $node_name = $node->name; -Create new connection to node master. +=cut - my $master = new Search::Estraier::Master( - url => 'http://localhost:1978', - user => 'admin', - passwd => 'admin', - ); +sub name { + my $self = shift; + $self->_set_info unless ($self->{name}); + return $self->{name}; +} + + +=head2 label + + my $node_label = $node->label; =cut -sub new { - my $class = shift; - my $self = {@_}; - bless($self, $class); +sub label { + my $self = shift; + $self->_set_info unless ($self->{label}); + return $self->{label}; +} + + +=head2 doc_num + + my $documents_in_node = $node->doc_num; + +=cut + +sub doc_num { + my $self = shift; + $self->_set_info if ($self->{dnum} < 0); + return $self->{dnum}; +} + + +=head2 word_num + + my $words_in_node = $node->word_num; + +=cut + +sub word_num { + my $self = shift; + $self->_set_info if ($self->{wnum} < 0); + return $self->{wnum}; +} + + +=head2 size + + my $node_size = $node->size; + +=cut + +sub size { + my $self = shift; + $self->_set_info if ($self->{size} < 0); + return $self->{size}; +} + + +=head2 search + +Search documents which match condition + + my $nres = $node->search( $cond, $depth ); + +C<$cond> is C object, while <$depth> specifies +depth for meta search. + +Function results C object. + +=cut + +sub search { + my $self = shift; + my ($cond, $depth) = @_; + return unless ($cond && defined($depth) && $self->{url}); + croak "cond mush be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition')); + croak "depth needs number, not '$depth'" unless ($depth =~ m/^\d+$/); + + my $resbody; + + my $rv = $self->shuttle_url( $self->{url} . '/search', + 'application/x-www-form-urlencoded', + $self->cond_to_query( $cond, $depth ), + \$resbody, + ); + return if ($rv != 200); - foreach my $p (qw/url user passwd/) { - croak "need $p" unless ($self->{$p}); + my (@docs, $hints); + + my @lines = split(/\n/, $resbody); + return unless (@lines); + + my $border = $lines[0]; + my $isend = 0; + my $lnum = 1; + + while ( $lnum <= $#lines ) { + my $line = $lines[$lnum]; + $lnum++; + + #warn "## $line\n"; + if ($line && $line =~ m/^\Q$border\E(:END)*$/) { + $isend = $1; + last; + } + + if ($line =~ /\t/) { + my ($k,$v) = split(/\t/, $line, 2); + $hints->{$k} = $v; + } } - $self ? return $self : return undef; + my $snum = $lnum; + + while( ! $isend && $lnum <= $#lines ) { + my $line = $lines[$lnum]; + #warn "# $lnum: $line\n"; + $lnum++; + + if ($line && $line =~ m/^\Q$border\E/) { + if ($lnum > $snum) { + my $rdattrs; + my $rdvector; + my $rdsnippet; + + my $rlnum = $snum; + while ($rlnum < $lnum - 1 ) { + #my $rdline = $self->_s($lines[$rlnum]); + my $rdline = $lines[$rlnum]; + $rlnum++; + last unless ($rdline); + if ($rdline =~ /^%/) { + $rdvector = $1 if ($rdline =~ /^%VECTOR\t(.+)$/); + } elsif($rdline =~ /=/) { + $rdattrs->{$1} = $2 if ($rdline =~ /^(.+)=(.+)$/); + } else { + confess "invalid format of response"; + } + } + while($rlnum < $lnum - 1) { + my $rdline = $lines[$rlnum]; + $rlnum++; + $rdsnippet .= "$rdline\n"; + } + #warn Dumper($rdvector, $rdattrs, $rdsnippet); + if (my $rduri = $rdattrs->{'@uri'}) { + push @docs, new Search::Estraier::ResultDocument( + uri => $rduri, + attrs => $rdattrs, + snippet => $rdsnippet, + keywords => $rdvector, + ); + } + } + $snum = $lnum; + #warn "### $line\n"; + $isend = 1 if ($line =~ /:END$/); + } + + } + + if (! $isend) { + warn "received result doesn't have :END\n$resbody"; + return; + } + + #warn Dumper(\@docs, $hints); + + return new Search::Estraier::NodeResult( docs => \@docs, hints => $hints ); +} + + +=head2 cond_to_query + +Return URI encoded string generated from Search::Estraier::Condition + + my $args = $node->cond_to_query( $cond, $depth ); + +=cut + +sub cond_to_query { + my $self = shift; + + my $cond = shift || return; + croak "condition must be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition')); + my $depth = shift; + + my @args; + + if (my $phrase = $cond->phrase) { + push @args, 'phrase=' . uri_escape($phrase); + } + + if (my @attrs = $cond->attrs) { + for my $i ( 0 .. $#attrs ) { + push @args,'attr' . ($i+1) . '=' . uri_escape( $attrs[$i] ) if ($attrs[$i]); + } + } + + if (my $order = $cond->order) { + push @args, 'order=' . uri_escape($order); + } + + if (my $max = $cond->max) { + push @args, 'max=' . $max; + } else { + push @args, 'max=' . (1 << 30); + } + + if (my $options = $cond->options) { + push @args, 'options=' . $options; + } + + push @args, 'depth=' . $depth if ($depth); + push @args, 'wwidth=' . $self->{wwidth}; + push @args, 'hwidth=' . $self->{hwidth}; + push @args, 'awidth=' . $self->{awidth}; + + return join('&', @args); +} + + +=head2 shuttle_url + +This is method which uses C to communicate with Hyper Estraier node +master. + + my $rv = shuttle_url( $url, $content_type, $req_body, \$resbody ); + +C<$resheads> and C<$resbody> booleans controll if response headers and/or response +body will be saved within object. + +=cut + +use LWP::UserAgent; + +sub shuttle_url { + my $self = shift; + + my ($url, $content_type, $reqbody, $resbody) = @_; + + $self->{status} = -1; + + warn "## $url\n" if ($self->{debug}); + + $url = new URI($url); + if ( + !$url || !$url->scheme || !$url->scheme eq 'http' || + !$url->host || !$url->port || $url->port < 1 + ) { + carp "can't parse $url\n"; + return -1; + } + + my $ua = LWP::UserAgent->new; + $ua->agent( "Search-Estraier/$Search::Estraier::VERSION" ); + + my $req; + if ($reqbody) { + $req = HTTP::Request->new(POST => $url); + } else { + $req = HTTP::Request->new(GET => $url); + } + + $req->headers->header( 'Host' => $url->host . ":" . $url->port ); + $req->headers->header( 'Connection', 'close' ); + $req->headers->header( 'Authorization', 'Basic ' . $self->{auth} ); + $req->content_type( $content_type ); + + warn $req->headers->as_string,"\n" if ($self->{debug}); + + if ($reqbody) { + warn "$reqbody\n" if ($self->{debug}); + $req->content( $reqbody ); + } + + my $res = $ua->request($req) || croak "can't make request to $url: $!"; + + warn "## response status: ",$res->status_line,"\n" if ($self->{debug}); + + return -1 if (! $res->is_success); + + ($self->{status}, $self->{status_message}) = split(/\s+/, $res->status_line, 2); + + $$resbody .= $res->content; + + warn "## response body:\n$$resbody\n" if ($resbody && $self->{debug}); + + return $self->{status}; +} + + +=head2 set_snippet_width + +Set width of snippets in results + + $node->set_snippet_width( $wwidth, $hwidth, $awidth ); + +C<$wwidth> specifies whole width of snippet. It's C<480> by default. If it's C<0> snippet +is not sent with results. If it is negative, whole document text is sent instead of snippet. + +C<$hwidth> specified width of strings from beginning of string. Default +value is C<96>. Negative or zero value keep previous value. + +C<$awidth> specifies width of strings around each highlighted word. It's C<96> by default. +If negative of zero value is provided previous value is kept unchanged. + +=cut + +sub set_snippet_width { + my $self = shift; + + my ($wwidth, $hwidth, $awidth) = @_; + $self->{wwidth} = $wwidth; + $self->{hwidth} = $hwidth if ($hwidth >= 0); + $self->{awidth} = $awidth if ($awidth >= 0); } +=head2 set_user + +Manage users of node + + $node->set_user( 'name', $mode ); + +C<$mode> can be one of: + +=over 4 + +=item 0 + +delete account + +=item 1 + +set administrative right for user + +=item 2 + +set user account as guest + +=back + +Return true on success, otherwise false. + +=cut + +sub set_user { + my $self = shift; + my ($name, $mode) = @_; + + return unless ($self->{url}); + croak "mode must be number, not '$mode'" unless ($mode =~ m/^\d+$/); + + $self->shuttle_url( $self->{url} . '/_set_user', + 'text/plain', + 'name=' . uri_escape($name) . '&mode=' . $mode, + undef + ) == 200; +} + + +=head2 set_link + +Manage node links + + $node->set_link('http://localhost:1978/node/another', 'another node label', $credit); + +If C<$credit> is negative, link is removed. + +=cut + +sub set_link { + my $self = shift; + my ($url, $label, $credit) = @_; + + return unless ($self->{url}); + croak "mode credit be number, not '$credit'" unless ($credit =~ m/^\d+$/); + + my $reqbody = 'url=' . uri_escape($url) . '&label=' . uri_escape($label); + $reqbody .= '&credit=' . $credit if ($credit > 0); + + $self->shuttle_url( $self->{url} . '/_set_link', + 'text/plain', + $reqbody, + undef + ) == 200; +} + + +=head1 PRIVATE METHODS + +You could call those directly, but you don't have to. I hope. + +=head2 _set_info + +Set information for node + + $node->_set_info; + +=cut + +sub _set_info { + my $self = shift; + + $self->{status} = -1; + return unless ($self->{url}); + + my $resbody; + my $rv = $self->shuttle_url( $self->{url} . '/inform', + 'text/plain', + undef, + \$resbody, + ); + + return if ($rv != 200 || !$resbody); + + # it seems that response can have multiple line endings + $resbody =~ s/[\r\n]+$//; + + ( $self->{name}, $self->{label}, $self->{dnum}, $self->{wnum}, $self->{size} ) = + split(/\t/, $resbody, 5); + +} ###