--- trunk/Estraier.pm 2006/01/05 21:51:54 36 +++ trunk/Estraier.pm 2006/01/06 01:36:09 45 @@ -27,6 +27,10 @@ =cut +=head1 Inheritable common methods + +These methods should really move somewhere else. + =head2 _s Remove multiple whitespaces from string, as well as whitespaces at beginning or end @@ -56,6 +60,7 @@ This class implements Document which is collection of attributes (key=value), vectors (also key value) display text and hidden text. + =head2 new Create new document, empty or from draft. @@ -175,6 +180,7 @@ push @{ $self->{htexts} }, $self->_s($text); } + =head2 id Get the ID number of document. If the object has never been registered, C<-1> is returned. @@ -188,6 +194,7 @@ return $self->{id}; } + =head2 attr_names Returns array with attribute names from document object. @@ -233,6 +240,7 @@ return @{ $self->{dtexts} }; } + =head2 cat_texts Return whole text as single scalar. @@ -246,6 +254,7 @@ return join(' ',@{ $self->{dtexts} }); } + =head2 dump_draft Dump draft data from document object. @@ -272,12 +281,13 @@ $draft .= "\n"; - $draft .= join("\n", @{ $self->{dtexts} }) . "\n"; - $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n"; + $draft .= join("\n", @{ $self->{dtexts} }) . "\n" if ($self->{dtexts}); + $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n" if ($self->{htexts}); return $draft; } + =head2 delete Empty document object @@ -330,6 +340,7 @@ $self ? 
return $self : return undef; } + =head2 set_phrase $cond->set_phrase('search phrase'); @@ -341,6 +352,7 @@ $self->{phrase} = $self->_s( shift ); } + =head2 add_attr $cond->add_attr('@URI STRINC /~dpavlin/'); @@ -353,6 +365,7 @@ push @{ $self->{attrs} }, $self->_s( $attr ); } + =head2 set_order $cond->set_order('@mdate NUMD'); @@ -364,6 +377,7 @@ $self->{order} = shift; } + =head2 set_max $cond->set_max(42); @@ -373,10 +387,11 @@ sub set_max { my $self = shift; my $max = shift; - croak "set_max needs number" unless ($max =~ m/^\d+$/); + croak "set_max needs number, not '$max'" unless ($max =~ m/^\d+$/); $self->{max} = $max; } + =head2 set_options $cond->set_options( SURE => 1 ); @@ -405,6 +420,7 @@ $self->{options} ||= $options->{$option}; } + =head2 phrase Return search phrase. @@ -418,6 +434,7 @@ return $self->{phrase}; } + =head2 order Return search result order. @@ -431,6 +448,7 @@ return $self->{order}; } + =head2 attrs Return search result attrs. @@ -445,6 +463,7 @@ return @{ $self->{attrs} }; } + =head2 max Return maximum number of results. @@ -460,6 +479,7 @@ return $self->{max}; } + =head2 options Return options for this condition. @@ -511,6 +531,7 @@ $self ? return $self : return undef; } + =head2 uri Return URI of result document @@ -539,6 +560,7 @@ return sort keys %{ $self->{attrs} }; } + =head2 attr Returns value of an attribute. @@ -553,6 +575,7 @@ return $self->{attrs}->{ $name }; } + =head2 snippet Return snippet from result document @@ -566,6 +589,7 @@ return $self->{snippet}; } + =head2 keywords Return keywords from result document @@ -610,6 +634,7 @@ $self ? 
return $self : return undef; } + =head2 doc_num Return number of documents @@ -623,6 +648,7 @@ return $#{$self->{docs}}; } + =head2 get_doc Return single document @@ -636,11 +662,12 @@ sub get_doc { my $self = shift; my $num = shift; - croak "expect number as argument" unless ($num =~ m/^\d+$/); + croak "expect number as argument, not '$num'" unless ($num =~ m/^\d+$/); return undef if ($num < 0 || $num > $self->{docs}); return $self->{docs}->[$num]; } + =head2 hint Return specific hint from results. @@ -661,7 +688,7 @@ package Search::Estraier::Node; -use Carp qw/croak/; +use Carp qw/carp croak confess/; use URI; use MIME::Base64; use IO::Socket::INET; @@ -689,9 +716,15 @@ }; bless($self, $class); + if (@_) { + $self->{debug} = shift; + warn "## Node debug on\n"; + } + $self ? return $self : return undef; } + =head2 set_url Specify URL to node server @@ -705,6 +738,7 @@ $self->{url} = shift; } + =head2 set_proxy Specify proxy server to connect to node server @@ -716,11 +750,12 @@ sub set_proxy { my $self = shift; my ($host,$port) = @_; - croak "proxy port must be number" unless ($port =~ m/^\d+$/); + croak "proxy port must be number, not '$port'" unless ($port =~ m/^\d+$/); $self->{pxhost} = $host; $self->{pxport} = $port; } + =head2 set_timeout Specify timeout of connection in seconds @@ -732,10 +767,11 @@ sub set_timeout { my $self = shift; my $sec = shift; - croak "timeout must be number" unless ($sec =~ m/^\d+$/); + croak "timeout must be number, not '$sec'" unless ($sec =~ m/^\d+$/); $self->{timeout} = $sec; } + =head2 set_auth Specify name and password for authentication to node server. @@ -747,14 +783,17 @@ sub set_auth { my $self = shift; my ($login,$passwd) = @_; - $self->{auth} = encode_base64( "$login:$passwd" ); + my $basic_auth = encode_base64( "$login:$passwd" ); + chomp($basic_auth); + $self->{auth} = $basic_auth; } + =head2 status Return status code of last request. - print $res->status; + print $node->status; C<-1> means connection failure. 
@@ -765,6 +804,246 @@ return $self->{status}; } + +=head2 put_doc + +Add a document + + $node->put_doc( $document_draft ) or die "can't add document"; + +Return true on success or false on failure. + +=cut + +sub put_doc { + my $self = shift; + my $doc = shift || return; + return unless ($self->{url}); + $self->shuttle_url( $self->{url} . '/put_doc', + 'text/x-estraier-draft', + $doc->dump_draft, + undef + ) == 200; +} + + +=head2 out_doc + +Remove a document + + $node->out_doc( document_id ) or die "can't remove document"; + +Return true on success or false on failure. + +=cut + +sub out_doc { + my $self = shift; + my $id = shift || return; + return unless ($self->{url}); + croak "id must be number, not '$id'" unless ($id =~ m/^\d+$/); + $self->shuttle_url( $self->{url} . '/out_doc', + 'application/x-www-form-urlencoded', + "id=$id", + undef + ) == 200; +} + + +=head2 out_doc_by_uri + +Remove a registered document using its URI + + $node->out_doc_by_uri( 'file:///document/uri/42' ) or die "can't remove document"; + +Return true on success or false on failure. + +=cut + +sub out_doc_by_uri { + my $self = shift; + my $uri = shift || return; + return unless ($self->{url}); + $self->shuttle_url( $self->{url} . '/out_doc', + 'application/x-www-form-urlencoded', + "uri=$uri", + undef + ) == 200; +} + + +=head2 edit_doc + +Edit attributes of a document + + $node->edit_doc( $document_draft ) or die "can't edit document"; + +Return true on success or false on failure. + +=cut + +sub edit_doc { + my $self = shift; + my $doc = shift || return; + return unless ($self->{url}); + $self->shuttle_url( $self->{url} . '/edit_doc', + 'text/x-estraier-draft', + $doc->dump_draft, + undef + ) == 200; +} + + +=head2 get_doc + +Retrieve document + + my $doc = $node->get_doc( document_id ) or die "can't get document"; + +Return document on success or false on failure. 
+ +=cut + +sub get_doc { + my $self = shift; + my $id = shift || return; + return $self->_fetch_doc( id => $id ); +} + + +=head2 get_doc_by_uri + +Retrieve document + + my $doc = $node->get_doc_by_uri( 'file:///document/uri/42' ) or die "can't get document"; + +Return document on success or false on failure. + +=cut + +sub get_doc_by_uri { + my $self = shift; + my $uri = shift || return; + return $self->_fetch_doc( uri => $uri ); +} + + +=head2 etch_doc + +Extract document keywords + + my $keywords = $node->etch_doc( document_id ) or die "can't etch document"; + +=cut + +sub erch_doc { + my $self = shift; + my $id = shift || return; + return $self->_fetch_doc( id => $id, etch => 1 ); +} + +=head2 etch_doc_by_uri + +Extract document keywords, selecting the document by its URI + + my $keywords = $node->etch_doc_by_uri( 'file:///document/uri/42' ) or die "can't etch document"; + +Return keywords on success or false on failure. + +=cut + +sub etch_doc_by_uri { + my $self = shift; + my $uri = shift || return; + return $self->_fetch_doc( uri => $uri, etch => 1 ); +} + + +=head2 uri_to_id + +Get ID of document specified by URI + + my $id = $node->uri_to_id( 'file:///document/uri/42' ); + +=cut + +sub uri_to_id { + my $self = shift; + my $uri = shift || return; + return $self->_fetch_doc( uri => $uri, path => '/uri_to_id', chomp_resbody => 1 ); +} + + +=head2 _fetch_doc + +Private function used for implementing of C<get_doc>, C<get_doc_by_uri>, +C<etch_doc>, C<etch_doc_by_uri>. 
+ + # this will decode received draft into Search::Estraier::Document object + my $doc = $node->_fetch_doc( id => 42 ); + my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42' ); + + # to extract keywords, add etch + my $doc = $node->_fetch_doc( id => 42, etch => 1 ); + my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', etch => 1 ); + + # more general form which allows implementation of + # uri_to_id + my $id = $node->_fetch_doc( + uri => 'file:///document/uri/42', + path => '/uri_to_id', + chomp_resbody => 1 + ); + +=cut + +sub _fetch_doc { + my $self = shift; + my $a = {@_}; + return unless ( ($a->{id} || $a->{uri}) && $self->{url} ); + + my ($arg, $resbody); + + my $path = $a->{path} || '/get_doc'; + $path = '/etch_doc' if ($a->{etch}); + + if ($a->{id}) { + croak "id must be numberm not '$a->{id}'" unless ($a->{id} =~ m/^\d+$/); + $arg = 'id=' . $a->{id}; + } elsif ($a->{uri}) { + $arg = 'uri=' . $a->{uri}; + } else { + confess "unhandled argument. Need id or uri."; + } + + my $rv = $self->shuttle_url( $self->{url} . 
$path, + 'application/x-www-form-urlencoded', + $arg, + \$resbody, + ); + + return if ($rv != 200); + + if ($a->{etch}) { + $self->{kwords} = {}; + return +{} unless ($resbody); + foreach my $l (split(/\n/, $resbody)) { + my ($k,$v) = split(/\t/, $l, 2); + $self->{kwords}->{$k} = $v if ($v); + } + return $self->{kwords}; + } elsif ($a->{chomp_resbody}) { + return unless (defined($resbody)); + chomp($resbody); + return $resbody; + } else { + return new Search::Estraier::Document($resbody); + } +} + + + + =head2 shuttle_url This is the method which uses C<IO::Socket::INET> to communicate with Hyper Estraier node @@ -782,12 +1061,18 @@ my ($url, $content_type, $reqbody, $resbody) = @_; - my $status = -1; + $self->{status} = -1; - warn $url; + warn "## $url\n" if ($self->{debug}); $url = new URI($url); - return -1 unless ($url && $url->scheme && $url->scheme eq 'http' && $url->host && $url->port > 1); + if ( + !$url || !$url->scheme || !$url->scheme eq 'http' || + !$url->host || !$url->port || $url->port < 1 + ) { + carp "can't parse $url\n"; + return -1; + } my ($host,$port,$query) = ($url->host, $url->port, $url->path); @@ -796,33 +1081,51 @@ $query = "http://$host:$port/$query"; } - $query .= '?' + $url->query if ($url->query && ! $reqbody); + $query .= '?' . $url->query if ($url->query && ! $reqbody); - my $sock = IO::Socket::INET->new( - PeerAddr => $host, - PeerPort => $port, - Proto => 'tcp', - Timeout => $self->{timeout} || 90, - ) || return -1; + my $headers; if ($reqbody) { - print $sock "POST $query HTTP/1.0\r\n"; + $headers .= "POST $query HTTP/1.0\r\n"; } else { - print $sock "GET $query HTTP/1.0\r\n"; + $headers .= "GET $query HTTP/1.0\r\n"; } - print $sock "Host: $url->host:$url->port\r\n"; - print $sock "Connection: close\r\n"; - print $sock "User-Agent: Search-Estraier/$Search::Estraier::VERSION\r\n"; - print $sock "Content-Type $content_type\r\n"; - print $sock "Authorization: Basic $self->{auth}\r\n"; + $headers .= "Host: " . $url->host . ":" . $url->port . 
"\r\n"; + $headers .= "Connection: close\r\n"; + $headers .= "User-Agent: Search-Estraier/$Search::Estraier::VERSION\r\n"; + $headers .= "Content-Type: $content_type\r\n"; + $headers .= "Authorization: Basic $self->{auth}\r\n"; + my $len = 0; { use bytes; - print $sock "Content-Length: ", length($reqbody), "\r\n"; + $len = length($reqbody) if ($reqbody); } - print $sock "\r\n"; + $headers .= "Content-Length: $len\r\n"; + $headers .= "\r\n"; - print $sock $$reqbody if ($reqbody); + my $sock = IO::Socket::INET->new( + PeerAddr => $host, + PeerPort => $port, + Proto => 'tcp', + Timeout => $self->{timeout} || 90, + ); + + if (! $sock) { + carp "can't open socket to $host:$port"; + return -1; + } + + warn $headers if ($self->{debug}); + + print $sock $headers or + carp "can't send headers to network:\n$headers\n" and return -1; + + if ($reqbody) { + warn "$reqbody\n" if ($self->{debug}); + print $sock $reqbody or + carp "can't send request body to network:\n$$reqbody\n" and return -1; + } my $line = <$sock>; chomp($line); @@ -830,21 +1133,26 @@ return if ($schema !~ /^HTTP/ || ! $res_status); $self->{status} = $res_status; + warn "## response status: $res_status\n" if ($self->{debug}); # skip rest of headers - do { + $line = <$sock>; + while ($line) { $line = <$sock>; - chomp($line); - } until ($line eq ''); + $line =~ s/[\r\n]+$//; + warn "## ", $line || 'NULL', " ##\n" if ($self->{debug}); + }; # read body - my $len = 0; + $len = 0; do { $len = read($sock, my $buf, 8192); $$resbody .= $buf if ($resbody); } while ($len); - return $status; + warn "## response body:\n$$resbody\n" if ($resbody && $self->{debug}); + + return $self->{status}; } ###