--- trunk/Estraier.pm 2006/01/06 00:04:28 43 +++ trunk/Estraier.pm 2006/01/06 14:39:45 53 @@ -645,7 +645,7 @@ sub doc_num { my $self = shift; - return $#{$self->{docs}}; + return $#{$self->{docs}} + 1; } @@ -688,10 +688,11 @@ package Search::Estraier::Node; -use Carp qw/carp croak/; +use Carp qw/carp croak confess/; use URI; use MIME::Base64; use IO::Socket::INET; +use URI::Escape qw/uri_escape/; =head1 Search::Estraier::Node @@ -818,7 +819,7 @@ sub put_doc { my $self = shift; my $doc = shift || return; - return unless ($self->{url}); + return unless ($self->{url} && $doc->isa('Search::Estraier::Document')); $self->shuttle_url( $self->{url} . '/put_doc', 'text/x-estraier-draft', $doc->dump_draft, @@ -854,7 +855,7 @@ Remove a registrated document using it's uri - $node->out_doc_by_uri( 'file:///document_url' ) or "can't remove document"; + $node->out_doc_by_uri( 'file:///document/uri/42' ) or "can't remove document"; Return true on success or false on failture. @@ -866,7 +867,7 @@ return unless ($self->{url}); $self->shuttle_url( $self->{url} . '/out_doc', 'application/x-www-form-urlencoded', - "uri=$uri", + "uri=" . uri_escape($uri), undef ) == 200; } @@ -885,7 +886,7 @@ sub edit_doc { my $self = shift; my $doc = shift || return; - return unless ($self->{url}); + return unless ($self->{url} && $doc->isa('Search::Estraier::Document')); $self->shuttle_url( $self->{url} . '/edit_doc', 'text/x-estraier-draft', $doc->dump_draft, @@ -910,11 +911,12 @@ return $self->_fetch_doc( id => $id ); } + =head2 get_doc_by_uri Retreive document - my $doc = $node->get_doc_by_uri( 'file:///document_uri' ) or die "can't get document"; + my $doc = $node->get_doc_by_uri( 'file:///document/uri/42' ) or die "can't get document"; Return true on success or false on failture. 
@@ -926,32 +928,389 @@
 	return $self->_fetch_doc( uri => $uri );
 }
 
+
+=head2 get_doc_attr
+
+Retrieve the value of a single attribute of the document with the given ID
+
+  my $val = $node->get_doc_attr( document_id, 'attribute_name' ) or
+  	die "can't get document attribute";
+
+=cut
+
+sub get_doc_attr {
+	my $self = shift;
+	my ($id,$name) = @_;
+	return unless ($id && $name);	# both document ID and attribute name are required
+	return $self->_fetch_doc( id => $id, attr => $name );
+}
+
+
+=head2 get_doc_attr_by_uri
+
+Retrieve the value of a single attribute of the document with the given URI
+
+  my $val = $node->get_doc_attr_by_uri( 'file:///document/uri/42', 'attribute_name' ) or
+  	die "can't get document attribute";
+
+=cut
+
+sub get_doc_attr_by_uri {
+	my $self = shift;
+	my ($uri,$name) = @_;
+	return unless ($uri && $name);	# both document URI and attribute name are required
+	return $self->_fetch_doc( uri => $uri, attr => $name );
+}
+
+
+=head2 etch_doc
+
+Extract keywords of the document with the given ID
+
+  my $keywords = $node->etch_doc( document_id ) or die "can't etch document";
+
+=cut
+
+sub etch_doc {
+	my $self = shift;
+	my $id = shift || return;
+	return $self->_fetch_doc( id => $id, etch => 1 );
+}
+
+=head2 etch_doc_by_uri
+
+Extract keywords of the document with the given URI
+
+  my $keywords = $node->etch_doc_by_uri( 'file:///document/uri/42' ) or die "can't etch document";
+
+Returns reference to hash of keywords on success or nothing on failure.
+
+=cut
+
+sub etch_doc_by_uri {
+	my $self = shift;
+	my $uri = shift || return;
+	return $self->_fetch_doc( uri => $uri, etch => 1 );
+}
+
+
+=head2 uri_to_id
+
+Get ID of document specified by URI
+
+  my $id = $node->uri_to_id( 'file:///document/uri/42' );
+
+=cut
+
+sub uri_to_id {
+	my $self = shift;
+	my $uri = shift || return;
+	return $self->_fetch_doc( uri => $uri, path => '/uri_to_id', chomp_resbody => 1 );
+}
+
+
 
 =head2 _fetch_doc
 
-Private function used for implementation of C<get_doc> and C<get_doc_by_uri>.
+Private function used for implementation of C<get_doc>, C<get_doc_by_uri>,
+C<etch_doc>, C<etch_doc_by_uri>.
 
-  my $doc = $node->fetch_doc( id => 42 );
-  my $doc = $node->fetch_doc( uri => 'file://uri/42' );
+  # this will decode received draft into Search::Estraier::Document object
+  my $doc = $node->_fetch_doc( id => 42 );
+  my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42' );
+
+  # to extract keywords (returned as hash reference), add etch
+  my $keywords = $node->_fetch_doc( id => 42, etch => 1 );
+  my $keywords = $node->_fetch_doc( uri => 'file:///document/uri/42', etch => 1 );
+
+  # to get document attribute (returned as string), add attr
+  my $value = $node->_fetch_doc( id => 42, attr => '@mdate' );
+  my $value = $node->_fetch_doc( uri => 'file:///document/uri/42', attr => '@mdate' );
+
+  # more general form which allows implementation of
+  # uri_to_id
+  my $id = $node->_fetch_doc(
+  	uri => 'file:///document/uri/42',
+  	path => '/uri_to_id',
+  	chomp_resbody => 1
+  );
 
 =cut
 
 sub _fetch_doc {
 	my $self = shift;
-	my ($name,$val) = @_;
-	return unless ($name && defined($val) && $self->{url});
-	if ($name eq 'id') {
-		croak "id must be numberm not '$val'" unless ($val =~ m/^\d+$/);
+	my $opts = {@_};	# named arguments; not called $a, which would mask the global used by sort
+	return unless ( ($opts->{id} || $opts->{uri}) && $self->{url} );
+
+	my ($arg, $resbody);
+
+	my $path = $opts->{path} || '/get_doc';
+	$path = '/etch_doc' if ($opts->{etch});
+
+	if ($opts->{id}) {
+		croak "id must be number, not '$opts->{id}'" unless ($opts->{id} =~ m/^\d+$/);
+		$arg = 'id=' . $opts->{id};
+	} elsif ($opts->{uri}) {
+		$arg = 'uri=' . uri_escape($opts->{uri});
+	} else {
+		confess "unhandled argument. Need id or uri.";
+	}
+
+	if ($opts->{attr}) {
+		$path = '/get_doc_attr';	# attr takes precedence over path and etch
+		$arg .= '&attr=' . uri_escape($opts->{attr});
+		$opts->{chomp_resbody} = 1;	# attribute value is returned as plain string
 	}
-	my $rv = $self->shuttle_url( $self->{url} . '/get_doc',
+
+	my $rv = $self->shuttle_url( $self->{url} . $path,
+		'application/x-www-form-urlencoded',
+		$arg,
+		\$resbody,
+	);
+
+	return if ($rv != 200);
+
+	if ($opts->{etch}) {
+		$self->{kwords} = {};
+		return +{} unless ($resbody);
+		foreach my $pair (split(/\n/, $resbody)) {	# one "keyword<TAB>value" pair per line
+			my ($k,$v) = split(/\t/, $pair, 2);
+			$self->{kwords}->{$k} = $v if ($v);	# pairs with false value are skipped
+		}
+		return $self->{kwords};
+	} elsif ($opts->{chomp_resbody}) {
+		return unless (defined($resbody));
+		chomp($resbody);
+		return $resbody;
+	} else {
+		return Search::Estraier::Document->new($resbody);
+	}
+}
+
+
+=head2 name
+
+  my $node_name = $node->name;
+
+=cut
+
+sub name {
+	my $self = shift;
+	$self->set_info unless ($self->{name});	# ask node for its info if name isn't known yet
+	return $self->{name};
+}
+
+
+=head2 label
+
+  my $node_label = $node->label;
+
+=cut
+
+sub label {
+	my $self = shift;
+	$self->set_info unless ($self->{label});	# ask node for its info if label isn't known yet
+	return $self->{label};
+}
+
+
+=head2 doc_num
+
+  my $documents_in_node = $node->doc_num;
+
+=cut
+
+sub doc_num {
+	my $self = shift;
+	$self->set_info if ($self->{dnum} < 0);	# negative value triggers fetch of node info
+	return $self->{dnum};
+}
+
+
+=head2 word_num
+
+  my $words_in_node = $node->word_num;
+
+=cut
+
+sub word_num {
+	my $self = shift;
+	$self->set_info if ($self->{wnum} < 0);	# negative value triggers fetch of node info
+	return $self->{wnum};
+}
+
+
+=head2 size
+
+  my $node_size = $node->size;
+
+=cut
+
+sub size {
+	my $self = shift;
+	$self->set_info if ($self->{size} < 0);	# negative value triggers fetch of node info
+	return $self->{size};
+}
+
+
+=head2 search
+
+Search documents which match condition
+
+  my $nres = $node->search( $cond, $depth );
+
+C<$cond> is C<Search::Estraier::Condition> object, while C<$depth> specifies
+depth for meta search.
+
+Function returns C<Search::Estraier::NodeResult> object.
+
+=cut
+
+sub search {
+	my $self = shift;
+	my ($cond, $depth) = @_;
+	return unless ($cond && defined($depth) && $self->{url});
+	croak "cond must be Search::Estraier::Condition, not '" . (ref($cond) || $cond) . "'" unless (eval { $cond->isa('Search::Estraier::Condition') });
+	croak "depth needs number, not '$depth'" unless ($depth =~ m/^\d+$/);
+
+	my $resbody;
+
+	my $rv = $self->shuttle_url( $self->{url} . '/search',
 		'application/x-www-form-urlencoded',
-		"$name=$val",
-		my $draft,
+		$self->cond_to_query( $cond, $depth ),
+		\$resbody,
 	);
 	return if ($rv != 200);
-	return new Search::Estraier::Document($draft);
+
+	my (@docs, $hints);
+
+	my @lines = split(/\n/, $resbody);
+	return unless (@lines);
+
+	my $border = $lines[0];	# first line of response is the delimiter between parts
+	my $isend = 0;
+	my $lnum = 1;
+
+	while ( $lnum <= $#lines ) {
+		my $line = $lines[$lnum];
+		$lnum++;
+
+		# hints ("name<TAB>value" lines) run up to the next border line
+		if ($line && $line =~ m/^\Q$border\E(:END)*$/) {
+			$isend = $1;
+			last;
+		}
+
+		if ($line =~ /\t/) {
+			my ($k,$v) = split(/\t/, $line, 2);
+			$hints->{$k} = $v;
+		}
+	}
+
+	my $snum = $lnum;
+
+	while( ! $isend && $lnum <= $#lines ) {
+		my $line = $lines[$lnum];
+		# lines $snum .. $lnum-2 collected before a border form one document part
+		$lnum++;
+
+		if ($line && $line =~ m/^\Q$border\E/) {
+			if ($lnum > $snum) {
+				my $rdattrs;
+				my $rdvector;
+				my $rdsnippet;
+
+				my $rlnum = $snum;
+				while ($rlnum < $lnum - 1 ) {
+					# header lines: %-directives (only %VECTOR is kept) and name=value attributes
+					my $rdline = $lines[$rlnum];
+					$rlnum++;
+					last unless ($rdline);
+					if ($rdline =~ /^%/) {
+						$rdvector = $1 if ($rdline =~ /^%VECTOR\t(.+)$/);
+					} elsif($rdline =~ /=/) {
+						$rdattrs->{$1} = $2 if ($rdline =~ /^(.+?)=(.+)$/);	# split on first '=', values (e.g. URIs) may contain '='
+					} else {
+						confess "invalid format of response";
+					}
+				}
+				while($rlnum < $lnum - 1) {
+					my $rdline = $lines[$rlnum];
+					$rlnum++;
+					$rdsnippet .= "$rdline\n";
+				}
+				# parts without @uri attribute are not added to results
+				if (my $rduri = $rdattrs->{'@uri'}) {
+					push @docs, Search::Estraier::ResultDocument->new(
+						uri => $rduri,
+						attrs => $rdattrs,
+						snippet => $rdsnippet,
+						keywords => $rdvector,
+					);
+				}
+			}
+			$snum = $lnum;
+			# border followed by :END terminates the whole response
+			$isend = 1 if ($line =~ /:END$/);
+		}
+
+	}
+
+	if (! $isend) {
+		warn "received result doesn't have :END\n$resbody";
+		return;
+	}
+
+	# hand parsed documents and hints over to result object
+
+	return Search::Estraier::NodeResult->new( docs => \@docs, hints => $hints );
 }
 
+=head2 cond_to_query
+
+  my $args = $node->cond_to_query( $cond, $depth );	# $depth is optional
+
+=cut
+
+sub cond_to_query {
+	my ($self, $cond, $depth) = @_;
+	$depth = $self->{depth} unless (defined($depth));	# keep old behaviour when depth isn't passed
+	return unless ($cond);
+	croak "condition must be Search::Estraier::Condition, not '" . (ref($cond) || $cond) . "'" unless (eval { $cond->isa('Search::Estraier::Condition') });
+
+	my @args;
+
+	if (my $phrase = $cond->phrase) {
+		push @args, 'phrase=' . uri_escape($phrase);
+	}
+
+	if (my @attrs = $cond->attrs) {
+		for my $i ( 0 .. $#attrs ) {
+			push @args,'attr' . ($i+1) . '=' . uri_escape( $attrs[$i] );	# parameters are numbered from attr1
+		}
+	}
+
+	if (my $order = $cond->order) {
+		push @args, 'order=' . uri_escape($order);
+	}
+
+	if (my $max = $cond->max) {
+		push @args, 'max=' . $max;
+	} else {
+		push @args, 'max=' . (1 << 30);	# no limit in condition: ask for a very large maximum
+	}
+
+	if (my $options = $cond->options) {
+		push @args, 'options=' . $options;
+	}
+
+	push @args, 'depth=' . $depth if ($depth);
+	push @args, 'wwidth=' . $self->{wwidth};
+	push @args, 'hwidth=' . $self->{hwidth};
+	push @args, 'awidth=' . $self->{awidth};
+
+	return join('&', @args);
+}
 
 
 =head2 shuttle_url
@@ -959,7 +1318,7 @@
 This is method which uses C to communicate with Hyper Estraier node
 master.
 
-  my $rv = shuttle_url( $url, $content_type, \$req_body, \$resbody );
+  my $rv = shuttle_url( $url, $content_type, $req_body, \$resbody );
 
 C<$resheads> and C<$resbody> booleans controll if response headers and/or
 response body will be saved within object.
@@ -1065,6 +1424,37 @@
 	return $self->{status};
 }
 
+
+=head2 set_info
+
+Request node information from C</inform> and store name, label, document count, word count and size in object
+
+  $node->set_info;
+
+=cut
+
+sub set_info {
+	my $self = shift;
+
+	$self->{status} = -1;
+	return unless ($self->{url});
+
+	my $resbody;
+	my $rv = $self->shuttle_url( $self->{url} . '/inform',
+		'text/plain',
+		undef,
+		\$resbody,
+	);
+
+	return if ($rv != 200 || !$resbody);
+
+	chomp($resbody);
+
+	( $self->{name}, $self->{label}, $self->{dnum}, $self->{wnum}, $self->{size} ) =
+		split(/\t/, $resbody, 5);	# response is a single tab separated line
+
+}
+
 ###
 
 =head1 EXPORT