--- trunk/Estraier.pm 2006/02/19 17:13:57 108 +++ trunk/Estraier.pm 2006/11/05 16:26:57 191 @@ -4,7 +4,7 @@ use strict; use warnings; -our $VERSION = '0.04_2'; +our $VERSION = '0.08'; =head1 NAME @@ -20,7 +20,10 @@ my $node = new Search::Estraier::Node( url => 'http://localhost:1978/node/test', user => 'admin', - passwd => 'admin' + passwd => 'admin', + create => 1, + label => 'Label for node', + croak_on_error => 1, ); # create document @@ -117,9 +120,34 @@ =head1 Search::Estraier::Document -This class implements Document which is collection of attributes -(key=value), vectors (also key value) display text and hidden text. +This class implements Document which is single item in Hyper Estraier. + +It's is collection of: + +=over 4 + +=item attributes + +C<< 'key' => 'value' >> pairs which can later be used for filtering of results + +You can add common filters to C in estmaster's C<_conf> +file for better performance. See C in +L. +=item vectors + +also C<< 'key' => 'value' >> pairs + +=item display text + +Text which will be used to create searchable corpus of your index and +included in snippet output. + +=item hidden text + +Text which will be searchable, but will not be included in snippet. + +=back =head2 new @@ -154,11 +182,15 @@ if ($line =~ m/^%VECTOR\t(.+)$/) { my @fields = split(/\t/, $1); - for my $i ( 0 .. ($#fields - 1) ) { - $self->{kwords}->{ $fields[ $i ] } = $fields[ $i + 1 ]; - $i++; + if ($#fields % 2 == 1) { + $self->{kwords} = { @fields }; + } else { + warn "can't decode $line\n"; } next; + } elsif ($line =~ m/^%SCORE\t(.+)$/) { + $self->{score} = $1; + next; } elsif ($line =~ m/^%/) { # What is this? comment? #warn "$line\n"; @@ -240,6 +272,53 @@ push @{ $self->{htexts} }, $self->_s($text); } +=head2 add_vectors + +Add a vectors + + $doc->add_vector( + 'vector_name' => 42, + 'another' => 12345, + ); + +=cut + +sub add_vectors { + my $self = shift; + return unless (@_); + + # this is ugly, but works + die "add_vector needs HASH as argument" unless ($#_ % 2 == 1); + + $self->{kwords} = {@_}; +} + +=head2 set_score + +Set the substitute score + + $doc->set_score(12345); + +=cut + +sub set_score { + my $self = shift; + my $score = shift; + return unless (defined($score)); + $self->{score} = $score; +} + +=head2 score + +Get the substitute score + +=cut + +sub score { + my $self = shift; + return -1 unless (defined($self->{score})); + return $self->{score}; +} =head2 id @@ -334,13 +413,17 @@ } if ($self->{kwords}) { - $draft .= '%%VECTOR'; + $draft .= '%VECTOR'; while (my ($key, $value) = each %{ $self->{kwords} }) { $draft .= "\t$key\t$value"; } $draft .= "\n"; } + if (defined($self->{score}) && $self->{score} >= 0) { + $draft .= "%SCORE\t" . $self->{score} . "\n"; + } + $draft .= "\n"; $draft .= join("\n", @{ $self->{dtexts} }) . "\n" if ($self->{dtexts}); @@ -599,6 +682,76 @@ } +=head2 set_skip + +Set number of skipped documents from beginning of results + + $cond->set_skip(42); + +Similar to C in RDBMS. + +=cut + +sub set_skip { + my $self = shift; + $self->{skip} = shift; +} + +=head2 skip + +Return skip for this condition. + + print $cond->skip; + +=cut + +sub skip { + my $self = shift; + return $self->{skip}; +} + + +=head2 set_distinct + + $cond->set_distinct('@author'); + +=cut + +sub set_distinct { + my $self = shift; + $self->{distinct} = shift; +} + +=head2 distinct + +Return distinct attribute + + print $cond->distinct; + +=cut + +sub distinct { + my $self = shift; + return $self->{distinct}; +} + +=head2 set_mask + +Filter out some links when searching. + +Argument array of link numbers, starting with 0 (current node). + + $cond->set_mask(qw/0 1 4/); + +=cut + +sub set_mask { + my $self = shift; + return unless (@_); + $self->{mask} = \@_; +} + + package Search::Estraier::ResultDocument; use Carp qw/croak/; @@ -843,6 +996,10 @@ my $node = new Search::HyperEstraier::Node( url => 'http://localhost:1978/node/test', + user => 'admin', + passwd => 'admin' + create => 1, + label => 'optional node label', debug => 1, croak_on_error => 1 ); @@ -855,6 +1012,22 @@ URL to node +=item user + +specify username for node server authentication + +=item passwd + +password for authentication + +=item create + +create node if it doesn't exists + +=item label + +optional label for new node if C is used + =item debug dumps a B of debugging output @@ -874,26 +1047,43 @@ my $self = { pxport => -1, timeout => 0, # this used to be -1 - dnum => -1, - wnum => -1, - size => -1.0, wwidth => 480, hwidth => 96, awidth => 96, status => -1, }; + bless($self, $class); if ($#_ == 0) { $self->{url} = shift; } else { - my $args = {@_}; - %$self = ( %$self, @_ ); + $self->set_auth( $self->{user}, $self->{passwd} ) if ($self->{user}); + warn "## Node debug on\n" if ($self->{debug}); } + $self->{inform} = { + dnum => -1, + wnum => -1, + size => -1.0, + }; + + if ($self->{create}) { + if (! eval { $self->name } || $@) { + my $name = $1 if ($self->{url} =~ m#/node/([^/]+)/*#); + croak "can't find node name in '$self->{url}'" unless ($name); + my $label = $self->{label} || $name; + $self->master( + action => 'nodeadd', + name => $name, + label => $label, + ) || croak "can't create node $name ($label)"; + } + } + $self ? return $self : return undef; } @@ -984,7 +1174,7 @@ $node->put_doc( $document_draft ) or die "can't add document"; -Return true on success or false on failture. +Return true on success or false on failure. =cut @@ -992,11 +1182,15 @@ my $self = shift; my $doc = shift || return; return unless ($self->{url} && $doc->isa('Search::Estraier::Document')); - $self->shuttle_url( $self->{url} . '/put_doc', + if ($self->shuttle_url( $self->{url} . '/put_doc', 'text/x-estraier-draft', $doc->dump_draft, undef - ) == 200; + ) == 200) { + $self->_clear_info; + return 1; + } + return undef; } @@ -1015,11 +1209,15 @@ my $id = shift || return; return unless ($self->{url}); croak "id must be number, not '$id'" unless ($id =~ m/^\d+$/); - $self->shuttle_url( $self->{url} . '/out_doc', + if ($self->shuttle_url( $self->{url} . '/out_doc', 'application/x-www-form-urlencoded', "id=$id", undef - ) == 200; + ) == 200) { + $self->_clear_info; + return 1; + } + return undef; } @@ -1037,11 +1235,15 @@ my $self = shift; my $uri = shift || return; return unless ($self->{url}); - $self->shuttle_url( $self->{url} . '/out_doc', + if ($self->shuttle_url( $self->{url} . '/out_doc', 'application/x-www-form-urlencoded', "uri=" . uri_escape($uri), undef - ) == 200; + ) == 200) { + $self->_clear_info; + return 1; + } + return undef; } @@ -1059,11 +1261,15 @@ my $self = shift; my $doc = shift || return; return unless ($self->{url} && $doc->isa('Search::Estraier::Document')); - $self->shuttle_url( $self->{url} . '/edit_doc', + if ($self->shuttle_url( $self->{url} . '/edit_doc', 'text/x-estraier-draft', $doc->dump_draft, undef - ) == 200; + ) == 200) { + $self->_clear_info; + return 1; + } + return undef; } @@ -1221,7 +1427,7 @@ $path = '/etch_doc' if ($a->{etch}); if ($a->{id}) { - croak "id must be numberm not '$a->{id}'" unless ($a->{id} =~ m/^\d+$/); + croak "id must be number not '$a->{id}'" unless ($a->{id} =~ m/^\d+$/); $arg = 'id=' . $a->{id}; } elsif ($a->{uri}) { $arg = 'uri=' . uri_escape($a->{uri}); @@ -1270,8 +1476,8 @@ sub name { my $self = shift; - $self->_set_info unless ($self->{name}); - return $self->{name}; + $self->_set_info unless ($self->{inform}->{name}); + return $self->{inform}->{name}; } @@ -1283,8 +1489,8 @@ sub label { my $self = shift; - $self->_set_info unless ($self->{label}); - return $self->{label}; + $self->_set_info unless ($self->{inform}->{label}); + return $self->{inform}->{label}; } @@ -1296,8 +1502,8 @@ sub doc_num { my $self = shift; - $self->_set_info if ($self->{dnum} < 0); - return $self->{dnum}; + $self->_set_info if ($self->{inform}->{dnum} < 0); + return $self->{inform}->{dnum}; } @@ -1309,8 +1515,8 @@ sub word_num { my $self = shift; - $self->_set_info if ($self->{wnum} < 0); - return $self->{wnum}; + $self->_set_info if ($self->{inform}->{wnum} < 0); + return $self->{inform}->{wnum}; } @@ -1322,8 +1528,8 @@ sub size { my $self = shift; - $self->_set_info if ($self->{size} < 0); - return $self->{size}; + $self->_set_info if ($self->{inform}->{size} < 0); + return $self->{inform}->{size}; } @@ -1356,88 +1562,32 @@ ); return if ($rv != 200); - my (@docs, $hints); - - my @lines = split(/\n/, $resbody); - return unless (@lines); - - my $border = $lines[0]; - my $isend = 0; - my $lnum = 1; - - while ( $lnum <= $#lines ) { - my $line = $lines[$lnum]; - $lnum++; - - #warn "## $line\n"; - if ($line && $line =~ m/^\Q$border\E(:END)*$/) { - $isend = $1; - last; - } - - if ($line =~ /\t/) { - my ($k,$v) = split(/\t/, $line, 2); - $hints->{$k} = $v; - } - } - - my $snum = $lnum; - - while( ! $isend && $lnum <= $#lines ) { - my $line = $lines[$lnum]; - #warn "# $lnum: $line\n"; - $lnum++; - - if ($line && $line =~ m/^\Q$border\E/) { - if ($lnum > $snum) { - my $rdattrs; - my $rdvector; - my $rdsnippet; - - my $rlnum = $snum; - while ($rlnum < $lnum - 1 ) { - #my $rdline = $self->_s($lines[$rlnum]); - my $rdline = $lines[$rlnum]; - $rlnum++; - last unless ($rdline); - if ($rdline =~ /^%/) { - $rdvector = $1 if ($rdline =~ /^%VECTOR\t(.+)$/); - } elsif($rdline =~ /=/) { - $rdattrs->{$1} = $2 if ($rdline =~ /^(.+)=(.+)$/); - } else { - confess "invalid format of response"; - } - } - while($rlnum < $lnum - 1) { - my $rdline = $lines[$rlnum]; - $rlnum++; - $rdsnippet .= "$rdline\n"; - } - #warn Dumper($rdvector, $rdattrs, $rdsnippet); - if (my $rduri = $rdattrs->{'@uri'}) { - push @docs, new Search::Estraier::ResultDocument( - uri => $rduri, - attrs => $rdattrs, - snippet => $rdsnippet, - keywords => $rdvector, - ); - } - } - $snum = $lnum; - #warn "### $line\n"; - $isend = 1 if ($line =~ /:END$/); - } - + my @records = split /--------\[.*?\]--------(?::END)?\r?\n/, $resbody; + my $hintsText = splice @records, 0, 2; # starts with empty record + my $hints = { $hintsText =~ m/^(.*?)\t(.*?)$/gsm }; + + # process records + my $docs = []; + foreach my $record (@records) + { + # split into keys and snippets + my ($keys, $snippet) = $record =~ m/^(.*?)\n\n(.*?)$/s; + + # create document hash + my $doc = { $keys =~ m/^(.*?)=(.*?)$/gsm }; + $doc->{'@keywords'} = $doc->{keywords}; + ($doc->{keywords}) = $keys =~ m/^%VECTOR\t(.*?)$/gm; + $doc->{snippet} = $snippet; + + push @$docs, new Search::Estraier::ResultDocument( + attrs => $doc, + uri => $doc->{'@uri'}, + snippet => $snippet, + keywords => $doc->{'keywords'}, + ); } - if (! $isend) { - warn "received result doesn't have :END\n$resbody"; - return; - } - - #warn Dumper(\@docs, $hints); - - return new Search::Estraier::NodeResult( docs => \@docs, hints => $hints ); + return new Search::Estraier::NodeResult( docs => $docs, hints => $hints ); } @@ -1486,6 +1636,18 @@ push @args, 'wwidth=' . $self->{wwidth}; push @args, 'hwidth=' . $self->{hwidth}; push @args, 'awidth=' . $self->{awidth}; + push @args, 'skip=' . $cond->{skip} if ($cond->{skip}); + + if (my $distinct = $cond->distinct) { + push @args, 'distinct=' . uri_escape($distinct); + } + + if ($cond->{mask}) { + my $mask = 0; + map { $mask += ( 2 ** $_ ) } @{ $cond->{mask} }; + + push @args, 'mask=' . $mask if ($mask); + } return join('&', @args); } @@ -1632,7 +1794,7 @@ croak "mode must be number, not '$mode'" unless ($mode =~ m/^\d+$/); $self->shuttle_url( $self->{url} . '/_set_user', - 'text/plain', + 'application/x-www-form-urlencoded', 'name=' . uri_escape($name) . '&mode=' . $mode, undef ) == 200; @@ -1665,9 +1827,10 @@ undef ) == 200) { # refresh node info after adding link - $self->_set_info; + $self->_clear_info; return 1; } + return undef; } =head2 admins @@ -1680,8 +1843,8 @@ sub admins { my $self = shift; - $self->_set_info unless ($self->{name}); - return $self->{admins}; + $self->_set_info unless ($self->{inform}->{name}); + return $self->{inform}->{admins}; } =head2 guests @@ -1694,8 +1857,8 @@ sub guests { my $self = shift; - $self->_set_info unless ($self->{name}); - return $self->{guests}; + $self->_set_info unless ($self->{inform}->{name}); + return $self->{inform}->{guests}; } =head2 links @@ -1708,10 +1871,168 @@ sub links { my $self = shift; - $self->_set_info unless ($self->{name}); - return $self->{links}; + $self->_set_info unless ($self->{inform}->{name}); + return $self->{inform}->{links}; } +=head2 cacheusage + +Return cache usage for a node + + my $cache = $node->cacheusage; + +=cut + +sub cacheusage { + my $self = shift; + + return unless ($self->{url}); + + my $resbody; + my $rv = $self->shuttle_url( $self->{url} . '/cacheusage', + 'text/plain', + undef, + \$resbody, + ); + + return if ($rv != 200 || !$resbody); + + return $resbody; +} + +=head2 master + +Set actions on Hyper Estraier node master (C process) + + $node->master( + action => 'sync' + ); + +All available actions are documented in +L + +=cut + +my $estmaster_rest = { + shutdown => { + status => 202, + }, + sync => { + status => 202, + }, + backup => { + status => 202, + }, + userlist => { + status => 200, + returns => [ qw/name passwd flags fname misc/ ], + }, + useradd => { + required => [ qw/name passwd flags/ ], + optional => [ qw/fname misc/ ], + status => 200, + }, + userdel => { + required => [ qw/name/ ], + status => 200, + }, + nodelist => { + status => 200, + returns => [ qw/name label doc_num word_num size/ ], + }, + nodeadd => { + required => [ qw/name/ ], + optional => [ qw/label/ ], + status => 200, + }, + nodedel => { + required => [ qw/name/ ], + status => 200, + }, + nodeclr => { + required => [ qw/name/ ], + status => 200, + }, + nodertt => { + status => 200, + }, +}; + +sub master { + my $self = shift; + + my $args = {@_}; + + # have action? + my $action = $args->{action} || croak "need action, available: ", + join(", ",keys %{ $estmaster_rest }); + + # check if action is valid + my $rest = $estmaster_rest->{$action}; + croak "action '$action' is not supported, available actions: ", + join(", ",keys %{ $estmaster_rest }) unless ($rest); + + croak "BUG: action '$action' needs return status" unless ($rest->{status}); + + my @args; + + if ($rest->{required} || $rest->{optional}) { + + map { + croak "need parametar '$_' for action '$action'" unless ($args->{$_}); + push @args, $_ . '=' . uri_escape( $args->{$_} ); + } ( @{ $rest->{required} } ); + + map { + push @args, $_ . '=' . uri_escape( $args->{$_} ) if ($args->{$_}); + } ( @{ $rest->{optional} } ); + + } + + my $uri = new URI( $self->{url} ); + + my $resbody; + + my $status = $self->shuttle_url( + 'http://' . $uri->host_port . '/master?action=' . $action , + 'application/x-www-form-urlencoded', + join('&', @args), + \$resbody, + 1, + ) or confess "shuttle_url failed"; + + if ($status == $rest->{status}) { + + # refresh node info after sync + $self->_clear_info if ($action eq 'sync' || $action =~ m/^node(?:add|del|clr)$/); + + if ($rest->{returns} && wantarray) { + + my @results; + my $fields = $#{$rest->{returns}}; + + foreach my $line ( split(/[\r\n]/,$resbody) ) { + my @e = split(/\t/, $line, $fields + 1); + my $row; + foreach my $i ( 0 .. $fields) { + $row->{ $rest->{returns}->[$i] } = $e[ $i ]; + } + push @results, $row; + } + + return @results; + + } elsif ($resbody) { + chomp $resbody; + return $resbody; + } else { + return 0E0; + } + } + + carp "expected status $rest->{status}, but got $status"; + return undef; +} =head1 PRIVATE METHODS @@ -1741,30 +2062,51 @@ return if ($rv != 200 || !$resbody); my @lines = split(/[\r\n]/,$resbody); - - ( $self->{name}, $self->{label}, $self->{dnum}, $self->{wnum}, $self->{size} ) = - split(/\t/, shift @lines, 5); + + $self->_clear_info; + + ( $self->{inform}->{name}, $self->{inform}->{label}, $self->{inform}->{dnum}, + $self->{inform}->{wnum}, $self->{inform}->{size} ) = split(/\t/, shift @lines, 5); return $resbody unless (@lines); shift @lines; while(my $admin = shift @lines) { - push @{$self->{admins}}, $admin; + push @{$self->{inform}->{admins}}, $admin; } - + while(my $guest = shift @lines) { - push @{$self->{guests}}, $guest; + push @{$self->{inform}->{guests}}, $guest; } while(my $link = shift @lines) { - push @{$self->{links}}, $link; + push @{$self->{inform}->{links}}, $link; } return $resbody; } +=head2 _clear_info + +Clear information for node + + $node->_clear_info; + +On next call to C, C