/[Search-Estraier]/trunk/lib/Search/Estraier.pm
This is a repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/lib/Search/Estraier.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 81 by dpavlin, Tue Jan 17 00:03:45 2006 UTC revision 122 by dpavlin, Tue May 2 10:19:47 2006 UTC
# Line 4  use 5.008; Line 4  use 5.008;
4  use strict;  use strict;
5  use warnings;  use warnings;
6    
7  our $VERSION = '0.04_1';  our $VERSION = '0.05_1';
8    
9  =head1 NAME  =head1 NAME
10    
# Line 17  Search::Estraier - pure perl module to u Line 17  Search::Estraier - pure perl module to u
17          use Search::Estraier;          use Search::Estraier;
18    
19          # create and configure node          # create and configure node
20          my $node = new Search::Estraier::Node;          my $node = new Search::Estraier::Node(
21          $node->set_url("http://localhost:1978/node/test");                  url => 'http://localhost:1978/node/test',
22          $node->set_auth("admin","admin");                  user => 'admin',
23                    passwd => 'admin'
24            );
25    
26          # create document          # create document
27          my $doc = new Search::Estraier::Document;          my $doc = new Search::Estraier::Document;
# Line 32  Search::Estraier - pure perl module to u Line 34  Search::Estraier - pure perl module to u
34          $doc->add_text("Somewhere over the rainbow.  Way up high.");          $doc->add_text("Somewhere over the rainbow.  Way up high.");
35          $doc->add_text("There's a land that I heard of once in a lullaby.");          $doc->add_text("There's a land that I heard of once in a lullaby.");
36    
37          die "error: ", $node->status,"\n" unless ($node->put_doc($doc));          die "error: ", $node->status,"\n" unless (eval { $node->put_doc($doc) });
38    
39  =head2 Simple searcher  =head2 Simple searcher
40    
41          use Search::Estraier;          use Search::Estraier;
42    
43          # create and configure node          # create and configure node
44          my $node = new Search::Estraier::Node;          my $node = new Search::Estraier::Node(
45          $node->set_url("http://localhost:1978/node/test");                  url => 'http://localhost:1978/node/test',
46          $node->set_auth("admin","admin");                  user => 'admin',
47                    passwd => 'admin',
48                    croak_on_error => 1,
49            );
50    
51          # create condition          # create condition
52          my $cond = new Search::Estraier::Condition;          my $cond = new Search::Estraier::Condition;
# Line 50  Search::Estraier - pure perl module to u Line 55  Search::Estraier - pure perl module to u
55          $cond->set_phrase("rainbow AND lullaby");          $cond->set_phrase("rainbow AND lullaby");
56    
57          my $nres = $node->search($cond, 0);          my $nres = $node->search($cond, 0);
58    
59          if (defined($nres)) {          if (defined($nres)) {
60                    print "Got ", $nres->hits, " results\n";
61    
62                  # for each document in results                  # for each document in results
63                  for my $i ( 0 ... $nres->doc_num - 1 ) {                  for my $i ( 0 ... $nres->doc_num - 1 ) {
64                          # get result document                          # get result document
# Line 92  Remove multiple whitespaces from string, Line 100  Remove multiple whitespaces from string,
100  =cut  =cut
101    
102  sub _s {  sub _s {
103          my $text = $_[1] || return;          my $text = $_[1];
104            return unless defined($text);
105          $text =~ s/\s\s+/ /gs;          $text =~ s/\s\s+/ /gs;
106          $text =~ s/^\s+//;          $text =~ s/^\s+//;
107          $text =~ s/\s+$//;          $text =~ s/\s+$//;
# Line 320  sub dump_draft { Line 329  sub dump_draft {
329          my $draft;          my $draft;
330    
331          foreach my $attr_name (sort keys %{ $self->{attrs} }) {          foreach my $attr_name (sort keys %{ $self->{attrs} }) {
332                  $draft .= $attr_name . '=' . $self->{attrs}->{$attr_name} . "\n";                  next unless defined(my $v = $self->{attrs}->{$attr_name});
333                    $draft .= $attr_name . '=' . $v . "\n";
334          }          }
335    
336          if ($self->{kwords}) {          if ($self->{kwords}) {
# Line 368  sub delete { Line 378  sub delete {
378    
379  package Search::Estraier::Condition;  package Search::Estraier::Condition;
380    
381  use Carp qw/confess croak/;  use Carp qw/carp confess croak/;
382    
383  use Search::Estraier;  use Search::Estraier;
384  our @ISA = qw/Search::Estraier/;  our @ISA = qw/Search::Estraier/;
# Line 446  sub set_max { Line 456  sub set_max {
456    
457  =head2 set_options  =head2 set_options
458    
459    $cond->set_options( SURE => 1 );    $cond->set_options( 'SURE' );
460    
461      $cond->set_options( qw/AGITO NOIDF SIMPLE/ );
462    
463    Possible options are:
464    
465    =over 8
466    
467    =item SURE
468    
469    check every N-gram
470    
471    =item USUAL
472    
473    check every second N-gram
474    
475    =item FAST
476    
477    check every third N-gram
478    
479    =item AGITO
480    
481    check every fourth N-gram
482    
483    =item NOIDF
484    
485    don't perform TF-IDF tuning
486    
487    =item SIMPLE
488    
489    use simplified query phrase
490    
491    =back
492    
493    Skipping N-grams will speed up search, but reduce accuracy. Every call to C<set_options> will reset previous
494    options.
495    
496    This option changed in version C<0.04> of this module. It's backwards compatible.
497    
498  =cut  =cut
499    
500  my $options = {  my $options = {
         # check N-gram keys skipping by three  
501          SURE => 1 << 0,          SURE => 1 << 0,
         # check N-gram keys skipping by two  
502          USUAL => 1 << 1,          USUAL => 1 << 1,
         # without TF-IDF tuning  
503          FAST => 1 << 2,          FAST => 1 << 2,
         # with the simplified phrase  
504          AGITO => 1 << 3,          AGITO => 1 << 3,
         # check every N-gram key  
505          NOIDF => 1 << 4,          NOIDF => 1 << 4,
         # check N-gram keys skipping by one  
506          SIMPLE => 1 << 10,          SIMPLE => 1 << 10,
507  };  };
508    
509  sub set_options {  sub set_options {
510          my $self = shift;          my $self = shift;
511          my $option = shift;          my $opt = 0;
512          confess "unknown option" unless ($options->{$option});          foreach my $option (@_) {
513          $self->{options} ||= $options->{$option};                  my $mask;
514                    unless ($mask = $options->{$option}) {
515                            if ($option eq '1') {
516                                    next;
517                            } else {
518                                    croak "unknown option $option";
519                            }
520                    }
521                    $opt += $mask;
522            }
523            $self->{options} = $opt;
524  }  }
525    
526    
# Line 548  sub options { Line 599  sub options {
599  }  }
600    
601    
602    =head2 set_skip
603    
604    Set number of skipped documents from beginning of results
605    
606      $cond->set_skip(42);
607    
608    Similar to C<offset> in RDBMS.
609    
610    =cut
611    
612    sub set_skip {
613            my $self = shift;
614            $self->{skip} = shift;
615    }
616    
617    =head2 skip
618    
619    Return skip for this condition.
620    
621      print $cond->skip;
622    
623    =cut
624    
625    sub skip {
626            my $self = shift;
627            return $self->{skip};
628    }
629    
630    
631  package Search::Estraier::ResultDocument;  package Search::Estraier::ResultDocument;
632    
633  use Carp qw/croak/;  use Carp qw/croak/;
# Line 691  Return number of documents Line 771  Return number of documents
771    
772    print $res->doc_num;    print $res->doc_num;
773    
774    This will return real number of documents (limited by C<max>).
775    If you want to get total number of hits, see C<hits>.
776    
777  =cut  =cut
778    
779  sub doc_num {  sub doc_num {
# Line 722  sub get_doc { Line 805  sub get_doc {
805    
806  Return specific hint from results.  Return specific hint from results.
807    
808    print $rec->hint( 'VERSION' );    print $res->hint( 'VERSION' );
809    
810  Possible hints are: C<VERSION>, C<NODE>, C<HIT>, C<HINT#n>, C<DOCNUM>, C<WORDNUM>,  Possible hints are: C<VERSION>, C<NODE>, C<HIT>, C<HINT#n>, C<DOCNUM>, C<WORDNUM>,
811  C<TIME>, C<LINK#n>, C<VIEW>.  C<TIME>, C<LINK#n>, C<VIEW>.
# Line 735  sub hint { Line 818  sub hint {
818          return $self->{hints}->{$key};          return $self->{hints}->{$key};
819  }  }
820    
821    =head2 hints
822    
823    More perlish version of C<hint>. This one returns a reference to the hash of all hints.
824    
825      my %hints = %{ $res->hints };
826    
827    =cut
828    
829    sub hints {
830            my $self = shift;
831            return $self->{hints};
832    }
833    
834    =head2 hits
835    
836    Syntactic sugar for total number of hits for this query
837    
838      print $res->hits;
839    
840    It's same as
841    
842      print $res->hint('HIT');
843    
844    but shorter.
845    
846    =cut
847    
848    sub hits {
849            my $self = shift;
850            return $self->{hints}->{'HIT'} || 0;
851    }
852    
853  package Search::Estraier::Node;  package Search::Estraier::Node;
854    
# Line 789  sub new { Line 903  sub new {
903          my $self = {          my $self = {
904                  pxport => -1,                  pxport => -1,
905                  timeout => 0,   # this used to be -1                  timeout => 0,   # this used to be -1
                 dnum => -1,  
                 wnum => -1,  
                 size => -1.0,  
906                  wwidth => 480,                  wwidth => 480,
907                  hwidth => 96,                  hwidth => 96,
908                  awidth => 96,                  awidth => 96,
909                  status => -1,                  status => -1,
910          };          };
911    
912          bless($self, $class);          bless($self, $class);
913    
914          if ($#_ == 0) {          if ($#_ == 0) {
# Line 809  sub new { Line 921  sub new {
921                  warn "## Node debug on\n" if ($self->{debug});                  warn "## Node debug on\n" if ($self->{debug});
922          }          }
923    
924            $self->{inform} = {
925                    dnum => -1,
926                    wnum => -1,
927                    size => -1.0,
928            };
929    
930          $self ? return $self : return undef;          $self ? return $self : return undef;
931  }  }
932    
# Line 1087  Get ID of document specified by URI Line 1205  Get ID of document specified by URI
1205    
1206    my $id = $node->uri_to_id( 'file:///document/uri/42' );    my $id = $node->uri_to_id( 'file:///document/uri/42' );
1207    
1208    This method won't croak, even if using C<croak_on_error>.
1209    
1210  =cut  =cut
1211    
1212  sub uri_to_id {  sub uri_to_id {
1213          my $self = shift;          my $self = shift;
1214          my $uri = shift || return;          my $uri = shift || return;
1215          return $self->_fetch_doc( uri => $uri, path => '/uri_to_id', chomp_resbody => 1 );          return $self->_fetch_doc( uri => $uri, path => '/uri_to_id', chomp_resbody => 1, croak_on_error => 0 );
1216  }  }
1217    
1218    
# Line 1152  sub _fetch_doc { Line 1272  sub _fetch_doc {
1272                  'application/x-www-form-urlencoded',                  'application/x-www-form-urlencoded',
1273                  $arg,                  $arg,
1274                  \$resbody,                  \$resbody,
1275                    $a->{croak_on_error},
1276          );          );
1277    
1278          return if ($rv != 200);          return if ($rv != 200);
# Line 1182  sub _fetch_doc { Line 1303  sub _fetch_doc {
1303    
1304  sub name {  sub name {
1305          my $self = shift;          my $self = shift;
1306          $self->_set_info unless ($self->{name});          $self->_set_info unless ($self->{inform}->{name});
1307          return $self->{name};          return $self->{inform}->{name};
1308  }  }
1309    
1310    
# Line 1195  sub name { Line 1316  sub name {
1316    
1317  sub label {  sub label {
1318          my $self = shift;          my $self = shift;
1319          $self->_set_info unless ($self->{label});          $self->_set_info unless ($self->{inform}->{label});
1320          return $self->{label};          return $self->{inform}->{label};
1321  }  }
1322    
1323    
# Line 1208  sub label { Line 1329  sub label {
1329    
1330  sub doc_num {  sub doc_num {
1331          my $self = shift;          my $self = shift;
1332          $self->_set_info if ($self->{dnum} < 0);          $self->_set_info if ($self->{inform}->{dnum} < 0);
1333          return $self->{dnum};          return $self->{inform}->{dnum};
1334  }  }
1335    
1336    
# Line 1221  sub doc_num { Line 1342  sub doc_num {
1342    
1343  sub word_num {  sub word_num {
1344          my $self = shift;          my $self = shift;
1345          $self->_set_info if ($self->{wnum} < 0);          $self->_set_info if ($self->{inform}->{wnum} < 0);
1346          return $self->{wnum};          return $self->{inform}->{wnum};
1347  }  }
1348    
1349    
# Line 1234  sub word_num { Line 1355  sub word_num {
1355    
1356  sub size {  sub size {
1357          my $self = shift;          my $self = shift;
1358          $self->_set_info if ($self->{size} < 0);          $self->_set_info if ($self->{inform}->{size} < 0);
1359          return $self->{size};          return $self->{inform}->{size};
1360  }  }
1361    
1362    
# Line 1398  sub cond_to_query { Line 1519  sub cond_to_query {
1519          push @args, 'wwidth=' . $self->{wwidth};          push @args, 'wwidth=' . $self->{wwidth};
1520          push @args, 'hwidth=' . $self->{hwidth};          push @args, 'hwidth=' . $self->{hwidth};
1521          push @args, 'awidth=' . $self->{awidth};          push @args, 'awidth=' . $self->{awidth};
1522            push @args, 'skip=' . $self->{skip} if ($self->{skip});
1523    
1524          return join('&', @args);          return join('&', @args);
1525  }  }
# Line 1420  use LWP::UserAgent; Line 1542  use LWP::UserAgent;
1542  sub shuttle_url {  sub shuttle_url {
1543          my $self = shift;          my $self = shift;
1544    
1545          my ($url, $content_type, $reqbody, $resbody) = @_;          my ($url, $content_type, $reqbody, $resbody, $croak_on_error) = @_;
1546    
1547            $croak_on_error = $self->{croak_on_error} unless defined($croak_on_error);
1548    
1549          $self->{status} = -1;          $self->{status} = -1;
1550    
# Line 1464  sub shuttle_url { Line 1588  sub shuttle_url {
1588          ($self->{status}, $self->{status_message}) = split(/\s+/, $res->status_line, 2);          ($self->{status}, $self->{status_message}) = split(/\s+/, $res->status_line, 2);
1589    
1590          if (! $res->is_success) {          if (! $res->is_success) {
1591                  if ($self->{croak_on_error}) {                  if ($croak_on_error) {
1592                          croak("can't get $url: ",$res->status_line);                          croak("can't get $url: ",$res->status_line);
1593                  } else {                  } else {
1594                          return -1;                          return -1;
# Line 1569  sub set_link { Line 1693  sub set_link {
1693          my $reqbody = 'url=' . uri_escape($url) . '&label=' . uri_escape($label);          my $reqbody = 'url=' . uri_escape($url) . '&label=' . uri_escape($label);
1694          $reqbody .= '&credit=' . $credit if ($credit > 0);          $reqbody .= '&credit=' . $credit if ($credit > 0);
1695    
1696          $self->shuttle_url( $self->{url} . '/_set_link',          if ($self->shuttle_url( $self->{url} . '/_set_link',
1697                  'application/x-www-form-urlencoded',                  'application/x-www-form-urlencoded',
1698                  $reqbody,                  $reqbody,
1699                  undef                  undef
1700          ) == 200;          ) == 200) {
1701                    # refresh node info after adding link
1702                    $self->_set_info;
1703                    return 1;
1704            }
1705    }
1706    
1707    =head2 admins
1708    
1709     my @admins = @{ $node->admins };
1710    
1711    Return array of users with admin rights on node
1712    
1713    =cut
1714    
1715    sub admins {
1716            my $self = shift;
1717            $self->_set_info unless ($self->{inform}->{name});
1718            return $self->{inform}->{admins};
1719    }
1720    
1721    =head2 guests
1722    
1723     my @guests = @{ $node->guests };
1724    
1725    Return array of users with guest rights on node
1726    
1727    =cut
1728    
1729    sub guests {
1730            my $self = shift;
1731            $self->_set_info unless ($self->{inform}->{name});
1732            return $self->{inform}->{guests};
1733    }
1734    
1735    =head2 links
1736    
1737     my @links = @{ $node->links };
1738    
1739    Return array of links for this node
1740    
1741    =cut
1742    
1743    sub links {
1744            my $self = shift;
1745            $self->_set_info unless ($self->{inform}->{name});
1746            return $self->{inform}->{links};
1747  }  }
1748    
1749    
# Line 1604  sub _set_info { Line 1774  sub _set_info {
1774    
1775          return if ($rv != 200 || !$resbody);          return if ($rv != 200 || !$resbody);
1776    
1777          # it seems that response can have multiple line endings          my @lines = split(/[\r\n]/,$resbody);
1778          $resbody =~ s/[\r\n]+$//;  
1779            $self->{inform} = {};
1780    
1781            ( $self->{inform}->{name}, $self->{inform}->{label}, $self->{inform}->{dnum},
1782                    $self->{inform}->{wnum}, $self->{inform}->{size} ) = split(/\t/, shift @lines, 5);
1783    
1784            return $resbody unless (@lines);
1785    
1786            shift @lines;
1787    
1788            while(my $admin = shift @lines) {
1789                    push @{$self->{inform}->{admins}}, $admin;
1790            }
1791    
1792            while(my $guest = shift @lines) {
1793                    push @{$self->{inform}->{guests}}, $guest;
1794            }
1795    
1796            while(my $link = shift @lines) {
1797                    push @{$self->{inform}->{links}}, $link;
1798            }
1799    
1800          ( $self->{name}, $self->{label}, $self->{dnum}, $self->{wnum}, $self->{size} ) =          return $resbody;
                 split(/\t/, $resbody, 5);  
1801    
1802  }  }
1803    

Legend:
Removed from v.81  
changed lines
  Added in v.122

  ViewVC Help
Powered by ViewVC 1.1.26