/[Search-Estraier]/trunk/lib/Search/Estraier.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/lib/Search/Estraier.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 59 by dpavlin, Fri Jan 6 23:29:58 2006 UTC revision 108 by dpavlin, Sun Feb 19 17:13:57 2006 UTC
# Line 4  use 5.008; Line 4  use 5.008;
4  use strict;  use strict;
5  use warnings;  use warnings;
6    
7  our $VERSION = '0.01';  our $VERSION = '0.04_2';
8    
9  =head1 NAME  =head1 NAME
10    
# Line 12  Search::Estraier - pure perl module to u Line 12  Search::Estraier - pure perl module to u
12    
13  =head1 SYNOPSIS  =head1 SYNOPSIS
14    
15    use Search::Estraier;  =head2 Simple indexer
16    my $est = new Search::Estraier();  
17            use Search::Estraier;
18    
19            # create and configure node
20            my $node = new Search::Estraier::Node(
21                    url => 'http://localhost:1978/node/test',
22                    user => 'admin',
23                    passwd => 'admin'
24            );
25    
26            # create document
27            my $doc = new Search::Estraier::Document;
28    
29            # add attributes
30            $doc->add_attr('@uri', "http://estraier.gov/example.txt");
31            $doc->add_attr('@title', "Over the Rainbow");
32    
33            # add body text to document
34            $doc->add_text("Somewhere over the rainbow.  Way up high.");
35            $doc->add_text("There's a land that I heard of once in a lullaby.");
36    
37            die "error: ", $node->status,"\n" unless (eval { $node->put_doc($doc) });
38    
39    =head2 Simple searcher
40    
41            use Search::Estraier;
42    
43            # create and configure node
44            my $node = new Search::Estraier::Node(
45                    url => 'http://localhost:1978/node/test',
46                    user => 'admin',
47                    passwd => 'admin',
48                    croak_on_error => 1,
49            );
50    
51            # create condition
52            my $cond = new Search::Estraier::Condition;
53    
54            # set search phrase
55            $cond->set_phrase("rainbow AND lullaby");
56    
57            my $nres = $node->search($cond, 0);
58    
59            if (defined($nres)) {
60                    print "Got ", $nres->hits, " results\n";
61    
62                    # for each document in results
63                    for my $i ( 0 ... $nres->doc_num - 1 ) {
64                            # get result document
65                            my $rdoc = $nres->get_doc($i);
66                            # display attribte
67                            print "URI: ", $rdoc->attr('@uri'),"\n";
68                            print "Title: ", $rdoc->attr('@title'),"\n";
69                            print $rdoc->snippet,"\n";
70                    }
71            } else {
72                    die "error: ", $node->status,"\n";
73            }
74    
75  =head1 DESCRIPTION  =head1 DESCRIPTION
76    
# Line 25  or Hyper Estraier development files on t Line 82  or Hyper Estraier development files on t
82  It is implemented as multiple packages which closly resamble Ruby  It is implemented as multiple packages which closly resamble Ruby
83  implementation. It also includes methods to manage nodes.  implementation. It also includes methods to manage nodes.
84    
85    There are few examples in C<scripts> directory of this distribution.
86    
87  =cut  =cut
88    
89  =head1 Inheritable common methods  =head1 Inheritable common methods
# Line 41  Remove multiple whitespaces from string, Line 100  Remove multiple whitespaces from string,
100  =cut  =cut
101    
102  sub _s {  sub _s {
103          my $text = $_[1] || return;          my $text = $_[1];
104            return unless defined($text);
105          $text =~ s/\s\s+/ /gs;          $text =~ s/\s\s+/ /gs;
106          $text =~ s/^\s+//;          $text =~ s/^\s+//;
107          $text =~ s/\s+$//;          $text =~ s/\s+$//;
# Line 106  sub new { Line 166  sub new {
166                          } elsif ($line =~ m/^$/) {                          } elsif ($line =~ m/^$/) {
167                                  $in_text = 1;                                  $in_text = 1;
168                                  next;                                  next;
169                          } elsif ($line =~ m/^(.+)=(.+)$/) {                          } elsif ($line =~ m/^(.+)=(.*)$/) {
170                                  $self->{attrs}->{ $1 } = $2;                                  $self->{attrs}->{ $1 } = $2;
171                                  next;                                  next;
172                          }                          }
173    
174                          warn "draft ignored: $line\n";                          warn "draft ignored: '$line'\n";
175                  }                  }
176          }          }
177    
# Line 205  Returns array with attribute names from Line 265  Returns array with attribute names from
265    
266  sub attr_names {  sub attr_names {
267          my $self = shift;          my $self = shift;
268          croak "attr_names return array, not scalar" if (! wantarray);          return unless ($self->{attrs});
269            #croak "attr_names return array, not scalar" if (! wantarray);
270          return sort keys %{ $self->{attrs} };          return sort keys %{ $self->{attrs} };
271  }  }
272    
# Line 221  Returns value of an attribute. Line 282  Returns value of an attribute.
282  sub attr {  sub attr {
283          my $self = shift;          my $self = shift;
284          my $name = shift;          my $name = shift;
285            return unless (defined($name) && $self->{attrs});
286          return $self->{'attrs'}->{ $name };          return $self->{attrs}->{ $name };
287  }  }
288    
289    
# Line 236  Returns array with text sentences. Line 297  Returns array with text sentences.
297    
298  sub texts {  sub texts {
299          my $self = shift;          my $self = shift;
300          confess "texts return array, not scalar" if (! wantarray);          #confess "texts return array, not scalar" if (! wantarray);
301          return @{ $self->{dtexts} };          return @{ $self->{dtexts} } if ($self->{dtexts});
302  }  }
303    
304    
# Line 251  Return whole text as single scalar. Line 312  Return whole text as single scalar.
312    
313  sub cat_texts {  sub cat_texts {
314          my $self = shift;          my $self = shift;
315          return join(' ',@{ $self->{dtexts} });          return join(' ',@{ $self->{dtexts} }) if ($self->{dtexts});
316  }  }
317    
318    
# Line 268  sub dump_draft { Line 329  sub dump_draft {
329          my $draft;          my $draft;
330    
331          foreach my $attr_name (sort keys %{ $self->{attrs} }) {          foreach my $attr_name (sort keys %{ $self->{attrs} }) {
332                  $draft .= $attr_name . '=' . $self->{attrs}->{$attr_name} . "\n";                  next unless defined(my $v = $self->{attrs}->{$attr_name});
333                    $draft .= $attr_name . '=' . $v . "\n";
334          }          }
335    
336          if ($self->{kwords}) {          if ($self->{kwords}) {
# Line 316  sub delete { Line 378  sub delete {
378    
379  package Search::Estraier::Condition;  package Search::Estraier::Condition;
380    
381  use Carp qw/confess croak/;  use Carp qw/carp confess croak/;
382    
383  use Search::Estraier;  use Search::Estraier;
384  our @ISA = qw/Search::Estraier/;  our @ISA = qw/Search::Estraier/;
# Line 394  sub set_max { Line 456  sub set_max {
456    
457  =head2 set_options  =head2 set_options
458    
459    $cond->set_options( SURE => 1 );    $cond->set_options( 'SURE' );
460    
461      $cond->set_options( qw/AGITO NOIDF SIMPLE/ );
462    
463    Possible options are:
464    
465    =over 8
466    
467    =item SURE
468    
469    check every N-gram
470    
471    =item USUAL
472    
473    check every second N-gram
474    
475    =item FAST
476    
477    check every third N-gram
478    
479    =item AGITO
480    
481    check every fourth N-gram
482    
483    =item NOIDF
484    
485    don't perform TF-IDF tuning
486    
487    =item SIMPLE
488    
489    use simplified query phrase
490    
491    =back
492    
493    Skipping N-grams will speed up search, but reduce accuracy. Every call to C<set_options> will reset previous
494    options;
495    
496    This option changed in version C<0.04> of this module. It's backwards compatibile.
497    
498  =cut  =cut
499    
500  my $options = {  my $options = {
         # check N-gram keys skipping by three  
501          SURE => 1 << 0,          SURE => 1 << 0,
         # check N-gram keys skipping by two  
502          USUAL => 1 << 1,          USUAL => 1 << 1,
         # without TF-IDF tuning  
503          FAST => 1 << 2,          FAST => 1 << 2,
         # with the simplified phrase  
504          AGITO => 1 << 3,          AGITO => 1 << 3,
         # check every N-gram key  
505          NOIDF => 1 << 4,          NOIDF => 1 << 4,
         # check N-gram keys skipping by one  
506          SIMPLE => 1 << 10,          SIMPLE => 1 << 10,
507  };  };
508    
509  sub set_options {  sub set_options {
510          my $self = shift;          my $self = shift;
511          my $option = shift;          my $opt = 0;
512          confess "unknown option" unless ($options->{$option});          foreach my $option (@_) {
513          $self->{options} ||= $options->{$option};                  my $mask;
514                    unless ($mask = $options->{$option}) {
515                            if ($option eq '1') {
516                                    next;
517                            } else {
518                                    croak "unknown option $option";
519                            }
520                    }
521                    $opt += $mask;
522            }
523            $self->{options} = $opt;
524  }  }
525    
526    
# Line 460  Return search result attrs. Line 563  Return search result attrs.
563  sub attrs {  sub attrs {
564          my $self = shift;          my $self = shift;
565          #croak "attrs return array, not scalar" if (! wantarray);          #croak "attrs return array, not scalar" if (! wantarray);
566          return @{ $self->{attrs} };          return @{ $self->{attrs} } if ($self->{attrs});
567  }  }
568    
569    
# Line 524  sub new { Line 627  sub new {
627          my $self = {@_};          my $self = {@_};
628          bless($self, $class);          bless($self, $class);
629    
630          foreach my $f (qw/uri attrs snippet keywords/) {          croak "missing uri for ResultDocument" unless defined($self->{uri});
                 croak "missing $f for ResultDocument" unless defined($self->{$f});  
         }  
631    
632          $self ? return $self : return undef;          $self ? return $self : return undef;
633  }  }
# Line 641  Return number of documents Line 742  Return number of documents
742    
743    print $res->doc_num;    print $res->doc_num;
744    
745    This will return real number of documents (limited by C<max>).
746    If you want to get total number of hits, see C<hits>.
747    
748  =cut  =cut
749    
750  sub doc_num {  sub doc_num {
# Line 672  sub get_doc { Line 776  sub get_doc {
776    
777  Return specific hint from results.  Return specific hint from results.
778    
779    print $rec->hint( 'VERSION' );    print $res->hint( 'VERSION' );
780    
781  Possible hints are: C<VERSION>, C<NODE>, C<HIT>, C<HINT#n>, C<DOCNUM>, C<WORDNUM>,  Possible hints are: C<VERSION>, C<NODE>, C<HIT>, C<HINT#n>, C<DOCNUM>, C<WORDNUM>,
782  C<TIME>, C<LINK#n>, C<VIEW>.  C<TIME>, C<LINK#n>, C<VIEW>.
# Line 685  sub hint { Line 789  sub hint {
789          return $self->{hints}->{$key};          return $self->{hints}->{$key};
790  }  }
791    
792    =head2 hints
793    
794    More perlish version of C<hint>. This one returns hash.
795    
796      my %hints = $res->hints;
797    
798    =cut
799    
800    sub hints {
801            my $self = shift;
802            return $self->{hints};
803    }
804    
805    =head2 hits
806    
807    Syntaxtic sugar for total number of hits for this query
808    
809      print $res->hits;
810    
811    It's same as
812    
813      print $res->hint('HIT');
814    
815    but shorter.
816    
817    =cut
818    
819    sub hits {
820            my $self = shift;
821            return $self->{hints}->{'HIT'} || 0;
822    }
823    
824  package Search::Estraier::Node;  package Search::Estraier::Node;
825    
# Line 700  use URI::Escape qw/uri_escape/; Line 835  use URI::Escape qw/uri_escape/;
835    
836    my $node = new Search::HyperEstraier::Node;    my $node = new Search::HyperEstraier::Node;
837    
838    or optionally with C<url> as parametar
839    
840      my $node = new Search::HyperEstraier::Node( 'http://localhost:1978/node/test' );
841    
842    or in more verbose form
843    
844      my $node = new Search::HyperEstraier::Node(
845            url => 'http://localhost:1978/node/test',
846            debug => 1,
847            croak_on_error => 1
848      );
849    
850    with following arguments:
851    
852    =over 4
853    
854    =item url
855    
856    URL to node
857    
858    =item debug
859    
860    dumps a B<lot> of debugging output
861    
862    =item croak_on_error
863    
864    very helpful during development. It will croak on all errors instead of
865    silently returning C<-1> (which is convention of Hyper Estraier API in other
866    languages).
867    
868    =back
869    
870  =cut  =cut
871    
872  sub new {  sub new {
# Line 717  sub new { Line 884  sub new {
884          };          };
885          bless($self, $class);          bless($self, $class);
886    
887          my $args = {@_};          if ($#_ == 0) {
888                    $self->{url} = shift;
889            } else {
890                    my $args = {@_};
891    
892          $self->{debug} = $args->{debug};                  %$self = ( %$self, @_ );
893          warn "## Node debug on\n" if ($self->{debug});  
894                    warn "## Node debug on\n" if ($self->{debug});
895            }
896    
897          $self ? return $self : return undef;          $self ? return $self : return undef;
898  }  }
# Line 1000  Get ID of document specified by URI Line 1172  Get ID of document specified by URI
1172    
1173    my $id = $node->uri_to_id( 'file:///document/uri/42' );    my $id = $node->uri_to_id( 'file:///document/uri/42' );
1174    
1175    This method won't croak, even if using C<croak_on_error>.
1176    
1177  =cut  =cut
1178    
1179  sub uri_to_id {  sub uri_to_id {
1180          my $self = shift;          my $self = shift;
1181          my $uri = shift || return;          my $uri = shift || return;
1182          return $self->_fetch_doc( uri => $uri, path => '/uri_to_id', chomp_resbody => 1 );          return $self->_fetch_doc( uri => $uri, path => '/uri_to_id', chomp_resbody => 1, croak_on_error => 0 );
1183  }  }
1184    
1185    
# Line 1065  sub _fetch_doc { Line 1239  sub _fetch_doc {
1239                  'application/x-www-form-urlencoded',                  'application/x-www-form-urlencoded',
1240                  $arg,                  $arg,
1241                  \$resbody,                  \$resbody,
1242                    $a->{croak_on_error},
1243          );          );
1244    
1245          return if ($rv != 200);          return if ($rv != 200);
# Line 1176  sub search { Line 1351  sub search {
1351    
1352          my $rv = $self->shuttle_url( $self->{url} . '/search',          my $rv = $self->shuttle_url( $self->{url} . '/search',
1353                  'application/x-www-form-urlencoded',                  'application/x-www-form-urlencoded',
1354                  $self->cond_to_query( $cond ),                  $self->cond_to_query( $cond, $depth ),
1355                  \$resbody,                  \$resbody,
1356          );          );
1357          return if ($rv != 200);          return if ($rv != 200);
# Line 1270  sub search { Line 1445  sub search {
1445    
1446  Return URI encoded string generated from Search::Estraier::Condition  Return URI encoded string generated from Search::Estraier::Condition
1447    
1448    my $args = $node->cond_to_query( $cond );    my $args = $node->cond_to_query( $cond, $depth );
1449    
1450  =cut  =cut
1451    
# Line 1279  sub cond_to_query { Line 1454  sub cond_to_query {
1454    
1455          my $cond = shift || return;          my $cond = shift || return;
1456          croak "condition must be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));          croak "condition must be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1457            my $depth = shift;
1458    
1459          my @args;          my @args;
1460    
# Line 1288  sub cond_to_query { Line 1464  sub cond_to_query {
1464    
1465          if (my @attrs = $cond->attrs) {          if (my @attrs = $cond->attrs) {
1466                  for my $i ( 0 .. $#attrs ) {                  for my $i ( 0 .. $#attrs ) {
1467                          push @args,'attr' . ($i+1) . '=' . uri_escape( $attrs[$i] );                          push @args,'attr' . ($i+1) . '=' . uri_escape( $attrs[$i] ) if ($attrs[$i]);
1468                  }                  }
1469          }          }
1470    
# Line 1306  sub cond_to_query { Line 1482  sub cond_to_query {
1482                  push @args, 'options=' . $options;                  push @args, 'options=' . $options;
1483          }          }
1484    
1485          push @args, 'depth=' . $self->{depth} if ($self->{depth});          push @args, 'depth=' . $depth if ($depth);
1486          push @args, 'wwidth=' . $self->{wwidth};          push @args, 'wwidth=' . $self->{wwidth};
1487          push @args, 'hwidth=' . $self->{hwidth};          push @args, 'hwidth=' . $self->{hwidth};
1488          push @args, 'awidth=' . $self->{awidth};          push @args, 'awidth=' . $self->{awidth};
# Line 1317  sub cond_to_query { Line 1493  sub cond_to_query {
1493    
1494  =head2 shuttle_url  =head2 shuttle_url
1495    
1496  This is method which uses C<IO::Socket::INET> to communicate with Hyper Estraier node  This is method which uses C<LWP::UserAgent> to communicate with Hyper Estraier node
1497  master.  master.
1498    
1499    my $rv = shuttle_url( $url, $content_type, $req_body, \$resbody );    my $rv = shuttle_url( $url, $content_type, $req_body, \$resbody );
# Line 1332  use LWP::UserAgent; Line 1508  use LWP::UserAgent;
1508  sub shuttle_url {  sub shuttle_url {
1509          my $self = shift;          my $self = shift;
1510    
1511          my ($url, $content_type, $reqbody, $resbody) = @_;          my ($url, $content_type, $reqbody, $resbody, $croak_on_error) = @_;
1512    
1513            $croak_on_error = $self->{croak_on_error} unless defined($croak_on_error);
1514    
1515          $self->{status} = -1;          $self->{status} = -1;
1516    
# Line 1359  sub shuttle_url { Line 1537  sub shuttle_url {
1537    
1538          $req->headers->header( 'Host' => $url->host . ":" . $url->port );          $req->headers->header( 'Host' => $url->host . ":" . $url->port );
1539          $req->headers->header( 'Connection', 'close' );          $req->headers->header( 'Connection', 'close' );
1540          $req->headers->header( 'Authorization', 'Basic ' . $self->{auth} );          $req->headers->header( 'Authorization', 'Basic ' . $self->{auth} ) if ($self->{auth});
1541          $req->content_type( $content_type );          $req->content_type( $content_type );
1542    
1543          warn $req->headers->as_string,"\n" if ($self->{debug});          warn $req->headers->as_string,"\n" if ($self->{debug});
# Line 1373  sub shuttle_url { Line 1551  sub shuttle_url {
1551    
1552          warn "## response status: ",$res->status_line,"\n" if ($self->{debug});          warn "## response status: ",$res->status_line,"\n" if ($self->{debug});
1553    
         return -1 if (! $res->is_success);  
   
1554          ($self->{status}, $self->{status_message}) = split(/\s+/, $res->status_line, 2);          ($self->{status}, $self->{status_message}) = split(/\s+/, $res->status_line, 2);
1555    
1556            if (! $res->is_success) {
1557                    if ($croak_on_error) {
1558                            croak("can't get $url: ",$res->status_line);
1559                    } else {
1560                            return -1;
1561                    }
1562            }
1563    
1564          $$resbody .= $res->content;          $$resbody .= $res->content;
1565    
1566          warn "## response body:\n$$resbody\n" if ($resbody && $self->{debug});          warn "## response body:\n$$resbody\n" if ($resbody && $self->{debug});
# Line 1475  sub set_link { Line 1659  sub set_link {
1659          my $reqbody = 'url=' . uri_escape($url) . '&label=' . uri_escape($label);          my $reqbody = 'url=' . uri_escape($url) . '&label=' . uri_escape($label);
1660          $reqbody .= '&credit=' . $credit if ($credit > 0);          $reqbody .= '&credit=' . $credit if ($credit > 0);
1661    
1662          $self->shuttle_url( $self->{url} . '/_set_link',          if ($self->shuttle_url( $self->{url} . '/_set_link',
1663                  'text/plain',                  'application/x-www-form-urlencoded',
1664                  $reqbody,                  $reqbody,
1665                  undef                  undef
1666          ) == 200;          ) == 200) {
1667                    # refresh node info after adding link
1668                    $self->_set_info;
1669                    return 1;
1670            }
1671    }
1672    
1673    =head2 admins
1674    
1675     my @admins = @{ $node->admins };
1676    
1677    Return array of users with admin rights on node
1678    
1679    =cut
1680    
1681    sub admins {
1682            my $self = shift;
1683            $self->_set_info unless ($self->{name});
1684            return $self->{admins};
1685    }
1686    
1687    =head2 guests
1688    
1689     my @guests = @{ $node->guests };
1690    
1691    Return array of users with guest rights on node
1692    
1693    =cut
1694    
1695    sub guests {
1696            my $self = shift;
1697            $self->_set_info unless ($self->{name});
1698            return $self->{guests};
1699    }
1700    
1701    =head2 links
1702    
1703     my $links = @{ $node->links };
1704    
1705    Return array of links for this node
1706    
1707    =cut
1708    
1709    sub links {
1710            my $self = shift;
1711            $self->_set_info unless ($self->{name});
1712            return $self->{links};
1713  }  }
1714    
1715    
# Line 1510  sub _set_info { Line 1740  sub _set_info {
1740    
1741          return if ($rv != 200 || !$resbody);          return if ($rv != 200 || !$resbody);
1742    
1743          # it seems that response can have multiple line endings          my @lines = split(/[\r\n]/,$resbody);
1744          $resbody =~ s/[\r\n]+$//;          
   
1745          ( $self->{name}, $self->{label}, $self->{dnum}, $self->{wnum}, $self->{size} ) =          ( $self->{name}, $self->{label}, $self->{dnum}, $self->{wnum}, $self->{size} ) =
1746                  split(/\t/, $resbody, 5);                  split(/\t/, shift @lines, 5);
1747    
1748            return $resbody unless (@lines);
1749    
1750            shift @lines;
1751    
1752            while(my $admin = shift @lines) {
1753                    push @{$self->{admins}}, $admin;
1754            }
1755            
1756            while(my $guest = shift @lines) {
1757                    push @{$self->{guests}}, $guest;
1758            }
1759    
1760            while(my $link = shift @lines) {
1761                    push @{$self->{links}}, $link;
1762            }
1763    
1764            return $resbody;
1765    
1766  }  }
1767    

Legend:
Removed from v.59  
changed lines
  Added in v.108

  ViewVC Help
Powered by ViewVC 1.1.26