/[Search-Estraier]/trunk/lib/Search/Estraier.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/lib/Search/Estraier.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 53 by dpavlin, Fri Jan 6 14:39:45 2006 UTC revision 97 by dpavlin, Sat Jan 28 18:19:47 2006 UTC
# Line 4  use 5.008; Line 4  use 5.008;
4  use strict;  use strict;
5  use warnings;  use warnings;
6    
7  our $VERSION = '0.00';  our $VERSION = '0.04_1';
8    
9  =head1 NAME  =head1 NAME
10    
# Line 12  Search::Estraier - pure perl module to u Line 12  Search::Estraier - pure perl module to u
12    
13  =head1 SYNOPSIS  =head1 SYNOPSIS
14    
15    use Search::Estraier;  =head2 Simple indexer
16    my $est = new Search::Estraier();  
17            use Search::Estraier;
18    
19            # create and configure node
20            my $node = new Search::Estraier::Node;
21            $node->set_url("http://localhost:1978/node/test");
22            $node->set_auth("admin","admin");
23    
24            # create document
25            my $doc = new Search::Estraier::Document;
26    
27            # add attributes
28            $doc->add_attr('@uri', "http://estraier.gov/example.txt");
29            $doc->add_attr('@title', "Over the Rainbow");
30    
31            # add body text to document
32            $doc->add_text("Somewhere over the rainbow.  Way up high.");
33            $doc->add_text("There's a land that I heard of once in a lullaby.");
34    
35            die "error: ", $node->status,"\n" unless ($node->put_doc($doc));
36    
37    =head2 Simple searcher
38    
39            use Search::Estraier;
40    
41            # create and configure node
42            my $node = new Search::Estraier::Node;
43            $node->set_url("http://localhost:1978/node/test");
44            $node->set_auth("admin","admin");
45    
46            # create condition
47            my $cond = new Search::Estraier::Condition;
48    
49            # set search phrase
50            $cond->set_phrase("rainbow AND lullaby");
51    
52            my $nres = $node->search($cond, 0);
53            if (defined($nres)) {
54                    # for each document in results
55                    for my $i ( 0 ... $nres->doc_num - 1 ) {
56                            # get result document
57                            my $rdoc = $nres->get_doc($i);
58                            # display attribte
59                            print "URI: ", $rdoc->attr('@uri'),"\n";
60                            print "Title: ", $rdoc->attr('@title'),"\n";
61                            print $rdoc->snippet,"\n";
62                    }
63            } else {
64                    die "error: ", $node->status,"\n";
65            }
66    
67  =head1 DESCRIPTION  =head1 DESCRIPTION
68    
# Line 25  or Hyper Estraier development files on t Line 74  or Hyper Estraier development files on t
74  It is implemented as multiple packages which closly resamble Ruby  It is implemented as multiple packages which closly resamble Ruby
75  implementation. It also includes methods to manage nodes.  implementation. It also includes methods to manage nodes.
76    
77    There are few examples in C<scripts> directory of this distribution.
78    
79  =cut  =cut
80    
81  =head1 Inheritable common methods  =head1 Inheritable common methods
# Line 41  Remove multiple whitespaces from string, Line 92  Remove multiple whitespaces from string,
92  =cut  =cut
93    
94  sub _s {  sub _s {
95          my $text = $_[1] || return;          my $text = $_[1];
96            return unless defined($text);
97          $text =~ s/\s\s+/ /gs;          $text =~ s/\s\s+/ /gs;
98          $text =~ s/^\s+//;          $text =~ s/^\s+//;
99          $text =~ s/\s+$//;          $text =~ s/\s+$//;
# Line 106  sub new { Line 158  sub new {
158                          } elsif ($line =~ m/^$/) {                          } elsif ($line =~ m/^$/) {
159                                  $in_text = 1;                                  $in_text = 1;
160                                  next;                                  next;
161                          } elsif ($line =~ m/^(.+)=(.+)$/) {                          } elsif ($line =~ m/^(.+)=(.*)$/) {
162                                  $self->{attrs}->{ $1 } = $2;                                  $self->{attrs}->{ $1 } = $2;
163                                  next;                                  next;
164                          }                          }
165    
166                          warn "draft ignored: $line\n";                          warn "draft ignored: '$line'\n";
167                  }                  }
168          }          }
169    
# Line 205  Returns array with attribute names from Line 257  Returns array with attribute names from
257    
258  sub attr_names {  sub attr_names {
259          my $self = shift;          my $self = shift;
260          croak "attr_names return array, not scalar" if (! wantarray);          return unless ($self->{attrs});
261            #croak "attr_names return array, not scalar" if (! wantarray);
262          return sort keys %{ $self->{attrs} };          return sort keys %{ $self->{attrs} };
263  }  }
264    
# Line 221  Returns value of an attribute. Line 274  Returns value of an attribute.
274  sub attr {  sub attr {
275          my $self = shift;          my $self = shift;
276          my $name = shift;          my $name = shift;
277            return unless (defined($name) && $self->{attrs});
278          return $self->{'attrs'}->{ $name };          return $self->{attrs}->{ $name };
279  }  }
280    
281    
# Line 236  Returns array with text sentences. Line 289  Returns array with text sentences.
289    
290  sub texts {  sub texts {
291          my $self = shift;          my $self = shift;
292          confess "texts return array, not scalar" if (! wantarray);          #confess "texts return array, not scalar" if (! wantarray);
293          return @{ $self->{dtexts} };          return @{ $self->{dtexts} } if ($self->{dtexts});
294  }  }
295    
296    
# Line 251  Return whole text as single scalar. Line 304  Return whole text as single scalar.
304    
305  sub cat_texts {  sub cat_texts {
306          my $self = shift;          my $self = shift;
307          return join(' ',@{ $self->{dtexts} });          return join(' ',@{ $self->{dtexts} }) if ($self->{dtexts});
308  }  }
309    
310    
# Line 268  sub dump_draft { Line 321  sub dump_draft {
321          my $draft;          my $draft;
322    
323          foreach my $attr_name (sort keys %{ $self->{attrs} }) {          foreach my $attr_name (sort keys %{ $self->{attrs} }) {
324                  $draft .= $attr_name . '=' . $self->{attrs}->{$attr_name} . "\n";                  next unless defined(my $v = $self->{attrs}->{$attr_name});
325                    $draft .= $attr_name . '=' . $v . "\n";
326          }          }
327    
328          if ($self->{kwords}) {          if ($self->{kwords}) {
# Line 460  Return search result attrs. Line 514  Return search result attrs.
514  sub attrs {  sub attrs {
515          my $self = shift;          my $self = shift;
516          #croak "attrs return array, not scalar" if (! wantarray);          #croak "attrs return array, not scalar" if (! wantarray);
517          return @{ $self->{attrs} };          return @{ $self->{attrs} } if ($self->{attrs});
518  }  }
519    
520    
# Line 524  sub new { Line 578  sub new {
578          my $self = {@_};          my $self = {@_};
579          bless($self, $class);          bless($self, $class);
580    
581          foreach my $f (qw/uri attrs snippet keywords/) {          croak "missing uri for ResultDocument" unless defined($self->{uri});
                 croak "missing $f for ResultDocument" unless defined($self->{$f});  
         }  
582    
583          $self ? return $self : return undef;          $self ? return $self : return undef;
584  }  }
# Line 685  sub hint { Line 737  sub hint {
737          return $self->{hints}->{$key};          return $self->{hints}->{$key};
738  }  }
739    
740    =head2 hints
741    
742    More perlish version of C<hint>. This one returns hash.
743    
744      my %hints = $rec->hints;
745    
746    =cut
747    
748    sub hints {
749            my $self = shift;
750            return $self->{hints};
751    }
752    
753  package Search::Estraier::Node;  package Search::Estraier::Node;
754    
# Line 700  use URI::Escape qw/uri_escape/; Line 764  use URI::Escape qw/uri_escape/;
764    
765    my $node = new Search::HyperEstraier::Node;    my $node = new Search::HyperEstraier::Node;
766    
767    or optionally with C<url> as parametar
768    
769      my $node = new Search::HyperEstraier::Node( 'http://localhost:1978/node/test' );
770    
771    or in more verbose form
772    
773      my $node = new Search::HyperEstraier::Node(
774            url => 'http://localhost:1978/node/test',
775            debug => 1,
776            croak_on_error => 1
777      );
778    
779    with following arguments:
780    
781    =over 4
782    
783    =item url
784    
785    URL to node
786    
787    =item debug
788    
789    dumps a B<lot> of debugging output
790    
791    =item croak_on_error
792    
793    very helpful during development. It will croak on all errors instead of
794    silently returning C<-1> (which is convention of Hyper Estraier API in other
795    languages).
796    
797    =back
798    
799  =cut  =cut
800    
801  sub new {  sub new {
# Line 717  sub new { Line 813  sub new {
813          };          };
814          bless($self, $class);          bless($self, $class);
815    
816          if (@_) {          if ($#_ == 0) {
817                  $self->{debug} = shift;                  $self->{url} = shift;
818                  warn "## Node debug on\n";          } else {
819                    my $args = {@_};
820    
821                    %$self = ( %$self, @_ );
822    
823                    warn "## Node debug on\n" if ($self->{debug});
824          }          }
825    
826          $self ? return $self : return undef;          $self ? return $self : return undef;
# Line 1095  sub _fetch_doc { Line 1196  sub _fetch_doc {
1196    
1197  sub name {  sub name {
1198          my $self = shift;          my $self = shift;
1199          $self->set_info unless ($self->{name});          $self->_set_info unless ($self->{name});
1200          return $self->{name};          return $self->{name};
1201  }  }
1202    
# Line 1108  sub name { Line 1209  sub name {
1209    
1210  sub label {  sub label {
1211          my $self = shift;          my $self = shift;
1212          $self->set_info unless ($self->{label});          $self->_set_info unless ($self->{label});
1213          return $self->{label};          return $self->{label};
1214  }  }
1215    
# Line 1121  sub label { Line 1222  sub label {
1222    
1223  sub doc_num {  sub doc_num {
1224          my $self = shift;          my $self = shift;
1225          $self->set_info if ($self->{dnum} < 0);          $self->_set_info if ($self->{dnum} < 0);
1226          return $self->{dnum};          return $self->{dnum};
1227  }  }
1228    
# Line 1134  sub doc_num { Line 1235  sub doc_num {
1235    
1236  sub word_num {  sub word_num {
1237          my $self = shift;          my $self = shift;
1238          $self->set_info if ($self->{wnum} < 0);          $self->_set_info if ($self->{wnum} < 0);
1239          return $self->{wnum};          return $self->{wnum};
1240  }  }
1241    
# Line 1147  sub word_num { Line 1248  sub word_num {
1248    
1249  sub size {  sub size {
1250          my $self = shift;          my $self = shift;
1251          $self->set_info if ($self->{size} < 0);          $self->_set_info if ($self->{size} < 0);
1252          return $self->{size};          return $self->{size};
1253  }  }
1254    
# Line 1176  sub search { Line 1277  sub search {
1277    
1278          my $rv = $self->shuttle_url( $self->{url} . '/search',          my $rv = $self->shuttle_url( $self->{url} . '/search',
1279                  'application/x-www-form-urlencoded',                  'application/x-www-form-urlencoded',
1280                  $self->cond_to_query( $cond ),                  $self->cond_to_query( $cond, $depth ),
1281                  \$resbody,                  \$resbody,
1282          );          );
1283          return if ($rv != 200);          return if ($rv != 200);
# Line 1268  sub search { Line 1369  sub search {
1369    
1370  =head2 cond_to_query  =head2 cond_to_query
1371    
1372    my $args = $node->cond_to_query( $cond );  Return URI encoded string generated from Search::Estraier::Condition
1373    
1374      my $args = $node->cond_to_query( $cond, $depth );
1375    
1376  =cut  =cut
1377    
# Line 1277  sub cond_to_query { Line 1380  sub cond_to_query {
1380    
1381          my $cond = shift || return;          my $cond = shift || return;
1382          croak "condition must be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));          croak "condition must be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1383            my $depth = shift;
1384    
1385          my @args;          my @args;
1386    
# Line 1286  sub cond_to_query { Line 1390  sub cond_to_query {
1390    
1391          if (my @attrs = $cond->attrs) {          if (my @attrs = $cond->attrs) {
1392                  for my $i ( 0 .. $#attrs ) {                  for my $i ( 0 .. $#attrs ) {
1393                          push @args,'attr' . ($i+1) . '=' . uri_escape( $attrs[$i] );                          push @args,'attr' . ($i+1) . '=' . uri_escape( $attrs[$i] ) if ($attrs[$i]);
1394                  }                  }
1395          }          }
1396    
# Line 1304  sub cond_to_query { Line 1408  sub cond_to_query {
1408                  push @args, 'options=' . $options;                  push @args, 'options=' . $options;
1409          }          }
1410    
1411          push @args, 'depth=' . $self->{depth} if ($self->{depth});          push @args, 'depth=' . $depth if ($depth);
1412          push @args, 'wwidth=' . $self->{wwidth};          push @args, 'wwidth=' . $self->{wwidth};
1413          push @args, 'hwidth=' . $self->{hwidth};          push @args, 'hwidth=' . $self->{hwidth};
1414          push @args, 'awidth=' . $self->{awidth};          push @args, 'awidth=' . $self->{awidth};
# Line 1315  sub cond_to_query { Line 1419  sub cond_to_query {
1419    
1420  =head2 shuttle_url  =head2 shuttle_url
1421    
1422  This is method which uses C<IO::Socket::INET> to communicate with Hyper Estraier node  This is method which uses C<LWP::UserAgent> to communicate with Hyper Estraier node
1423  master.  master.
1424    
1425    my $rv = shuttle_url( $url, $content_type, $req_body, \$resbody );    my $rv = shuttle_url( $url, $content_type, $req_body, \$resbody );
# Line 1325  body will be saved within object. Line 1429  body will be saved within object.
1429    
1430  =cut  =cut
1431    
1432    use LWP::UserAgent;
1433    
1434  sub shuttle_url {  sub shuttle_url {
1435          my $self = shift;          my $self = shift;
1436    
# Line 1343  sub shuttle_url { Line 1449  sub shuttle_url {
1449                  return -1;                  return -1;
1450          }          }
1451    
1452          my ($host,$port,$query) = ($url->host, $url->port, $url->path);          my $ua = LWP::UserAgent->new;
1453            $ua->agent( "Search-Estraier/$Search::Estraier::VERSION" );
1454    
1455          if ($self->{pxhost}) {          my $req;
1456                  ($host,$port) = ($self->{pxhost}, $self->{pxport});          if ($reqbody) {
1457                  $query = "http://$host:$port/$query";                  $req = HTTP::Request->new(POST => $url);
1458            } else {
1459                    $req = HTTP::Request->new(GET => $url);
1460          }          }
1461    
1462          $query .= '?' . $url->query if ($url->query && ! $reqbody);          $req->headers->header( 'Host' => $url->host . ":" . $url->port );
1463            $req->headers->header( 'Connection', 'close' );
1464            $req->headers->header( 'Authorization', 'Basic ' . $self->{auth} ) if ($self->{auth});
1465            $req->content_type( $content_type );
1466    
1467          my $headers;          warn $req->headers->as_string,"\n" if ($self->{debug});
1468    
1469          if ($reqbody) {          if ($reqbody) {
1470                  $headers .= "POST $query HTTP/1.0\r\n";                  warn "$reqbody\n" if ($self->{debug});
1471          } else {                  $req->content( $reqbody );
                 $headers .= "GET $query HTTP/1.0\r\n";  
1472          }          }
1473    
1474          $headers .= "Host: " . $url->host . ":" . $url->port . "\r\n";          my $res = $ua->request($req) || croak "can't make request to $url: $!";
         $headers .= "Connection: close\r\n";  
         $headers .= "User-Agent: Search-Estraier/$Search::Estraier::VERSION\r\n";  
         $headers .= "Content-Type: $content_type\r\n";  
         $headers .= "Authorization: Basic $self->{auth}\r\n";  
         my $len = 0;  
         {  
                 use bytes;  
                 $len = length($reqbody) if ($reqbody);  
         }  
         $headers .= "Content-Length: $len\r\n";  
         $headers .= "\r\n";  
   
         my $sock = IO::Socket::INET->new(  
                 PeerAddr        => $host,  
                 PeerPort        => $port,  
                 Proto           => 'tcp',  
                 Timeout         => $self->{timeout} || 90,  
         );  
1475    
1476          if (! $sock) {          warn "## response status: ",$res->status_line,"\n" if ($self->{debug});
                 carp "can't open socket to $host:$port";  
                 return -1;  
         }  
1477    
1478          warn $headers if ($self->{debug});          ($self->{status}, $self->{status_message}) = split(/\s+/, $res->status_line, 2);
1479    
1480          print $sock $headers or          if (! $res->is_success) {
1481                  carp "can't send headers to network:\n$headers\n" and return -1;                  if ($self->{croak_on_error}) {
1482                            croak("can't get $url: ",$res->status_line);
1483          if ($reqbody) {                  } else {
1484                  warn "$reqbody\n" if ($self->{debug});                          return -1;
1485                  print $sock $reqbody or                  }
                         carp "can't send request body to network:\n$$reqbody\n" and return -1;  
1486          }          }
1487    
1488          my $line = <$sock>;          $$resbody .= $res->content;
         chomp($line);  
         my ($schema, $res_status, undef) = split(/  */, $line, 3);  
         return if ($schema !~ /^HTTP/ || ! $res_status);  
   
         $self->{status} = $res_status;  
         warn "## response status: $res_status\n" if ($self->{debug});  
   
         # skip rest of headers  
         $line = <$sock>;  
         while ($line) {  
                 $line = <$sock>;  
                 $line =~ s/[\r\n]+$//;  
                 warn "## ", $line || 'NULL', " ##\n" if ($self->{debug});  
         };  
   
         # read body  
         $len = 0;  
         do {  
                 $len = read($sock, my $buf, 8192);  
                 $$resbody .= $buf if ($resbody);  
         } while ($len);  
1489    
1490          warn "## response body:\n$$resbody\n" if ($resbody && $self->{debug});          warn "## response body:\n$$resbody\n" if ($resbody && $self->{debug});
1491    
# Line 1425  sub shuttle_url { Line 1493  sub shuttle_url {
1493  }  }
1494    
1495    
1496  =head2 set_info  =head2 set_snippet_width
1497    
1498    Set width of snippets in results
1499    
1500      $node->set_snippet_width( $wwidth, $hwidth, $awidth );
1501    
1502    C<$wwidth> specifies whole width of snippet. It's C<480> by default. If it's C<0> snippet
1503    is not sent with results. If it is negative, whole document text is sent instead of snippet.
1504    
1505    C<$hwidth> specified width of strings from beginning of string. Default
1506    value is C<96>. Negative or zero value keep previous value.
1507    
1508    C<$awidth> specifies width of strings around each highlighted word. It's C<96> by default.
1509    If negative of zero value is provided previous value is kept unchanged.
1510    
1511    =cut
1512    
1513    sub set_snippet_width {
1514            my $self = shift;
1515    
1516            my ($wwidth, $hwidth, $awidth) = @_;
1517            $self->{wwidth} = $wwidth;
1518            $self->{hwidth} = $hwidth if ($hwidth >= 0);
1519            $self->{awidth} = $awidth if ($awidth >= 0);
1520    }
1521    
1522    
1523    =head2 set_user
1524    
1525    Manage users of node
1526    
1527      $node->set_user( 'name', $mode );
1528    
1529    C<$mode> can be one of:
1530    
1531    =over 4
1532    
1533    =item 0
1534    
1535    delete account
1536    
1537    =item 1
1538    
1539    set administrative right for user
1540    
1541    =item 2
1542    
1543    set user account as guest
1544    
1545    =back
1546    
1547    Return true on success, otherwise false.
1548    
1549    =cut
1550    
1551    sub set_user {
1552            my $self = shift;
1553            my ($name, $mode) = @_;
1554    
1555            return unless ($self->{url});
1556            croak "mode must be number, not '$mode'" unless ($mode =~ m/^\d+$/);
1557    
1558            $self->shuttle_url( $self->{url} . '/_set_user',
1559                    'text/plain',
1560                    'name=' . uri_escape($name) . '&mode=' . $mode,
1561                    undef
1562            ) == 200;
1563    }
1564    
1565    
1566    =head2 set_link
1567    
1568    Manage node links
1569    
1570      $node->set_link('http://localhost:1978/node/another', 'another node label', $credit);
1571    
1572    If C<$credit> is negative, link is removed.
1573    
1574    =cut
1575    
1576    sub set_link {
1577            my $self = shift;
1578            my ($url, $label, $credit) = @_;
1579    
1580            return unless ($self->{url});
1581            croak "mode credit be number, not '$credit'" unless ($credit =~ m/^\d+$/);
1582    
1583            my $reqbody = 'url=' . uri_escape($url) . '&label=' . uri_escape($label);
1584            $reqbody .= '&credit=' . $credit if ($credit > 0);
1585    
1586            $self->shuttle_url( $self->{url} . '/_set_link',
1587                    'application/x-www-form-urlencoded',
1588                    $reqbody,
1589                    undef
1590            ) == 200;
1591    }
1592    
1593    
1594    =head1 PRIVATE METHODS
1595    
1596    You could call those directly, but you don't have to. I hope.
1597    
1598    =head2 _set_info
1599    
1600  Set information for node  Set information for node
1601    
1602    $node->set_info;    $node->_set_info;
1603    
1604  =cut  =cut
1605    
1606  sub set_info {  sub _set_info {
1607          my $self = shift;          my $self = shift;
1608    
1609          $self->{status} = -1;          $self->{status} = -1;
# Line 1448  sub set_info { Line 1618  sub set_info {
1618    
1619          return if ($rv != 200 || !$resbody);          return if ($rv != 200 || !$resbody);
1620    
1621          chomp($resbody);          # it seems that response can have multiple line endings
1622            $resbody =~ s/[\r\n]+$//;
1623    
1624          ( $self->{name}, $self->{label}, $self->{dnum}, $self->{wnum}, $self->{size} ) =          ( $self->{name}, $self->{label}, $self->{dnum}, $self->{wnum}, $self->{size} ) =
1625                  split(/\t/, $resbody, 5);                  split(/\t/, $resbody, 5);

Legend:
Removed from v.53  
changed lines
  Added in v.97

  ViewVC Help
Powered by ViewVC 1.1.26