4 |
use strict; |
use strict; |
5 |
use warnings; |
use warnings; |
6 |
|
|
7 |
our $VERSION = '0.01'; |
our $VERSION = '0.04_1'; |
8 |
|
|
9 |
=head1 NAME |
=head1 NAME |
10 |
|
|
12 |
|
|
13 |
=head1 SYNOPSIS |
=head1 SYNOPSIS |
14 |
|
|
15 |
use Search::Estraier; |
=head2 Simple indexer |
16 |
my $est = new Search::Estraier(); |
|
17 |
|
use Search::Estraier; |
18 |
|
|
19 |
|
# create and configure node |
20 |
|
my $node = new Search::Estraier::Node; |
21 |
|
$node->set_url("http://localhost:1978/node/test"); |
22 |
|
$node->set_auth("admin","admin"); |
23 |
|
|
24 |
|
# create document |
25 |
|
my $doc = new Search::Estraier::Document; |
26 |
|
|
27 |
|
# add attributes |
28 |
|
$doc->add_attr('@uri', "http://estraier.gov/example.txt"); |
29 |
|
$doc->add_attr('@title', "Over the Rainbow"); |
30 |
|
|
31 |
|
# add body text to document |
32 |
|
$doc->add_text("Somewhere over the rainbow. Way up high."); |
33 |
|
$doc->add_text("There's a land that I heard of once in a lullaby."); |
34 |
|
|
35 |
|
die "error: ", $node->status,"\n" unless ($node->put_doc($doc)); |
36 |
|
|
37 |
|
=head2 Simple searcher |
38 |
|
|
39 |
|
use Search::Estraier; |
40 |
|
|
41 |
|
# create and configure node |
42 |
|
my $node = new Search::Estraier::Node; |
43 |
|
$node->set_url("http://localhost:1978/node/test"); |
44 |
|
$node->set_auth("admin","admin"); |
45 |
|
|
46 |
|
# create condition |
47 |
|
my $cond = new Search::Estraier::Condition; |
48 |
|
|
49 |
|
# set search phrase |
50 |
|
$cond->set_phrase("rainbow AND lullaby"); |
51 |
|
|
52 |
|
my $nres = $node->search($cond, 0); |
53 |
|
print "Got ", $nres->hits, " results\n"; |
54 |
|
|
55 |
|
if (defined($nres)) { |
56 |
|
# for each document in results |
57 |
|
for my $i ( 0 ... $nres->doc_num - 1 ) { |
58 |
|
# get result document |
59 |
|
my $rdoc = $nres->get_doc($i); |
60 |
|
# display attribute |
61 |
|
print "URI: ", $rdoc->attr('@uri'),"\n"; |
62 |
|
print "Title: ", $rdoc->attr('@title'),"\n"; |
63 |
|
print $rdoc->snippet,"\n"; |
64 |
|
} |
65 |
|
} else { |
66 |
|
die "error: ", $node->status,"\n"; |
67 |
|
} |
68 |
|
|
69 |
=head1 DESCRIPTION |
=head1 DESCRIPTION |
70 |
|
|
76 |
It is implemented as multiple packages which closely resemble the Ruby |
It is implemented as multiple packages which closely resemble the Ruby |
77 |
implementation. It also includes methods to manage nodes. |
implementation. It also includes methods to manage nodes. |
78 |
|
|
79 |
|
There are a few examples in the C<scripts> directory of this distribution. |
80 |
|
|
81 |
=cut |
=cut |
82 |
|
|
83 |
=head1 Inheritable common methods |
=head1 Inheritable common methods |
94 |
=cut |
=cut |
95 |
|
|
96 |
sub _s { |
sub _s { |
97 |
my $text = $_[1] || return; |
my $text = $_[1]; |
98 |
|
return unless defined($text); |
99 |
$text =~ s/\s\s+/ /gs; |
$text =~ s/\s\s+/ /gs; |
100 |
$text =~ s/^\s+//; |
$text =~ s/^\s+//; |
101 |
$text =~ s/\s+$//; |
$text =~ s/\s+$//; |
160 |
} elsif ($line =~ m/^$/) { |
} elsif ($line =~ m/^$/) { |
161 |
$in_text = 1; |
$in_text = 1; |
162 |
next; |
next; |
163 |
} elsif ($line =~ m/^(.+)=(.+)$/) { |
} elsif ($line =~ m/^(.+)=(.*)$/) { |
164 |
$self->{attrs}->{ $1 } = $2; |
$self->{attrs}->{ $1 } = $2; |
165 |
next; |
next; |
166 |
} |
} |
167 |
|
|
168 |
warn "draft ignored: $line\n"; |
warn "draft ignored: '$line'\n"; |
169 |
} |
} |
170 |
} |
} |
171 |
|
|
323 |
my $draft; |
my $draft; |
324 |
|
|
325 |
foreach my $attr_name (sort keys %{ $self->{attrs} }) { |
foreach my $attr_name (sort keys %{ $self->{attrs} }) { |
326 |
$draft .= $attr_name . '=' . $self->{attrs}->{$attr_name} . "\n"; |
next unless defined(my $v = $self->{attrs}->{$attr_name}); |
327 |
|
$draft .= $attr_name . '=' . $v . "\n"; |
328 |
} |
} |
329 |
|
|
330 |
if ($self->{kwords}) { |
if ($self->{kwords}) { |
372 |
|
|
373 |
package Search::Estraier::Condition; |
package Search::Estraier::Condition; |
374 |
|
|
375 |
use Carp qw/confess croak/; |
use Carp qw/carp confess croak/; |
376 |
|
|
377 |
use Search::Estraier; |
use Search::Estraier; |
378 |
our @ISA = qw/Search::Estraier/; |
our @ISA = qw/Search::Estraier/; |
450 |
|
|
451 |
=head2 set_options |
=head2 set_options |
452 |
|
|
453 |
$cond->set_options( SURE => 1 ); |
$cond->set_options( 'SURE' ); |
454 |
|
|
455 |
|
$cond->set_options( qw/AGITO NOIDF SIMPLE/ ); |
456 |
|
|
457 |
|
Possible options are: |
458 |
|
|
459 |
|
=over 8 |
460 |
|
|
461 |
|
=item SURE |
462 |
|
|
463 |
|
check every N-gram |
464 |
|
|
465 |
|
=item USUAL |
466 |
|
|
467 |
|
check every second N-gram |
468 |
|
|
469 |
|
=item FAST |
470 |
|
|
471 |
|
check every third N-gram |
472 |
|
|
473 |
|
=item AGITO |
474 |
|
|
475 |
|
check every fourth N-gram |
476 |
|
|
477 |
|
=item NOIDF |
478 |
|
|
479 |
|
don't perform TF-IDF tuning |
480 |
|
|
481 |
|
=item SIMPLE |
482 |
|
|
483 |
|
use simplified query phrase |
484 |
|
|
485 |
|
=back |
486 |
|
|
487 |
|
Skipping N-grams will speed up search, but reduce accuracy. Every call to C<set_options> will reset previous |
488 |
|
options. |
489 |
|
|
490 |
|
This option changed in version C<0.04> of this module. It's backwards compatible. |
491 |
|
|
492 |
=cut |
=cut |
493 |
|
|
494 |
my $options = { |
my $options = { |
|
# check every N-gram key
|
495 |
SURE => 1 << 0, |
SURE => 1 << 0, |
|
# check N-gram keys skipping by one
|
496 |
USUAL => 1 << 1, |
USUAL => 1 << 1, |
|
# check N-gram keys skipping by two
|
497 |
FAST => 1 << 2, |
FAST => 1 << 2, |
|
# check N-gram keys skipping by three
|
498 |
AGITO => 1 << 3, |
AGITO => 1 << 3, |
|
# without TF-IDF tuning
|
499 |
NOIDF => 1 << 4, |
NOIDF => 1 << 4, |
|
# with the simplified phrase
|
500 |
SIMPLE => 1 << 10, |
SIMPLE => 1 << 10, |
501 |
}; |
}; |
502 |
|
|
503 |
sub set_options { |
sub set_options { |
504 |
my $self = shift; |
my $self = shift; |
505 |
my $option = shift; |
my $opt = 0; |
506 |
confess "unknown option" unless ($options->{$option}); |
foreach my $option (@_) { |
507 |
$self->{options} ||= $options->{$option}; |
my $mask; |
508 |
|
unless ($mask = $options->{$option}) { |
509 |
|
if ($option eq '1') { |
510 |
|
next; |
511 |
|
} else { |
512 |
|
croak "unknown option $option"; |
513 |
|
} |
514 |
|
} |
515 |
|
$opt += $mask; |
516 |
|
} |
517 |
|
$self->{options} = $opt; |
518 |
} |
} |
519 |
|
|
520 |
|
|
736 |
|
|
737 |
print $res->doc_num; |
print $res->doc_num; |
738 |
|
|
739 |
|
This will return real number of documents (limited by C<max>). |
740 |
|
If you want to get total number of hits, see C<hits>. |
741 |
|
|
742 |
=cut |
=cut |
743 |
|
|
744 |
sub doc_num { |
sub doc_num { |
770 |
|
|
771 |
Return specific hint from results. |
Return specific hint from results. |
772 |
|
|
773 |
print $rec->hint( 'VERSION' ); |
print $res->hint( 'VERSION' ); |
774 |
|
|
775 |
Possible hints are: C<VERSION>, C<NODE>, C<HIT>, C<HINT#n>, C<DOCNUM>, C<WORDNUM>, |
Possible hints are: C<VERSION>, C<NODE>, C<HIT>, C<HINT#n>, C<DOCNUM>, C<WORDNUM>, |
776 |
C<TIME>, C<LINK#n>, C<VIEW>. |
C<TIME>, C<LINK#n>, C<VIEW>. |
783 |
return $self->{hints}->{$key}; |
return $self->{hints}->{$key}; |
784 |
} |
} |
785 |
|
|
786 |
|
=head2 hints |
787 |
|
|
788 |
|
More perlish version of C<hint>. This one returns a reference to the hash of all hints. |
789 |
|
|
790 |
|
my $hints = $res->hints; |
791 |
|
|
792 |
|
=cut |
793 |
|
|
794 |
|
sub hints { |
795 |
|
my $self = shift; |
796 |
|
return $self->{hints}; |
797 |
|
} |
798 |
|
|
799 |
|
=head2 hits |
800 |
|
|
801 |
|
Syntactic sugar for the total number of hits for this query |
802 |
|
|
803 |
|
print $res->hits; |
804 |
|
|
805 |
|
It's same as |
806 |
|
|
807 |
|
print $res->hint('HIT'); |
808 |
|
|
809 |
|
but shorter. |
810 |
|
|
811 |
|
=cut |
812 |
|
|
813 |
|
sub hits { |
814 |
|
my $self = shift; |
815 |
|
return $self->{hints}->{'HIT'} || 0; |
816 |
|
} |
817 |
|
|
818 |
package Search::Estraier::Node; |
package Search::Estraier::Node; |
819 |
|
|
829 |
|
|
830 |
my $node = new Search::Estraier::Node; |
my $node = new Search::Estraier::Node; |
831 |
|
|
832 |
|
or optionally with C<url> as a parameter |
833 |
|
|
834 |
|
my $node = new Search::Estraier::Node( 'http://localhost:1978/node/test' ); |
835 |
|
|
836 |
|
or in more verbose form |
837 |
|
|
838 |
|
my $node = new Search::Estraier::Node( |
839 |
|
url => 'http://localhost:1978/node/test', |
840 |
|
debug => 1, |
841 |
|
croak_on_error => 1 |
842 |
|
); |
843 |
|
|
844 |
|
with following arguments: |
845 |
|
|
846 |
|
=over 4 |
847 |
|
|
848 |
|
=item url |
849 |
|
|
850 |
|
URL to node |
851 |
|
|
852 |
|
=item debug |
853 |
|
|
854 |
|
dumps a B<lot> of debugging output |
855 |
|
|
856 |
|
=item croak_on_error |
857 |
|
|
858 |
|
very helpful during development. It will croak on all errors instead of |
859 |
|
silently returning C<-1> (which is convention of Hyper Estraier API in other |
860 |
|
languages). |
861 |
|
|
862 |
|
=back |
863 |
|
|
864 |
=cut |
=cut |
865 |
|
|
866 |
sub new { |
sub new { |
878 |
}; |
}; |
879 |
bless($self, $class); |
bless($self, $class); |
880 |
|
|
881 |
my $args = {@_}; |
if ($#_ == 0) { |
882 |
|
$self->{url} = shift; |
883 |
|
} else { |
884 |
|
my $args = {@_}; |
885 |
|
|
886 |
|
%$self = ( %$self, @_ ); |
887 |
|
|
888 |
$self->{debug} = $args->{debug}; |
warn "## Node debug on\n" if ($self->{debug}); |
889 |
warn "## Node debug on\n" if ($self->{debug}); |
} |
890 |
|
|
891 |
$self ? return $self : return undef; |
$self ? return $self : return undef; |
892 |
} |
} |
1455 |
|
|
1456 |
if (my @attrs = $cond->attrs) { |
if (my @attrs = $cond->attrs) { |
1457 |
for my $i ( 0 .. $#attrs ) { |
for my $i ( 0 .. $#attrs ) { |
1458 |
push @args,'attr' . ($i+1) . '=' . uri_escape( $attrs[$i] ); |
push @args,'attr' . ($i+1) . '=' . uri_escape( $attrs[$i] ) if ($attrs[$i]); |
1459 |
} |
} |
1460 |
} |
} |
1461 |
|
|
1484 |
|
|
1485 |
=head2 shuttle_url |
=head2 shuttle_url |
1486 |
|
|
1487 |
This is a method which uses C<IO::Socket::INET> to communicate with Hyper Estraier node |
This is a method which uses C<LWP::UserAgent> to communicate with Hyper Estraier node |
1488 |
master. |
master. |
1489 |
|
|
1490 |
my $rv = shuttle_url( $url, $content_type, $req_body, \$resbody ); |
my $rv = shuttle_url( $url, $content_type, $req_body, \$resbody ); |
1526 |
|
|
1527 |
$req->headers->header( 'Host' => $url->host . ":" . $url->port ); |
$req->headers->header( 'Host' => $url->host . ":" . $url->port ); |
1528 |
$req->headers->header( 'Connection', 'close' ); |
$req->headers->header( 'Connection', 'close' ); |
1529 |
$req->headers->header( 'Authorization', 'Basic ' . $self->{auth} ); |
$req->headers->header( 'Authorization', 'Basic ' . $self->{auth} ) if ($self->{auth}); |
1530 |
$req->content_type( $content_type ); |
$req->content_type( $content_type ); |
1531 |
|
|
1532 |
warn $req->headers->as_string,"\n" if ($self->{debug}); |
warn $req->headers->as_string,"\n" if ($self->{debug}); |
1540 |
|
|
1541 |
warn "## response status: ",$res->status_line,"\n" if ($self->{debug}); |
warn "## response status: ",$res->status_line,"\n" if ($self->{debug}); |
1542 |
|
|
|
return -1 if (! $res->is_success); |
|
|
|
|
1543 |
($self->{status}, $self->{status_message}) = split(/\s+/, $res->status_line, 2); |
($self->{status}, $self->{status_message}) = split(/\s+/, $res->status_line, 2); |
1544 |
|
|
1545 |
|
if (! $res->is_success) { |
1546 |
|
if ($self->{croak_on_error}) { |
1547 |
|
croak("can't get $url: ",$res->status_line); |
1548 |
|
} else { |
1549 |
|
return -1; |
1550 |
|
} |
1551 |
|
} |
1552 |
|
|
1553 |
$$resbody .= $res->content; |
$$resbody .= $res->content; |
1554 |
|
|
1555 |
warn "## response body:\n$$resbody\n" if ($resbody && $self->{debug}); |
warn "## response body:\n$$resbody\n" if ($resbody && $self->{debug}); |
1649 |
$reqbody .= '&credit=' . $credit if ($credit > 0); |
$reqbody .= '&credit=' . $credit if ($credit > 0); |
1650 |
|
|
1651 |
$self->shuttle_url( $self->{url} . '/_set_link', |
$self->shuttle_url( $self->{url} . '/_set_link', |
1652 |
'text/plain', |
'application/x-www-form-urlencoded', |
1653 |
$reqbody, |
$reqbody, |
1654 |
undef |
undef |
1655 |
) == 200; |
) == 200; |