/[Search-Estraier]/trunk/lib/Search/Estraier.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/Search/Estraier.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 184 - (hide annotations)
Sat Nov 4 13:10:29 2006 UTC (17 years, 5 months ago) by dpavlin
Original Path: trunk/Estraier.pm
File size: 38084 byte(s)
set_distinct and set_score patch from Mikio Hirabayashi <mikio@users.sourceforge.net>
1 dpavlin 2 package Search::Estraier;
2    
3     use 5.008;
4     use strict;
5     use warnings;
6    
7 dpavlin 184 our $VERSION = '0.08_1';
8 dpavlin 2
9     =head1 NAME
10    
11     Search::Estraier - pure perl module to use Hyper Estraier search engine
12    
13     =head1 SYNOPSIS
14    
15 dpavlin 68 =head2 Simple indexer
16 dpavlin 2
17 dpavlin 68 use Search::Estraier;
18    
19     # create and configure node
20 dpavlin 102 my $node = new Search::Estraier::Node(
21     url => 'http://localhost:1978/node/test',
22     user => 'admin',
23 dpavlin 139 passwd => 'admin',
24     create => 1,
25     label => 'Label for node',
26     croak_on_error => 1,
27 dpavlin 102 );
28 dpavlin 68
29     # create document
30     my $doc = new Search::Estraier::Document;
31    
32     # add attributes
33     $doc->add_attr('@uri', "http://estraier.gov/example.txt");
34     $doc->add_attr('@title', "Over the Rainbow");
35    
36     # add body text to document
37     $doc->add_text("Somewhere over the rainbow. Way up high.");
38     $doc->add_text("There's a land that I heard of once in a lullaby.");
39    
40 dpavlin 102 die "error: ", $node->status,"\n" unless (eval { $node->put_doc($doc) });
41 dpavlin 68
42     =head2 Simple searcher
43    
44     use Search::Estraier;
45    
46     # create and configure node
47 dpavlin 102 my $node = new Search::Estraier::Node(
48     url => 'http://localhost:1978/node/test',
49     user => 'admin',
50     passwd => 'admin',
51     croak_on_error => 1,
52     );
53 dpavlin 68
54     # create condition
55     my $cond = new Search::Estraier::Condition;
56    
57     # set search phrase
58     $cond->set_phrase("rainbow AND lullaby");
59    
60     my $nres = $node->search($cond, 0);
61 dpavlin 100
62 dpavlin 68 if (defined($nres)) {
63 dpavlin 102 print "Got ", $nres->hits, " results\n";
64    
65 dpavlin 68 # for each document in results
66     for my $i ( 0 ... $nres->doc_num - 1 ) {
67     # get result document
68     my $rdoc = $nres->get_doc($i);
69     # display attribte
70     print "URI: ", $rdoc->attr('@uri'),"\n";
71     print "Title: ", $rdoc->attr('@title'),"\n";
72     print $rdoc->snippet,"\n";
73     }
74     } else {
75     die "error: ", $node->status,"\n";
76     }
77    
78 dpavlin 2 =head1 DESCRIPTION
79    
80     This module is implementation of node API of Hyper Estraier. Since it's
81     perl-only module with dependencies only on standard perl modules, it will
82     run on all platforms on which perl runs. It doesn't require compilation
83     or Hyper Estraier development files on target machine.
84    
85     It is implemented as multiple packages which closly resamble Ruby
86     implementation. It also includes methods to manage nodes.
87    
88 dpavlin 68 There are few examples in C<scripts> directory of this distribution.
89    
90 dpavlin 2 =cut
91    
92 dpavlin 42 =head1 Inheritable common methods
93    
94     This methods should really move somewhere else.
95    
96 dpavlin 15 =head2 _s
97    
98     Remove multiple whitespaces from string, as well as whitespaces at beginning or end
99    
100     my $text = $self->_s(" this is a text ");
101     $text = 'this is a text';
102    
103     =cut
104    
105     sub _s {
106 dpavlin 96 my $text = $_[1];
107     return unless defined($text);
108 dpavlin 15 $text =~ s/\s\s+/ /gs;
109     $text =~ s/^\s+//;
110     $text =~ s/\s+$//;
111     return $text;
112     }
113    
114 dpavlin 2 package Search::Estraier::Document;
115    
116 dpavlin 9 use Carp qw/croak confess/;
117 dpavlin 7
118 dpavlin 15 use Search::Estraier;
119     our @ISA = qw/Search::Estraier/;
120    
121 dpavlin 2 =head1 Search::Estraier::Document
122    
123 dpavlin 164 This class implements Document which is single item in Hyper Estraier.
124 dpavlin 14
125 dpavlin 164 It's is collection of:
126 dpavlin 42
127 dpavlin 164 =over 4
128    
129     =item attributes
130    
131     C<< 'key' => 'value' >> pairs which can later be used for filtering of results
132    
133     You can add common filters to C<attrindex> in estmaster's C<_conf>
134     file for better performance. See C<attrindex> in
135     L<Hyper Estraier P2P Guide|http://hyperestraier.sourceforge.net/nguide-en.html>.
136    
137     =item vectors
138    
139     also C<< 'key' => 'value' >> pairs
140    
141     =item display text
142    
143     Text which will be used to create searchable corpus of your index and
144     included in snippet output.
145    
146     =item hidden text
147    
148     Text which will be searchable, but will not be included in snippet.
149    
150     =back
151    
152 dpavlin 2 =head2 new
153    
154 dpavlin 14 Create new document, empty or from draft.
155    
156 dpavlin 2 my $doc = new Search::HyperEstraier::Document;
157 dpavlin 14 my $doc2 = new Search::HyperEstraier::Document( $draft );
158 dpavlin 2
159     =cut
160    
161     sub new {
162     my $class = shift;
163 dpavlin 14 my $self = {};
164 dpavlin 2 bless($self, $class);
165    
166 dpavlin 6 $self->{id} = -1;
167    
168 dpavlin 14 my $draft = shift;
169    
170     if ($draft) {
171     my $in_text = 0;
172     foreach my $line (split(/\n/, $draft)) {
173    
174     if ($in_text) {
175     if ($line =~ /^\t/) {
176     push @{ $self->{htexts} }, substr($line, 1);
177     } else {
178     push @{ $self->{dtexts} }, $line;
179     }
180     next;
181     }
182    
183     if ($line =~ m/^%VECTOR\t(.+)$/) {
184     my @fields = split(/\t/, $1);
185 dpavlin 166 if ($#fields % 2 == 1) {
186     $self->{kwords} = { @fields };
187     } else {
188     warn "can't decode $line\n";
189 dpavlin 14 }
190     next;
191 dpavlin 184 } elsif ($line =~ m/^%SCORE\t(.+)$/) {
192     $self->{score} = $1;
193     next;
194 dpavlin 14 } elsif ($line =~ m/^%/) {
195     # What is this? comment?
196     #warn "$line\n";
197     next;
198     } elsif ($line =~ m/^$/) {
199     $in_text = 1;
200     next;
201 dpavlin 81 } elsif ($line =~ m/^(.+)=(.*)$/) {
202 dpavlin 14 $self->{attrs}->{ $1 } = $2;
203     next;
204     }
205    
206 dpavlin 81 warn "draft ignored: '$line'\n";
207 dpavlin 14 }
208     }
209    
210 dpavlin 2 $self ? return $self : return undef;
211     }
212    
213 dpavlin 4
214 dpavlin 2 =head2 add_attr
215    
216 dpavlin 6 Add an attribute.
217    
218 dpavlin 2 $doc->add_attr( name => 'value' );
219    
220 dpavlin 9 Delete attribute using
221 dpavlin 5
222     $doc->add_attr( name => undef );
223    
224 dpavlin 2 =cut
225    
226     sub add_attr {
227     my $self = shift;
228     my $attrs = {@_};
229    
230     while (my ($name, $value) = each %{ $attrs }) {
231 dpavlin 9 if (! defined($value)) {
232 dpavlin 15 delete( $self->{attrs}->{ $self->_s($name) } );
233 dpavlin 9 } else {
234 dpavlin 15 $self->{attrs}->{ $self->_s($name) } = $self->_s($value);
235 dpavlin 9 }
236 dpavlin 2 }
237 dpavlin 8
238     return 1;
239 dpavlin 2 }
240    
241 dpavlin 5
242     =head2 add_text
243    
244 dpavlin 6 Add a sentence of text.
245    
246 dpavlin 5 $doc->add_text('this is example text to display');
247    
248     =cut
249    
250     sub add_text {
251     my $self = shift;
252     my $text = shift;
253     return unless defined($text);
254    
255 dpavlin 15 push @{ $self->{dtexts} }, $self->_s($text);
256 dpavlin 5 }
257    
258    
259     =head2 add_hidden_text
260    
261 dpavlin 6 Add a hidden sentence.
262    
263 dpavlin 5 $doc->add_hidden_text('this is example text just for search');
264    
265     =cut
266    
267     sub add_hidden_text {
268     my $self = shift;
269     my $text = shift;
270     return unless defined($text);
271    
272 dpavlin 15 push @{ $self->{htexts} }, $self->_s($text);
273 dpavlin 5 }
274    
275 dpavlin 166 =head2 add_vectors
276 dpavlin 42
277 dpavlin 166 Add a vectors
278    
279     $doc->add_vector(
280     'vector_name' => 42,
281     'another' => 12345,
282     );
283    
284     =cut
285    
286     sub add_vectors {
287     my $self = shift;
288     return unless (@_);
289    
290     # this is ugly, but works
291     die "add_vector needs HASH as argument" unless ($#_ % 2 == 1);
292    
293     $self->{kwords} = {@_};
294     }
295    
296 dpavlin 184 =head2 set_score
297 dpavlin 166
298 dpavlin 184 Set the substitute score
299    
300     $doc->set_score(12345);
301    
302     =cut
303    
304     sub set_score {
305     my $self = shift;
306     my $score = shift;
307     return unless (defined($score));
308     $self->{score} = $score;
309     }
310    
311     =head2 score
312    
313     Get the substitute score
314    
315     =cut
316    
317     sub score {
318     my $self = shift;
319     return -1 unless (defined($self->{score}));
320     return $self->{score};
321     }
322    
323 dpavlin 6 =head2 id
324    
325     Get the ID number of document. If the object has never been registred, C<-1> is returned.
326    
327     print $doc->id;
328    
329     =cut
330    
331     sub id {
332     my $self = shift;
333     return $self->{id};
334     }
335    
336 dpavlin 42
337 dpavlin 7 =head2 attr_names
338    
339 dpavlin 9 Returns array with attribute names from document object.
340 dpavlin 7
341     my @attrs = $doc->attr_names;
342    
343     =cut
344    
345     sub attr_names {
346     my $self = shift;
347 dpavlin 60 return unless ($self->{attrs});
348     #croak "attr_names return array, not scalar" if (! wantarray);
349 dpavlin 7 return sort keys %{ $self->{attrs} };
350     }
351    
352 dpavlin 8
353     =head2 attr
354    
355 dpavlin 9 Returns value of an attribute.
356 dpavlin 8
357     my $value = $doc->attr( 'attribute' );
358    
359     =cut
360    
361     sub attr {
362     my $self = shift;
363     my $name = shift;
364 dpavlin 60 return unless (defined($name) && $self->{attrs});
365     return $self->{attrs}->{ $name };
366 dpavlin 8 }
367    
368 dpavlin 9
369     =head2 texts
370    
371     Returns array with text sentences.
372    
373     my @texts = $doc->texts;
374    
375     =cut
376    
377     sub texts {
378     my $self = shift;
379 dpavlin 60 #confess "texts return array, not scalar" if (! wantarray);
380     return @{ $self->{dtexts} } if ($self->{dtexts});
381 dpavlin 9 }
382    
383 dpavlin 42
384 dpavlin 12 =head2 cat_texts
385    
386     Return whole text as single scalar.
387    
388     my $text = $doc->cat_texts;
389    
390     =cut
391    
392     sub cat_texts {
393     my $self = shift;
394 dpavlin 60 return join(' ',@{ $self->{dtexts} }) if ($self->{dtexts});
395 dpavlin 12 }
396    
397 dpavlin 42
398 dpavlin 5 =head2 dump_draft
399    
400 dpavlin 13 Dump draft data from document object.
401    
402 dpavlin 5 print $doc->dump_draft;
403    
404     =cut
405    
406     sub dump_draft {
407 dpavlin 13 my $self = shift;
408     my $draft;
409    
410     foreach my $attr_name (sort keys %{ $self->{attrs} }) {
411 dpavlin 97 next unless defined(my $v = $self->{attrs}->{$attr_name});
412 dpavlin 93 $draft .= $attr_name . '=' . $v . "\n";
413 dpavlin 13 }
414    
415     if ($self->{kwords}) {
416 dpavlin 166 $draft .= '%VECTOR';
417 dpavlin 13 while (my ($key, $value) = each %{ $self->{kwords} }) {
418     $draft .= "\t$key\t$value";
419     }
420     $draft .= "\n";
421     }
422    
423 dpavlin 184 if (defined($self->{score}) && $self->{score} >= 0) {
424     $draft .= "%SCORE\t" . $self->{score} . "\n";
425     }
426    
427 dpavlin 13 $draft .= "\n";
428    
429 dpavlin 40 $draft .= join("\n", @{ $self->{dtexts} }) . "\n" if ($self->{dtexts});
430     $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n" if ($self->{htexts});
431 dpavlin 13
432 dpavlin 184 printf("[%s]\n", $draft);
433    
434 dpavlin 13 return $draft;
435 dpavlin 5 }
436    
437 dpavlin 42
438 dpavlin 4 =head2 delete
439 dpavlin 2
440 dpavlin 4 Empty document object
441 dpavlin 2
442 dpavlin 4 $doc->delete;
443    
444 dpavlin 15 This function is addition to original Ruby API, and since it was included in C wrappers it's here as a
445     convinience. Document objects which go out of scope will be destroyed
446     automatically.
447    
448 dpavlin 4 =cut
449    
450     sub delete {
451     my $self = shift;
452    
453 dpavlin 14 foreach my $data (qw/attrs dtexts stexts kwords/) {
454 dpavlin 5 delete($self->{$data});
455     }
456 dpavlin 4
457 dpavlin 10 $self->{id} = -1;
458    
459 dpavlin 4 return 1;
460     }
461    
462    
463    
464 dpavlin 15 package Search::Estraier::Condition;
465 dpavlin 4
466 dpavlin 98 use Carp qw/carp confess croak/;
467 dpavlin 16
468 dpavlin 15 use Search::Estraier;
469     our @ISA = qw/Search::Estraier/;
470 dpavlin 4
471 dpavlin 16 =head1 Search::Estraier::Condition
472    
473     =head2 new
474    
475     my $cond = new Search::HyperEstraier::Condition;
476    
477     =cut
478    
479     sub new {
480     my $class = shift;
481     my $self = {};
482     bless($self, $class);
483    
484 dpavlin 19 $self->{max} = -1;
485     $self->{options} = 0;
486    
487 dpavlin 16 $self ? return $self : return undef;
488     }
489    
490 dpavlin 42
491 dpavlin 16 =head2 set_phrase
492    
493     $cond->set_phrase('search phrase');
494    
495     =cut
496    
497     sub set_phrase {
498     my $self = shift;
499     $self->{phrase} = $self->_s( shift );
500     }
501    
502 dpavlin 42
503 dpavlin 16 =head2 add_attr
504    
505     $cond->add_attr('@URI STRINC /~dpavlin/');
506    
507     =cut
508    
509     sub add_attr {
510     my $self = shift;
511     my $attr = shift || return;
512     push @{ $self->{attrs} }, $self->_s( $attr );
513     }
514    
515 dpavlin 42
516 dpavlin 16 =head2 set_order
517    
518     $cond->set_order('@mdate NUMD');
519    
520     =cut
521    
522     sub set_order {
523     my $self = shift;
524     $self->{order} = shift;
525     }
526    
527 dpavlin 42
528 dpavlin 16 =head2 set_max
529    
530     $cond->set_max(42);
531    
532     =cut
533    
534     sub set_max {
535     my $self = shift;
536     my $max = shift;
537 dpavlin 43 croak "set_max needs number, not '$max'" unless ($max =~ m/^\d+$/);
538 dpavlin 16 $self->{max} = $max;
539     }
540    
541 dpavlin 42
542 dpavlin 16 =head2 set_options
543    
544 dpavlin 98 $cond->set_options( 'SURE' );
545 dpavlin 16
546 dpavlin 98 $cond->set_options( qw/AGITO NOIDF SIMPLE/ );
547    
548     Possible options are:
549    
550     =over 8
551    
552     =item SURE
553    
554     check every N-gram
555    
556     =item USUAL
557    
558     check every second N-gram
559    
560     =item FAST
561    
562     check every third N-gram
563    
564     =item AGITO
565    
566     check every fourth N-gram
567    
568     =item NOIDF
569    
570     don't perform TF-IDF tuning
571    
572     =item SIMPLE
573    
574     use simplified query phrase
575    
576     =back
577    
578     Skipping N-grams will speed up search, but reduce accuracy. Every call to C<set_options> will reset previous
579     options;
580    
581     This option changed in version C<0.04> of this module. It's backwards compatibile.
582    
583 dpavlin 16 =cut
584    
585 dpavlin 15 my $options = {
586     SURE => 1 << 0,
587     USUAL => 1 << 1,
588     FAST => 1 << 2,
589     AGITO => 1 << 3,
590     NOIDF => 1 << 4,
591     SIMPLE => 1 << 10,
592     };
593    
594 dpavlin 16 sub set_options {
595     my $self = shift;
596 dpavlin 98 my $opt = 0;
597     foreach my $option (@_) {
598     my $mask;
599     unless ($mask = $options->{$option}) {
600     if ($option eq '1') {
601     next;
602     } else {
603     croak "unknown option $option";
604     }
605     }
606     $opt += $mask;
607     }
608     $self->{options} = $opt;
609 dpavlin 4 }
610    
611 dpavlin 42
612 dpavlin 18 =head2 phrase
613    
614     Return search phrase.
615    
616     print $cond->phrase;
617    
618     =cut
619    
620     sub phrase {
621     my $self = shift;
622     return $self->{phrase};
623     }
624    
625 dpavlin 42
626 dpavlin 19 =head2 order
627 dpavlin 18
628 dpavlin 19 Return search result order.
629    
630     print $cond->order;
631    
632     =cut
633    
634     sub order {
635     my $self = shift;
636     return $self->{order};
637     }
638    
639 dpavlin 42
640 dpavlin 19 =head2 attrs
641    
642     Return search result attrs.
643    
644     my @cond_attrs = $cond->attrs;
645    
646     =cut
647    
648     sub attrs {
649     my $self = shift;
650     #croak "attrs return array, not scalar" if (! wantarray);
651 dpavlin 60 return @{ $self->{attrs} } if ($self->{attrs});
652 dpavlin 19 }
653    
654 dpavlin 42
655 dpavlin 19 =head2 max
656    
657     Return maximum number of results.
658    
659     print $cond->max;
660    
661     C<-1> is returned for unitialized value, C<0> is unlimited.
662    
663     =cut
664    
665     sub max {
666     my $self = shift;
667     return $self->{max};
668     }
669    
670 dpavlin 42
671 dpavlin 19 =head2 options
672    
673     Return options for this condition.
674    
675     print $cond->options;
676    
677     Options are returned in numerical form.
678    
679     =cut
680    
681     sub options {
682     my $self = shift;
683     return $self->{options};
684     }
685    
686    
687 dpavlin 116 =head2 set_skip
688    
689     Set number of skipped documents from beginning of results
690    
691     $cond->set_skip(42);
692    
693     Similar to C<offset> in RDBMS.
694    
695     =cut
696    
697     sub set_skip {
698     my $self = shift;
699     $self->{skip} = shift;
700     }
701    
702     =head2 skip
703    
704     Return skip for this condition.
705    
706     print $cond->skip;
707    
708     =cut
709    
710     sub skip {
711     my $self = shift;
712     return $self->{skip};
713     }
714    
715 dpavlin 184
716     =head2 set_distinct
717    
718     $cond->set_distinct('@author');
719    
720     =cut
721    
722     sub set_distinct {
723     my $self = shift;
724     $self->{distinct} = shift;
725     }
726    
727     =head2 distinct
728    
729     Return distinct attribute
730    
731     print $cond->distinct;
732    
733     =cut
734    
735     sub distinct {
736     my $self = shift;
737     return $self->{distinct};
738     }
739    
740 dpavlin 173 =head2 set_mask
741 dpavlin 116
742 dpavlin 174 Filter out some links when searching.
743 dpavlin 173
744 dpavlin 174 Argument array of link numbers, starting with 0 (current node).
745 dpavlin 173
746     $cond->set_mask(qw/0 1 4/);
747    
748     =cut
749    
750     sub set_mask {
751     my $self = shift;
752     return unless (@_);
753     $self->{mask} = \@_;
754     }
755    
756    
757 dpavlin 20 package Search::Estraier::ResultDocument;
758    
759 dpavlin 24 use Carp qw/croak/;
760 dpavlin 20
761 dpavlin 24 #use Search::Estraier;
762     #our @ISA = qw/Search::Estraier/;
763 dpavlin 20
764     =head1 Search::Estraier::ResultDocument
765    
766     =head2 new
767    
768 dpavlin 23 my $rdoc = new Search::HyperEstraier::ResultDocument(
769 dpavlin 20 uri => 'http://localhost/document/uri/42',
770     attrs => {
771     foo => 1,
772     bar => 2,
773     },
774     snippet => 'this is a text of snippet'
775     keywords => 'this\tare\tkeywords'
776     );
777    
778     =cut
779    
780     sub new {
781     my $class = shift;
782     my $self = {@_};
783     bless($self, $class);
784    
785 dpavlin 62 croak "missing uri for ResultDocument" unless defined($self->{uri});
786 dpavlin 20
787     $self ? return $self : return undef;
788     }
789    
790 dpavlin 42
791 dpavlin 23 =head2 uri
792 dpavlin 20
793 dpavlin 23 Return URI of result document
794 dpavlin 20
795 dpavlin 23 print $rdoc->uri;
796    
797     =cut
798    
799     sub uri {
800     my $self = shift;
801     return $self->{uri};
802     }
803    
804    
805     =head2 attr_names
806    
807     Returns array with attribute names from result document object.
808    
809     my @attrs = $rdoc->attr_names;
810    
811     =cut
812    
813     sub attr_names {
814     my $self = shift;
815     croak "attr_names return array, not scalar" if (! wantarray);
816     return sort keys %{ $self->{attrs} };
817     }
818    
819 dpavlin 42
820 dpavlin 23 =head2 attr
821    
822     Returns value of an attribute.
823    
824     my $value = $rdoc->attr( 'attribute' );
825    
826     =cut
827    
828     sub attr {
829     my $self = shift;
830     my $name = shift || return;
831     return $self->{attrs}->{ $name };
832     }
833    
834 dpavlin 42
835 dpavlin 23 =head2 snippet
836    
837     Return snippet from result document
838    
839     print $rdoc->snippet;
840    
841     =cut
842    
843     sub snippet {
844     my $self = shift;
845     return $self->{snippet};
846     }
847    
848 dpavlin 42
849 dpavlin 23 =head2 keywords
850    
851     Return keywords from result document
852    
853     print $rdoc->keywords;
854    
855     =cut
856    
857     sub keywords {
858     my $self = shift;
859     return $self->{keywords};
860     }
861    
862    
863 dpavlin 25 package Search::Estraier::NodeResult;
864    
865     use Carp qw/croak/;
866    
867     #use Search::Estraier;
868     #our @ISA = qw/Search::Estraier/;
869    
870     =head1 Search::Estraier::NodeResult
871    
872     =head2 new
873    
874     my $res = new Search::HyperEstraier::NodeResult(
875     docs => @array_of_rdocs,
876     hits => %hash_with_hints,
877     );
878    
879     =cut
880    
881     sub new {
882     my $class = shift;
883     my $self = {@_};
884     bless($self, $class);
885    
886     foreach my $f (qw/docs hints/) {
887     croak "missing $f for ResultDocument" unless defined($self->{$f});
888     }
889    
890     $self ? return $self : return undef;
891     }
892    
893 dpavlin 42
894 dpavlin 25 =head2 doc_num
895    
896     Return number of documents
897    
898     print $res->doc_num;
899    
900 dpavlin 100 This will return real number of documents (limited by C<max>).
901     If you want to get total number of hits, see C<hits>.
902    
903 dpavlin 25 =cut
904    
905     sub doc_num {
906     my $self = shift;
907 dpavlin 53 return $#{$self->{docs}} + 1;
908 dpavlin 25 }
909    
910 dpavlin 42
911 dpavlin 25 =head2 get_doc
912    
913     Return single document
914    
915     my $doc = $res->get_doc( 42 );
916    
917     Returns undef if document doesn't exist.
918    
919     =cut
920    
921     sub get_doc {
922     my $self = shift;
923     my $num = shift;
924 dpavlin 43 croak "expect number as argument, not '$num'" unless ($num =~ m/^\d+$/);
925 dpavlin 25 return undef if ($num < 0 || $num > $self->{docs});
926     return $self->{docs}->[$num];
927     }
928    
929 dpavlin 42
930 dpavlin 25 =head2 hint
931    
932     Return specific hint from results.
933    
934 dpavlin 100 print $res->hint( 'VERSION' );
935 dpavlin 25
936     Possible hints are: C<VERSION>, C<NODE>, C<HIT>, C<HINT#n>, C<DOCNUM>, C<WORDNUM>,
937     C<TIME>, C<LINK#n>, C<VIEW>.
938    
939     =cut
940    
941     sub hint {
942     my $self = shift;
943     my $key = shift || return;
944     return $self->{hints}->{$key};
945     }
946    
947 dpavlin 108 =head2 hints
948 dpavlin 25
949 dpavlin 91 More perlish version of C<hint>. This one returns hash.
950    
951 dpavlin 100 my %hints = $res->hints;
952 dpavlin 91
953     =cut
954    
955     sub hints {
956     my $self = shift;
957     return $self->{hints};
958     }
959    
960 dpavlin 100 =head2 hits
961    
962     Syntaxtic sugar for total number of hits for this query
963    
964     print $res->hits;
965    
966     It's same as
967    
968     print $res->hint('HIT');
969    
970     but shorter.
971    
972     =cut
973    
974     sub hits {
975     my $self = shift;
976     return $self->{hints}->{'HIT'} || 0;
977     }
978    
979 dpavlin 27 package Search::Estraier::Node;
980    
981 dpavlin 44 use Carp qw/carp croak confess/;
982 dpavlin 33 use URI;
983 dpavlin 36 use MIME::Base64;
984 dpavlin 33 use IO::Socket::INET;
985 dpavlin 49 use URI::Escape qw/uri_escape/;
986 dpavlin 29
987 dpavlin 27 =head1 Search::Estraier::Node
988    
989     =head2 new
990    
991     my $node = new Search::HyperEstraier::Node;
992    
993 dpavlin 65 or optionally with C<url> as parametar
994    
995     my $node = new Search::HyperEstraier::Node( 'http://localhost:1978/node/test' );
996    
997 dpavlin 78 or in more verbose form
998    
999     my $node = new Search::HyperEstraier::Node(
1000     url => 'http://localhost:1978/node/test',
1001 dpavlin 132 user => 'admin',
1002     passwd => 'admin'
1003 dpavlin 139 create => 1,
1004     label => 'optional node label',
1005 dpavlin 78 debug => 1,
1006     croak_on_error => 1
1007     );
1008    
1009     with following arguments:
1010    
1011     =over 4
1012    
1013     =item url
1014    
1015     URL to node
1016    
1017 dpavlin 132 =item user
1018    
1019     specify username for node server authentication
1020    
1021     =item passwd
1022    
1023     password for authentication
1024    
1025 dpavlin 139 =item create
1026    
1027     create node if it doesn't exists
1028    
1029     =item label
1030    
1031     optional label for new node if C<create> is used
1032    
1033 dpavlin 78 =item debug
1034    
1035     dumps a B<lot> of debugging output
1036    
1037     =item croak_on_error
1038    
1039     very helpful during development. It will croak on all errors instead of
1040     silently returning C<-1> (which is convention of Hyper Estraier API in other
1041     languages).
1042    
1043     =back
1044    
1045 dpavlin 27 =cut
1046    
1047     sub new {
1048     my $class = shift;
1049     my $self = {
1050     pxport => -1,
1051 dpavlin 33 timeout => 0, # this used to be -1
1052 dpavlin 27 wwidth => 480,
1053     hwidth => 96,
1054     awidth => 96,
1055     status => -1,
1056     };
1057 dpavlin 111
1058 dpavlin 27 bless($self, $class);
1059    
1060 dpavlin 65 if ($#_ == 0) {
1061     $self->{url} = shift;
1062     } else {
1063 dpavlin 78 %$self = ( %$self, @_ );
1064    
1065 dpavlin 132 $self->set_auth( $self->{user}, $self->{passwd} ) if ($self->{user});
1066    
1067 dpavlin 65 warn "## Node debug on\n" if ($self->{debug});
1068     }
1069 dpavlin 57
1070 dpavlin 111 $self->{inform} = {
1071     dnum => -1,
1072     wnum => -1,
1073     size => -1.0,
1074     };
1075    
1076 dpavlin 139 if ($self->{create}) {
1077 dpavlin 140 if (! eval { $self->name } || $@) {
1078 dpavlin 139 my $name = $1 if ($self->{url} =~ m#/node/([^/]+)/*#);
1079     croak "can't find node name in '$self->{url}'" unless ($name);
1080     my $label = $self->{label} || $name;
1081     $self->master(
1082     action => 'nodeadd',
1083     name => $name,
1084     label => $label,
1085     ) || croak "can't create node $name ($label)";
1086     }
1087     }
1088    
1089 dpavlin 27 $self ? return $self : return undef;
1090     }
1091    
1092 dpavlin 42
1093 dpavlin 29 =head2 set_url
1094    
1095     Specify URL to node server
1096    
1097     $node->set_url('http://localhost:1978');
1098    
1099     =cut
1100    
1101     sub set_url {
1102     my $self = shift;
1103     $self->{url} = shift;
1104     }
1105    
1106 dpavlin 42
1107 dpavlin 29 =head2 set_proxy
1108    
1109     Specify proxy server to connect to node server
1110    
1111     $node->set_proxy('proxy.example.com', 8080);
1112    
1113     =cut
1114    
1115     sub set_proxy {
1116     my $self = shift;
1117     my ($host,$port) = @_;
1118 dpavlin 43 croak "proxy port must be number, not '$port'" unless ($port =~ m/^\d+$/);
1119 dpavlin 29 $self->{pxhost} = $host;
1120     $self->{pxport} = $port;
1121     }
1122    
1123 dpavlin 42
1124 dpavlin 30 =head2 set_timeout
1125    
1126     Specify timeout of connection in seconds
1127    
1128     $node->set_timeout( 15 );
1129    
1130     =cut
1131    
1132     sub set_timeout {
1133     my $self = shift;
1134     my $sec = shift;
1135 dpavlin 43 croak "timeout must be number, not '$sec'" unless ($sec =~ m/^\d+$/);
1136 dpavlin 30 $self->{timeout} = $sec;
1137     }
1138    
1139 dpavlin 42
1140 dpavlin 31 =head2 set_auth
1141    
1142     Specify name and password for authentication to node server.
1143    
1144     $node->set_auth('clint','eastwood');
1145    
1146     =cut
1147    
1148     sub set_auth {
1149     my $self = shift;
1150     my ($login,$passwd) = @_;
1151 dpavlin 40 my $basic_auth = encode_base64( "$login:$passwd" );
1152     chomp($basic_auth);
1153     $self->{auth} = $basic_auth;
1154 dpavlin 31 }
1155    
1156 dpavlin 42
1157 dpavlin 32 =head2 status
1158    
1159     Return status code of last request.
1160    
1161 dpavlin 40 print $node->status;
1162 dpavlin 32
1163     C<-1> means connection failure.
1164    
1165     =cut
1166    
1167     sub status {
1168     my $self = shift;
1169     return $self->{status};
1170     }
1171    
1172 dpavlin 42
1173 dpavlin 40 =head2 put_doc
1174    
1175 dpavlin 41 Add a document
1176 dpavlin 40
1177 dpavlin 41 $node->put_doc( $document_draft ) or die "can't add document";
1178    
1179 dpavlin 151 Return true on success or false on failure.
1180 dpavlin 41
1181 dpavlin 40 =cut
1182    
1183     sub put_doc {
1184     my $self = shift;
1185     my $doc = shift || return;
1186 dpavlin 47 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
1187 dpavlin 151 if ($self->shuttle_url( $self->{url} . '/put_doc',
1188 dpavlin 41 'text/x-estraier-draft',
1189     $doc->dump_draft,
1190     undef
1191 dpavlin 151 ) == 200) {
1192     $self->_clear_info;
1193     return 1;
1194     }
1195     return undef;
1196 dpavlin 40 }
1197    
1198 dpavlin 41
1199     =head2 out_doc
1200    
1201     Remove a document
1202    
1203     $node->out_doc( document_id ) or "can't remove document";
1204    
1205     Return true on success or false on failture.
1206    
1207     =cut
1208    
1209     sub out_doc {
1210     my $self = shift;
1211     my $id = shift || return;
1212     return unless ($self->{url});
1213 dpavlin 43 croak "id must be number, not '$id'" unless ($id =~ m/^\d+$/);
1214 dpavlin 150 if ($self->shuttle_url( $self->{url} . '/out_doc',
1215 dpavlin 41 'application/x-www-form-urlencoded',
1216     "id=$id",
1217     undef
1218 dpavlin 150 ) == 200) {
1219 dpavlin 151 $self->_clear_info;
1220     return 1;
1221 dpavlin 150 }
1222     return undef;
1223 dpavlin 41 }
1224    
1225    
1226     =head2 out_doc_by_uri
1227    
1228     Remove a registrated document using it's uri
1229    
1230 dpavlin 45 $node->out_doc_by_uri( 'file:///document/uri/42' ) or "can't remove document";
1231 dpavlin 41
1232     Return true on success or false on failture.
1233    
1234     =cut
1235    
1236     sub out_doc_by_uri {
1237     my $self = shift;
1238     my $uri = shift || return;
1239     return unless ($self->{url});
1240 dpavlin 150 if ($self->shuttle_url( $self->{url} . '/out_doc',
1241 dpavlin 41 'application/x-www-form-urlencoded',
1242 dpavlin 50 "uri=" . uri_escape($uri),
1243 dpavlin 41 undef
1244 dpavlin 150 ) == 200) {
1245 dpavlin 151 $self->_clear_info;
1246     return 1;
1247 dpavlin 150 }
1248     return undef;
1249 dpavlin 41 }
1250    
1251 dpavlin 42
1252     =head2 edit_doc
1253    
1254     Edit attributes of a document
1255    
1256     $node->edit_doc( $document_draft ) or die "can't edit document";
1257    
1258     Return true on success or false on failture.
1259    
1260     =cut
1261    
1262     sub edit_doc {
1263     my $self = shift;
1264     my $doc = shift || return;
1265 dpavlin 47 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
1266 dpavlin 151 if ($self->shuttle_url( $self->{url} . '/edit_doc',
1267 dpavlin 42 'text/x-estraier-draft',
1268     $doc->dump_draft,
1269     undef
1270 dpavlin 151 ) == 200) {
1271     $self->_clear_info;
1272     return 1;
1273     }
1274     return undef;
1275 dpavlin 42 }
1276    
1277    
1278 dpavlin 43 =head2 get_doc
1279    
1280     Retreive document
1281    
1282     my $doc = $node->get_doc( document_id ) or die "can't get document";
1283    
1284     Return true on success or false on failture.
1285    
1286     =cut
1287    
1288     sub get_doc {
1289     my $self = shift;
1290     my $id = shift || return;
1291     return $self->_fetch_doc( id => $id );
1292     }
1293    
1294 dpavlin 44
1295 dpavlin 43 =head2 get_doc_by_uri
1296    
1297     Retreive document
1298    
1299 dpavlin 45 my $doc = $node->get_doc_by_uri( 'file:///document/uri/42' ) or die "can't get document";
1300 dpavlin 43
1301     Return true on success or false on failture.
1302    
1303     =cut
1304    
1305     sub get_doc_by_uri {
1306     my $self = shift;
1307     my $uri = shift || return;
1308     return $self->_fetch_doc( uri => $uri );
1309     }
1310    
1311 dpavlin 44
1312 dpavlin 49 =head2 get_doc_attr
1313    
1314     Retrieve the value of an atribute from object
1315    
1316     my $val = $node->get_doc_attr( document_id, 'attribute_name' ) or
1317     die "can't get document attribute";
1318    
1319     =cut
1320    
1321     sub get_doc_attr {
1322     my $self = shift;
1323     my ($id,$name) = @_;
1324     return unless ($id && $name);
1325     return $self->_fetch_doc( id => $id, attr => $name );
1326     }
1327    
1328    
1329     =head2 get_doc_attr_by_uri
1330    
1331     Retrieve the value of an atribute from object
1332    
1333     my $val = $node->get_doc_attr_by_uri( document_id, 'attribute_name' ) or
1334     die "can't get document attribute";
1335    
1336     =cut
1337    
1338     sub get_doc_attr_by_uri {
1339     my $self = shift;
1340     my ($uri,$name) = @_;
1341     return unless ($uri && $name);
1342     return $self->_fetch_doc( uri => $uri, attr => $name );
1343     }
1344    
1345    
1346 dpavlin 44 =head2 etch_doc
1347    
1348     Exctract document keywords
1349    
1350     my $keywords = $node->etch_doc( document_id ) or die "can't etch document";
1351    
1352     =cut
1353    
1354 dpavlin 49 sub etch_doc {
1355 dpavlin 44 my $self = shift;
1356     my $id = shift || return;
1357     return $self->_fetch_doc( id => $id, etch => 1 );
1358     }
1359    
1360     =head2 etch_doc_by_uri
1361    
1362     Retreive document
1363    
1364 dpavlin 45 my $keywords = $node->etch_doc_by_uri( 'file:///document/uri/42' ) or die "can't etch document";
1365 dpavlin 44
1366     Return true on success or false on failture.
1367    
1368     =cut
1369    
1370     sub etch_doc_by_uri {
1371     my $self = shift;
1372     my $uri = shift || return;
1373     return $self->_fetch_doc( uri => $uri, etch => 1 );
1374     }
1375    
1376    
1377 dpavlin 45 =head2 uri_to_id
1378    
1379     Get ID of document specified by URI
1380    
1381     my $id = $node->uri_to_id( 'file:///document/uri/42' );
1382    
1383 dpavlin 103 This method won't croak, even if using C<croak_on_error>.
1384    
1385 dpavlin 45 =cut
1386    
1387     sub uri_to_id {
1388     my $self = shift;
1389     my $uri = shift || return;
1390 dpavlin 103 return $self->_fetch_doc( uri => $uri, path => '/uri_to_id', chomp_resbody => 1, croak_on_error => 0 );
1391 dpavlin 45 }
1392    
1393    
1394 dpavlin 43 =head2 _fetch_doc
1395    
1396 dpavlin 44 Private function used for implementing of C<get_doc>, C<get_doc_by_uri>,
1397     C<etch_doc>, C<etch_doc_by_uri>.
1398 dpavlin 43
1399 dpavlin 45 # this will decode received draft into Search::Estraier::Document object
1400     my $doc = $node->_fetch_doc( id => 42 );
1401     my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42' );
1402 dpavlin 43
1403 dpavlin 45 # to extract keywords, add etch
1404     my $doc = $node->_fetch_doc( id => 42, etch => 1 );
1405     my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', etch => 1 );
1406    
1407 dpavlin 49 # to get document attrubute add attr
1408     my $doc = $node->_fetch_doc( id => 42, attr => '@mdate' );
1409     my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', attr => '@mdate' );
1410    
1411 dpavlin 45 # more general form which allows implementation of
1412     # uri_to_id
1413     my $id = $node->_fetch_doc(
1414     uri => 'file:///document/uri/42',
1415     path => '/uri_to_id',
1416     chomp_resbody => 1
1417     );
1418    
1419 dpavlin 43 =cut
1420    
1421     sub _fetch_doc {
1422     my $self = shift;
1423 dpavlin 44 my $a = {@_};
1424     return unless ( ($a->{id} || $a->{uri}) && $self->{url} );
1425    
1426     my ($arg, $resbody);
1427    
1428 dpavlin 45 my $path = $a->{path} || '/get_doc';
1429 dpavlin 44 $path = '/etch_doc' if ($a->{etch});
1430    
1431     if ($a->{id}) {
1432 dpavlin 168 croak "id must be number not '$a->{id}'" unless ($a->{id} =~ m/^\d+$/);
1433 dpavlin 44 $arg = 'id=' . $a->{id};
1434     } elsif ($a->{uri}) {
1435 dpavlin 50 $arg = 'uri=' . uri_escape($a->{uri});
1436 dpavlin 44 } else {
1437     confess "unhandled argument. Need id or uri.";
1438 dpavlin 43 }
1439 dpavlin 44
1440 dpavlin 49 if ($a->{attr}) {
1441     $path = '/get_doc_attr';
1442     $arg .= '&attr=' . uri_escape($a->{attr});
1443     $a->{chomp_resbody} = 1;
1444     }
1445    
1446 dpavlin 44 my $rv = $self->shuttle_url( $self->{url} . $path,
1447 dpavlin 43 'application/x-www-form-urlencoded',
1448 dpavlin 44 $arg,
1449 dpavlin 45 \$resbody,
1450 dpavlin 103 $a->{croak_on_error},
1451 dpavlin 43 );
1452 dpavlin 44
1453 dpavlin 43 return if ($rv != 200);
1454 dpavlin 44
1455     if ($a->{etch}) {
1456     $self->{kwords} = {};
1457     return +{} unless ($resbody);
1458     foreach my $l (split(/\n/, $resbody)) {
1459     my ($k,$v) = split(/\t/, $l, 2);
1460     $self->{kwords}->{$k} = $v if ($v);
1461     }
1462     return $self->{kwords};
1463 dpavlin 45 } elsif ($a->{chomp_resbody}) {
1464     return unless (defined($resbody));
1465     chomp($resbody);
1466     return $resbody;
1467 dpavlin 44 } else {
1468     return new Search::Estraier::Document($resbody);
1469     }
1470 dpavlin 43 }
1471    
1472    
1473 dpavlin 48 =head2 name
1474 dpavlin 43
1475 dpavlin 48 my $node_name = $node->name;
1476 dpavlin 43
1477 dpavlin 48 =cut
1478    
1479     sub name {
1480     my $self = shift;
1481 dpavlin 111 $self->_set_info unless ($self->{inform}->{name});
1482     return $self->{inform}->{name};
1483 dpavlin 48 }
1484    
1485    
1486     =head2 label
1487    
1488     my $node_label = $node->label;
1489    
1490     =cut
1491    
1492     sub label {
1493     my $self = shift;
1494 dpavlin 111 $self->_set_info unless ($self->{inform}->{label});
1495     return $self->{inform}->{label};
1496 dpavlin 48 }
1497    
1498    
1499     =head2 doc_num
1500    
1501     my $documents_in_node = $node->doc_num;
1502    
1503     =cut
1504    
1505     sub doc_num {
1506     my $self = shift;
1507 dpavlin 111 $self->_set_info if ($self->{inform}->{dnum} < 0);
1508     return $self->{inform}->{dnum};
1509 dpavlin 48 }
1510    
1511    
1512     =head2 word_num
1513    
1514     my $words_in_node = $node->word_num;
1515    
1516     =cut
1517    
1518     sub word_num {
1519     my $self = shift;
1520 dpavlin 111 $self->_set_info if ($self->{inform}->{wnum} < 0);
1521     return $self->{inform}->{wnum};
1522 dpavlin 48 }
1523    
1524    
1525     =head2 size
1526    
1527     my $node_size = $node->size;
1528    
1529     =cut
1530    
1531     sub size {
1532     my $self = shift;
1533 dpavlin 111 $self->_set_info if ($self->{inform}->{size} < 0);
1534     return $self->{inform}->{size};
1535 dpavlin 48 }
1536    
1537    
1538 dpavlin 51 =head2 search
1539 dpavlin 48
1540 dpavlin 51 Search documents which match condition
1541    
1542     my $nres = $node->search( $cond, $depth );
1543    
1544     C<$cond> is C<Search::Estraier::Condition> object, while <$depth> specifies
1545     depth for meta search.
1546    
1547     Function results C<Search::Estraier::NodeResult> object.
1548    
1549     =cut
1550    
1551     sub search {
1552     my $self = shift;
1553     my ($cond, $depth) = @_;
1554     return unless ($cond && defined($depth) && $self->{url});
1555     croak "cond mush be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1556     croak "depth needs number, not '$depth'" unless ($depth =~ m/^\d+$/);
1557    
1558 dpavlin 52 my $resbody;
1559 dpavlin 51
1560 dpavlin 52 my $rv = $self->shuttle_url( $self->{url} . '/search',
1561 dpavlin 53 'application/x-www-form-urlencoded',
1562 dpavlin 61 $self->cond_to_query( $cond, $depth ),
1563 dpavlin 52 \$resbody,
1564     );
1565     return if ($rv != 200);
1566    
1567 dpavlin 126 my @records = split /--------\[.*?\]--------(?::END)?\r?\n/, $resbody;
1568     my $hintsText = splice @records, 0, 2; # starts with empty record
1569     my $hints = { $hintsText =~ m/^(.*?)\t(.*?)$/gsm };
1570    
1571     # process records
1572 dpavlin 128 my $docs = [];
1573 dpavlin 126 foreach my $record (@records)
1574     {
1575     # split into keys and snippets
1576     my ($keys, $snippet) = $record =~ m/^(.*?)\n\n(.*?)$/s;
1577    
1578     # create document hash
1579     my $doc = { $keys =~ m/^(.*?)=(.*?)$/gsm };
1580     $doc->{'@keywords'} = $doc->{keywords};
1581     ($doc->{keywords}) = $keys =~ m/^%VECTOR\t(.*?)$/gm;
1582     $doc->{snippet} = $snippet;
1583    
1584     push @$docs, new Search::Estraier::ResultDocument(
1585     attrs => $doc,
1586     uri => $doc->{'@uri'},
1587     snippet => $snippet,
1588     keywords => $doc->{'keywords'},
1589     );
1590     }
1591    
1592     return new Search::Estraier::NodeResult( docs => $docs, hints => $hints );
1593     }
1594    
1595    
1596 dpavlin 51 =head2 cond_to_query
1597    
1598 dpavlin 55 Return URI encoded string generated from Search::Estraier::Condition
1599    
1600 dpavlin 61 my $args = $node->cond_to_query( $cond, $depth );
1601 dpavlin 51
1602     =cut
1603    
1604     sub cond_to_query {
1605     my $self = shift;
1606    
1607     my $cond = shift || return;
1608     croak "condition must be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1609 dpavlin 61 my $depth = shift;
1610 dpavlin 51
1611     my @args;
1612    
1613     if (my $phrase = $cond->phrase) {
1614     push @args, 'phrase=' . uri_escape($phrase);
1615     }
1616    
1617     if (my @attrs = $cond->attrs) {
1618     for my $i ( 0 .. $#attrs ) {
1619 dpavlin 63 push @args,'attr' . ($i+1) . '=' . uri_escape( $attrs[$i] ) if ($attrs[$i]);
1620 dpavlin 51 }
1621     }
1622    
1623     if (my $order = $cond->order) {
1624     push @args, 'order=' . uri_escape($order);
1625     }
1626    
1627     if (my $max = $cond->max) {
1628     push @args, 'max=' . $max;
1629     } else {
1630     push @args, 'max=' . (1 << 30);
1631     }
1632    
1633     if (my $options = $cond->options) {
1634     push @args, 'options=' . $options;
1635     }
1636    
1637 dpavlin 61 push @args, 'depth=' . $depth if ($depth);
1638 dpavlin 51 push @args, 'wwidth=' . $self->{wwidth};
1639     push @args, 'hwidth=' . $self->{hwidth};
1640     push @args, 'awidth=' . $self->{awidth};
1641 dpavlin 155 push @args, 'skip=' . $cond->{skip} if ($cond->{skip});
1642 dpavlin 51
1643 dpavlin 184 if (my $distinct = $cond->distinct) {
1644     push @args, 'distinct=' . uri_escape($distinct);
1645     }
1646    
1647 dpavlin 173 if ($cond->{mask}) {
1648     my $mask = 0;
1649     map { $mask += ( 2 ** $_ ) } @{ $cond->{mask} };
1650    
1651     push @args, 'mask=' . $mask if ($mask);
1652     }
1653    
1654 dpavlin 51 return join('&', @args);
1655     }
1656    
1657    
1658 dpavlin 33 =head2 shuttle_url
1659 dpavlin 32
1660 dpavlin 68 This is method which uses C<LWP::UserAgent> to communicate with Hyper Estraier node
1661 dpavlin 33 master.
1662 dpavlin 2
1663 dpavlin 52 my $rv = shuttle_url( $url, $content_type, $req_body, \$resbody );
1664 dpavlin 2
1665 dpavlin 33 C<$resheads> and C<$resbody> booleans controll if response headers and/or response
1666     body will be saved within object.
1667 dpavlin 2
1668     =cut
1669    
1670 dpavlin 59 use LWP::UserAgent;
1671    
1672 dpavlin 33 sub shuttle_url {
1673     my $self = shift;
1674 dpavlin 2
1675 dpavlin 103 my ($url, $content_type, $reqbody, $resbody, $croak_on_error) = @_;
1676 dpavlin 2
1677 dpavlin 103 $croak_on_error = $self->{croak_on_error} unless defined($croak_on_error);
1678    
1679 dpavlin 40 $self->{status} = -1;
1680 dpavlin 33
1681 dpavlin 41 warn "## $url\n" if ($self->{debug});
1682 dpavlin 36
1683 dpavlin 33 $url = new URI($url);
1684 dpavlin 37 if (
1685     !$url || !$url->scheme || !$url->scheme eq 'http' ||
1686     !$url->host || !$url->port || $url->port < 1
1687     ) {
1688     carp "can't parse $url\n";
1689     return -1;
1690     }
1691 dpavlin 33
1692 dpavlin 59 my $ua = LWP::UserAgent->new;
1693     $ua->agent( "Search-Estraier/$Search::Estraier::VERSION" );
1694 dpavlin 33
1695 dpavlin 59 my $req;
1696 dpavlin 37 if ($reqbody) {
1697 dpavlin 59 $req = HTTP::Request->new(POST => $url);
1698 dpavlin 37 } else {
1699 dpavlin 59 $req = HTTP::Request->new(GET => $url);
1700 dpavlin 37 }
1701    
1702 dpavlin 59 $req->headers->header( 'Host' => $url->host . ":" . $url->port );
1703     $req->headers->header( 'Connection', 'close' );
1704 dpavlin 77 $req->headers->header( 'Authorization', 'Basic ' . $self->{auth} ) if ($self->{auth});
1705 dpavlin 59 $req->content_type( $content_type );
1706 dpavlin 37
1707 dpavlin 59 warn $req->headers->as_string,"\n" if ($self->{debug});
1708 dpavlin 2
1709 dpavlin 37 if ($reqbody) {
1710 dpavlin 41 warn "$reqbody\n" if ($self->{debug});
1711 dpavlin 59 $req->content( $reqbody );
1712 dpavlin 33 }
1713 dpavlin 2
1714 dpavlin 59 my $res = $ua->request($req) || croak "can't make request to $url: $!";
1715 dpavlin 2
1716 dpavlin 59 warn "## response status: ",$res->status_line,"\n" if ($self->{debug});
1717 dpavlin 2
1718 dpavlin 76 ($self->{status}, $self->{status_message}) = split(/\s+/, $res->status_line, 2);
1719    
1720 dpavlin 78 if (! $res->is_success) {
1721 dpavlin 103 if ($croak_on_error) {
1722 dpavlin 78 croak("can't get $url: ",$res->status_line);
1723     } else {
1724     return -1;
1725     }
1726     }
1727 dpavlin 2
1728 dpavlin 59 $$resbody .= $res->content;
1729    
1730 dpavlin 40 warn "## response body:\n$$resbody\n" if ($resbody && $self->{debug});
1731 dpavlin 39
1732 dpavlin 40 return $self->{status};
1733 dpavlin 2 }
1734    
1735 dpavlin 48
1736 dpavlin 55 =head2 set_snippet_width
1737 dpavlin 48
1738 dpavlin 55 Set width of snippets in results
1739    
1740     $node->set_snippet_width( $wwidth, $hwidth, $awidth );
1741    
1742     C<$wwidth> specifies whole width of snippet. It's C<480> by default. If it's C<0> snippet
1743     is not sent with results. If it is negative, whole document text is sent instead of snippet.
1744    
1745     C<$hwidth> specified width of strings from beginning of string. Default
1746     value is C<96>. Negative or zero value keep previous value.
1747    
1748     C<$awidth> specifies width of strings around each highlighted word. It's C<96> by default.
1749     If negative of zero value is provided previous value is kept unchanged.
1750    
1751     =cut
1752    
1753     sub set_snippet_width {
1754     my $self = shift;
1755    
1756     my ($wwidth, $hwidth, $awidth) = @_;
1757     $self->{wwidth} = $wwidth;
1758     $self->{hwidth} = $hwidth if ($hwidth >= 0);
1759     $self->{awidth} = $awidth if ($awidth >= 0);
1760     }
1761    
1762    
1763 dpavlin 56 =head2 set_user
1764 dpavlin 55
1765 dpavlin 56 Manage users of node
1766    
1767     $node->set_user( 'name', $mode );
1768    
1769     C<$mode> can be one of:
1770    
1771     =over 4
1772    
1773     =item 0
1774    
1775     delete account
1776    
1777     =item 1
1778    
1779     set administrative right for user
1780    
1781     =item 2
1782    
1783     set user account as guest
1784    
1785     =back
1786    
1787     Return true on success, otherwise false.
1788    
1789     =cut
1790    
1791     sub set_user {
1792     my $self = shift;
1793     my ($name, $mode) = @_;
1794    
1795     return unless ($self->{url});
1796     croak "mode must be number, not '$mode'" unless ($mode =~ m/^\d+$/);
1797    
1798     $self->shuttle_url( $self->{url} . '/_set_user',
1799 dpavlin 154 'application/x-www-form-urlencoded',
1800 dpavlin 56 'name=' . uri_escape($name) . '&mode=' . $mode,
1801     undef
1802     ) == 200;
1803     }
1804    
1805    
1806 dpavlin 57 =head2 set_link
1807    
1808     Manage node links
1809    
1810     $node->set_link('http://localhost:1978/node/another', 'another node label', $credit);
1811    
1812     If C<$credit> is negative, link is removed.
1813    
1814     =cut
1815    
1816     sub set_link {
1817     my $self = shift;
1818     my ($url, $label, $credit) = @_;
1819    
1820     return unless ($self->{url});
1821     croak "mode credit be number, not '$credit'" unless ($credit =~ m/^\d+$/);
1822    
1823     my $reqbody = 'url=' . uri_escape($url) . '&label=' . uri_escape($label);
1824     $reqbody .= '&credit=' . $credit if ($credit > 0);
1825    
1826 dpavlin 107 if ($self->shuttle_url( $self->{url} . '/_set_link',
1827 dpavlin 71 'application/x-www-form-urlencoded',
1828 dpavlin 57 $reqbody,
1829     undef
1830 dpavlin 107 ) == 200) {
1831     # refresh node info after adding link
1832 dpavlin 151 $self->_clear_info;
1833 dpavlin 107 return 1;
1834     }
1835 dpavlin 150 return undef;
1836 dpavlin 57 }
1837    
1838 dpavlin 107 =head2 admins
1839 dpavlin 57
1840 dpavlin 107 my @admins = @{ $node->admins };
1841    
1842     Return array of users with admin rights on node
1843    
1844     =cut
1845    
1846     sub admins {
1847     my $self = shift;
1848 dpavlin 111 $self->_set_info unless ($self->{inform}->{name});
1849     return $self->{inform}->{admins};
1850 dpavlin 107 }
1851    
1852     =head2 guests
1853    
1854     my @guests = @{ $node->guests };
1855    
1856     Return array of users with guest rights on node
1857    
1858     =cut
1859    
1860     sub guests {
1861     my $self = shift;
1862 dpavlin 111 $self->_set_info unless ($self->{inform}->{name});
1863     return $self->{inform}->{guests};
1864 dpavlin 107 }
1865    
1866     =head2 links
1867    
1868     my $links = @{ $node->links };
1869    
1870     Return array of links for this node
1871    
1872     =cut
1873    
1874     sub links {
1875     my $self = shift;
1876 dpavlin 111 $self->_set_info unless ($self->{inform}->{name});
1877     return $self->{inform}->{links};
1878 dpavlin 107 }
1879    
1880 dpavlin 160 =head2 cacheusage
1881    
1882     Return cache usage for a node
1883    
1884     my $cache = $node->cacheusage;
1885    
1886     =cut
1887    
1888     sub cacheusage {
1889     my $self = shift;
1890    
1891     return unless ($self->{url});
1892    
1893     my $resbody;
1894     my $rv = $self->shuttle_url( $self->{url} . '/cacheusage',
1895     'text/plain',
1896     undef,
1897     \$resbody,
1898     );
1899    
1900     return if ($rv != 200 || !$resbody);
1901    
1902     return $resbody;
1903     }
1904    
1905 dpavlin 134 =head2 master
1906 dpavlin 107
1907 dpavlin 134 Set actions on Hyper Estraier node master (C<estmaster> process)
1908    
1909     $node->master(
1910     action => 'sync'
1911     );
1912    
1913     All available actions are documented in
1914     L<http://hyperestraier.sourceforge.net/nguide-en.html#protocol>
1915    
1916     =cut
1917    
1918     my $estmaster_rest = {
1919     shutdown => {
1920     status => 202,
1921     },
1922     sync => {
1923     status => 202,
1924     },
1925     backup => {
1926     status => 202,
1927     },
1928     userlist => {
1929     status => 200,
1930 dpavlin 135 returns => [ qw/name passwd flags fname misc/ ],
1931 dpavlin 134 },
1932     useradd => {
1933 dpavlin 135 required => [ qw/name passwd flags/ ],
1934     optional => [ qw/fname misc/ ],
1935 dpavlin 134 status => 200,
1936     },
1937     userdel => {
1938 dpavlin 135 required => [ qw/name/ ],
1939 dpavlin 134 status => 200,
1940     },
1941     nodelist => {
1942     status => 200,
1943 dpavlin 135 returns => [ qw/name label doc_num word_num size/ ],
1944 dpavlin 134 },
1945     nodeadd => {
1946 dpavlin 135 required => [ qw/name/ ],
1947     optional => [ qw/label/ ],
1948 dpavlin 134 status => 200,
1949     },
1950     nodedel => {
1951 dpavlin 135 required => [ qw/name/ ],
1952 dpavlin 134 status => 200,
1953     },
1954     nodeclr => {
1955 dpavlin 135 required => [ qw/name/ ],
1956 dpavlin 134 status => 200,
1957     },
1958     nodertt => {
1959     status => 200,
1960     },
1961     };
1962    
1963     sub master {
1964     my $self = shift;
1965    
1966     my $args = {@_};
1967    
1968     # have action?
1969     my $action = $args->{action} || croak "need action, available: ",
1970     join(", ",keys %{ $estmaster_rest });
1971    
1972     # check if action is valid
1973     my $rest = $estmaster_rest->{$action};
1974     croak "action '$action' is not supported, available actions: ",
1975     join(", ",keys %{ $estmaster_rest }) unless ($rest);
1976    
1977     croak "BUG: action '$action' needs return status" unless ($rest->{status});
1978    
1979     my @args;
1980    
1981     if ($rest->{required} || $rest->{optional}) {
1982    
1983     map {
1984     croak "need parametar '$_' for action '$action'" unless ($args->{$_});
1985     push @args, $_ . '=' . uri_escape( $args->{$_} );
1986 dpavlin 136 } ( @{ $rest->{required} } );
1987 dpavlin 134
1988     map {
1989     push @args, $_ . '=' . uri_escape( $args->{$_} ) if ($args->{$_});
1990 dpavlin 136 } ( @{ $rest->{optional} } );
1991 dpavlin 134
1992     }
1993    
1994     my $uri = new URI( $self->{url} );
1995    
1996     my $resbody;
1997    
1998 dpavlin 135 my $status = $self->shuttle_url(
1999 dpavlin 134 'http://' . $uri->host_port . '/master?action=' . $action ,
2000     'application/x-www-form-urlencoded',
2001     join('&', @args),
2002     \$resbody,
2003     1,
2004 dpavlin 135 ) or confess "shuttle_url failed";
2005 dpavlin 134
2006 dpavlin 135 if ($status == $rest->{status}) {
2007 dpavlin 149
2008     # refresh node info after sync
2009 dpavlin 151 $self->_clear_info if ($action eq 'sync' || $action =~ m/^node(?:add|del|clr)$/);
2010 dpavlin 149
2011 dpavlin 135 if ($rest->{returns} && wantarray) {
2012 dpavlin 134
2013     my @results;
2014 dpavlin 135 my $fields = $#{$rest->{returns}};
2015 dpavlin 134
2016     foreach my $line ( split(/[\r\n]/,$resbody) ) {
2017 dpavlin 135 my @e = split(/\t/, $line, $fields + 1);
2018 dpavlin 134 my $row;
2019 dpavlin 135 foreach my $i ( 0 .. $fields) {
2020     $row->{ $rest->{returns}->[$i] } = $e[ $i ];
2021     }
2022 dpavlin 134 push @results, $row;
2023     }
2024    
2025     return @results;
2026    
2027 dpavlin 135 } elsif ($resbody) {
2028 dpavlin 136 chomp $resbody;
2029 dpavlin 134 return $resbody;
2030 dpavlin 135 } else {
2031     return 0E0;
2032 dpavlin 134 }
2033     }
2034 dpavlin 135
2035     carp "expected status $rest->{status}, but got $status";
2036     return undef;
2037 dpavlin 134 }
2038    
2039 dpavlin 55 =head1 PRIVATE METHODS
2040    
2041     You could call those directly, but you don't have to. I hope.
2042    
2043     =head2 _set_info
2044    
2045 dpavlin 48 Set information for node
2046    
2047 dpavlin 55 $node->_set_info;
2048 dpavlin 48
2049     =cut
2050    
2051 dpavlin 55 sub _set_info {
2052 dpavlin 48 my $self = shift;
2053    
2054     $self->{status} = -1;
2055     return unless ($self->{url});
2056    
2057     my $resbody;
2058     my $rv = $self->shuttle_url( $self->{url} . '/inform',
2059     'text/plain',
2060     undef,
2061     \$resbody,
2062     );
2063    
2064     return if ($rv != 200 || !$resbody);
2065    
2066 dpavlin 107 my @lines = split(/[\r\n]/,$resbody);
2067 dpavlin 48
2068 dpavlin 151 $self->_clear_info;
2069 dpavlin 111
2070     ( $self->{inform}->{name}, $self->{inform}->{label}, $self->{inform}->{dnum},
2071     $self->{inform}->{wnum}, $self->{inform}->{size} ) = split(/\t/, shift @lines, 5);
2072    
2073 dpavlin 107 return $resbody unless (@lines);
2074    
2075     shift @lines;
2076    
2077     while(my $admin = shift @lines) {
2078 dpavlin 111 push @{$self->{inform}->{admins}}, $admin;
2079 dpavlin 107 }
2080 dpavlin 111
2081 dpavlin 107 while(my $guest = shift @lines) {
2082 dpavlin 111 push @{$self->{inform}->{guests}}, $guest;
2083 dpavlin 107 }
2084    
2085     while(my $link = shift @lines) {
2086 dpavlin 111 push @{$self->{inform}->{links}}, $link;
2087 dpavlin 107 }
2088    
2089     return $resbody;
2090    
2091 dpavlin 48 }
2092    
2093 dpavlin 151 =head2 _clear_info
2094    
2095     Clear information for node
2096    
2097     $node->_clear_info;
2098    
2099     On next call to C<name>, C<label>, C<doc_num>, C<word_num> or C<size> node
2100     info will be fetch again from Hyper Estraier.
2101    
2102     =cut
2103     sub _clear_info {
2104     my $self = shift;
2105     $self->{inform} = {
2106     dnum => -1,
2107     wnum => -1,
2108     size => -1.0,
2109     };
2110     }
2111    
2112 dpavlin 2 ###
2113    
2114     =head1 EXPORT
2115    
2116     Nothing.
2117    
2118     =head1 SEE ALSO
2119    
2120     L<http://hyperestraier.sourceforge.net/>
2121    
2122     Hyper Estraier Ruby interface on which this module is based.
2123    
2124 dpavlin 164 Hyper Estraier now also has pure-perl binding included in distribution. It's
2125     a faster way to access databases directly if you are not running
2126     C<estmaster> P2P server.
2127    
2128 dpavlin 2 =head1 AUTHOR
2129    
2130     Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
2131    
2132 dpavlin 128 Robert Klep E<lt>robert@klep.nameE<gt> contributed refactored search code
2133 dpavlin 2
2134     =head1 COPYRIGHT AND LICENSE
2135    
2136 dpavlin 15 Copyright (C) 2005-2006 by Dobrica Pavlinusic
2137 dpavlin 2
2138     This library is free software; you can redistribute it and/or modify
2139     it under the GPL v2 or later.
2140    
2141     =cut
2142    
2143     1;

  ViewVC Help
Powered by ViewVC 1.1.26