/[Search-Estraier]/trunk/lib/Search/Estraier.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/Search/Estraier.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 53 - (hide annotations)
Fri Jan 6 14:39:45 2006 UTC (18 years, 2 months ago) by dpavlin
Original Path: trunk/Estraier.pm
File size: 25801 byte(s)
search work (Content-type and attributes fix), NodeResult->doc_num now
return proper number of hits (and not index of last one which isi
doc_num - 1)
1 dpavlin 2 package Search::Estraier;
2    
3     use 5.008;
4     use strict;
5     use warnings;
6    
7     our $VERSION = '0.00';
8    
9     =head1 NAME
10    
11     Search::Estraier - pure perl module to use Hyper Estraier search engine
12    
13     =head1 SYNOPSIS
14    
15     use Search::Estraier;
16     my $est = new Search::Estraier();
17    
18     =head1 DESCRIPTION
19    
20     This module is implementation of node API of Hyper Estraier. Since it's
21     perl-only module with dependencies only on standard perl modules, it will
22     run on all platforms on which perl runs. It doesn't require compilation
23     or Hyper Estraier development files on target machine.
24    
25     It is implemented as multiple packages which closly resamble Ruby
26     implementation. It also includes methods to manage nodes.
27    
28     =cut
29    
30 dpavlin 42 =head1 Inheritable common methods
31    
32     This methods should really move somewhere else.
33    
34 dpavlin 15 =head2 _s
35    
36     Remove multiple whitespaces from string, as well as whitespaces at beginning or end
37    
38     my $text = $self->_s(" this is a text ");
39     $text = 'this is a text';
40    
41     =cut
42    
43     sub _s {
44     my $text = $_[1] || return;
45     $text =~ s/\s\s+/ /gs;
46     $text =~ s/^\s+//;
47     $text =~ s/\s+$//;
48     return $text;
49     }
50    
51 dpavlin 2 package Search::Estraier::Document;
52    
53 dpavlin 9 use Carp qw/croak confess/;
54 dpavlin 7
55 dpavlin 15 use Search::Estraier;
56     our @ISA = qw/Search::Estraier/;
57    
58 dpavlin 2 =head1 Search::Estraier::Document
59    
60 dpavlin 14 This class implements Document which is collection of attributes
61     (key=value), vectors (also key value) display text and hidden text.
62    
63 dpavlin 42
64 dpavlin 2 =head2 new
65    
66 dpavlin 14 Create new document, empty or from draft.
67    
68 dpavlin 2 my $doc = new Search::HyperEstraier::Document;
69 dpavlin 14 my $doc2 = new Search::HyperEstraier::Document( $draft );
70 dpavlin 2
71     =cut
72    
73     sub new {
74     my $class = shift;
75 dpavlin 14 my $self = {};
76 dpavlin 2 bless($self, $class);
77    
78 dpavlin 6 $self->{id} = -1;
79    
80 dpavlin 14 my $draft = shift;
81    
82     if ($draft) {
83     my $in_text = 0;
84     foreach my $line (split(/\n/, $draft)) {
85    
86     if ($in_text) {
87     if ($line =~ /^\t/) {
88     push @{ $self->{htexts} }, substr($line, 1);
89     } else {
90     push @{ $self->{dtexts} }, $line;
91     }
92     next;
93     }
94    
95     if ($line =~ m/^%VECTOR\t(.+)$/) {
96     my @fields = split(/\t/, $1);
97     for my $i ( 0 .. ($#fields - 1) ) {
98     $self->{kwords}->{ $fields[ $i ] } = $fields[ $i + 1 ];
99     $i++;
100     }
101     next;
102     } elsif ($line =~ m/^%/) {
103     # What is this? comment?
104     #warn "$line\n";
105     next;
106     } elsif ($line =~ m/^$/) {
107     $in_text = 1;
108     next;
109     } elsif ($line =~ m/^(.+)=(.+)$/) {
110     $self->{attrs}->{ $1 } = $2;
111     next;
112     }
113    
114     warn "draft ignored: $line\n";
115     }
116     }
117    
118 dpavlin 2 $self ? return $self : return undef;
119     }
120    
121 dpavlin 4
122 dpavlin 2 =head2 add_attr
123    
124 dpavlin 6 Add an attribute.
125    
126 dpavlin 2 $doc->add_attr( name => 'value' );
127    
128 dpavlin 9 Delete attribute using
129 dpavlin 5
130     $doc->add_attr( name => undef );
131    
132 dpavlin 2 =cut
133    
134     sub add_attr {
135     my $self = shift;
136     my $attrs = {@_};
137    
138     while (my ($name, $value) = each %{ $attrs }) {
139 dpavlin 9 if (! defined($value)) {
140 dpavlin 15 delete( $self->{attrs}->{ $self->_s($name) } );
141 dpavlin 9 } else {
142 dpavlin 15 $self->{attrs}->{ $self->_s($name) } = $self->_s($value);
143 dpavlin 9 }
144 dpavlin 2 }
145 dpavlin 8
146     return 1;
147 dpavlin 2 }
148    
149 dpavlin 5
150     =head2 add_text
151    
152 dpavlin 6 Add a sentence of text.
153    
154 dpavlin 5 $doc->add_text('this is example text to display');
155    
156     =cut
157    
158     sub add_text {
159     my $self = shift;
160     my $text = shift;
161     return unless defined($text);
162    
163 dpavlin 15 push @{ $self->{dtexts} }, $self->_s($text);
164 dpavlin 5 }
165    
166    
167     =head2 add_hidden_text
168    
169 dpavlin 6 Add a hidden sentence.
170    
171 dpavlin 5 $doc->add_hidden_text('this is example text just for search');
172    
173     =cut
174    
175     sub add_hidden_text {
176     my $self = shift;
177     my $text = shift;
178     return unless defined($text);
179    
180 dpavlin 15 push @{ $self->{htexts} }, $self->_s($text);
181 dpavlin 5 }
182    
183 dpavlin 42
184 dpavlin 6 =head2 id
185    
186     Get the ID number of document. If the object has never been registred, C<-1> is returned.
187    
188     print $doc->id;
189    
190     =cut
191    
192     sub id {
193     my $self = shift;
194     return $self->{id};
195     }
196    
197 dpavlin 42
198 dpavlin 7 =head2 attr_names
199    
200 dpavlin 9 Returns array with attribute names from document object.
201 dpavlin 7
202     my @attrs = $doc->attr_names;
203    
204     =cut
205    
206     sub attr_names {
207     my $self = shift;
208 dpavlin 9 croak "attr_names return array, not scalar" if (! wantarray);
209 dpavlin 7 return sort keys %{ $self->{attrs} };
210     }
211    
212 dpavlin 8
213     =head2 attr
214    
215 dpavlin 9 Returns value of an attribute.
216 dpavlin 8
217     my $value = $doc->attr( 'attribute' );
218    
219     =cut
220    
221     sub attr {
222     my $self = shift;
223     my $name = shift;
224    
225     return $self->{'attrs'}->{ $name };
226     }
227    
228 dpavlin 9
229     =head2 texts
230    
231     Returns array with text sentences.
232    
233     my @texts = $doc->texts;
234    
235     =cut
236    
237     sub texts {
238     my $self = shift;
239 dpavlin 12 confess "texts return array, not scalar" if (! wantarray);
240 dpavlin 11 return @{ $self->{dtexts} };
241 dpavlin 9 }
242    
243 dpavlin 42
244 dpavlin 12 =head2 cat_texts
245    
246     Return whole text as single scalar.
247    
248     my $text = $doc->cat_texts;
249    
250     =cut
251    
252     sub cat_texts {
253     my $self = shift;
254     return join(' ',@{ $self->{dtexts} });
255     }
256    
257 dpavlin 42
258 dpavlin 5 =head2 dump_draft
259    
260 dpavlin 13 Dump draft data from document object.
261    
262 dpavlin 5 print $doc->dump_draft;
263    
264     =cut
265    
266     sub dump_draft {
267 dpavlin 13 my $self = shift;
268     my $draft;
269    
270     foreach my $attr_name (sort keys %{ $self->{attrs} }) {
271     $draft .= $attr_name . '=' . $self->{attrs}->{$attr_name} . "\n";
272     }
273    
274     if ($self->{kwords}) {
275     $draft .= '%%VECTOR';
276     while (my ($key, $value) = each %{ $self->{kwords} }) {
277     $draft .= "\t$key\t$value";
278     }
279     $draft .= "\n";
280     }
281    
282     $draft .= "\n";
283    
284 dpavlin 40 $draft .= join("\n", @{ $self->{dtexts} }) . "\n" if ($self->{dtexts});
285     $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n" if ($self->{htexts});
286 dpavlin 13
287     return $draft;
288 dpavlin 5 }
289    
290 dpavlin 42
291 dpavlin 4 =head2 delete
292 dpavlin 2
293 dpavlin 4 Empty document object
294 dpavlin 2
295 dpavlin 4 $doc->delete;
296    
297 dpavlin 15 This function is addition to original Ruby API, and since it was included in C wrappers it's here as a
298     convinience. Document objects which go out of scope will be destroyed
299     automatically.
300    
301 dpavlin 4 =cut
302    
303     sub delete {
304     my $self = shift;
305    
306 dpavlin 14 foreach my $data (qw/attrs dtexts stexts kwords/) {
307 dpavlin 5 delete($self->{$data});
308     }
309 dpavlin 4
310 dpavlin 10 $self->{id} = -1;
311    
312 dpavlin 4 return 1;
313     }
314    
315    
316    
317 dpavlin 15 package Search::Estraier::Condition;
318 dpavlin 4
319 dpavlin 16 use Carp qw/confess croak/;
320    
321 dpavlin 15 use Search::Estraier;
322     our @ISA = qw/Search::Estraier/;
323 dpavlin 4
324 dpavlin 16 =head1 Search::Estraier::Condition
325    
326     =head2 new
327    
328     my $cond = new Search::HyperEstraier::Condition;
329    
330     =cut
331    
332     sub new {
333     my $class = shift;
334     my $self = {};
335     bless($self, $class);
336    
337 dpavlin 19 $self->{max} = -1;
338     $self->{options} = 0;
339    
340 dpavlin 16 $self ? return $self : return undef;
341     }
342    
343 dpavlin 42
344 dpavlin 16 =head2 set_phrase
345    
346     $cond->set_phrase('search phrase');
347    
348     =cut
349    
350     sub set_phrase {
351     my $self = shift;
352     $self->{phrase} = $self->_s( shift );
353     }
354    
355 dpavlin 42
356 dpavlin 16 =head2 add_attr
357    
358     $cond->add_attr('@URI STRINC /~dpavlin/');
359    
360     =cut
361    
362     sub add_attr {
363     my $self = shift;
364     my $attr = shift || return;
365     push @{ $self->{attrs} }, $self->_s( $attr );
366     }
367    
368 dpavlin 42
369 dpavlin 16 =head2 set_order
370    
371     $cond->set_order('@mdate NUMD');
372    
373     =cut
374    
375     sub set_order {
376     my $self = shift;
377     $self->{order} = shift;
378     }
379    
380 dpavlin 42
381 dpavlin 16 =head2 set_max
382    
383     $cond->set_max(42);
384    
385     =cut
386    
387     sub set_max {
388     my $self = shift;
389     my $max = shift;
390 dpavlin 43 croak "set_max needs number, not '$max'" unless ($max =~ m/^\d+$/);
391 dpavlin 16 $self->{max} = $max;
392     }
393    
394 dpavlin 42
395 dpavlin 16 =head2 set_options
396    
397     $cond->set_options( SURE => 1 );
398    
399     =cut
400    
401 dpavlin 15 my $options = {
402     # check N-gram keys skipping by three
403     SURE => 1 << 0,
404     # check N-gram keys skipping by two
405     USUAL => 1 << 1,
406     # without TF-IDF tuning
407     FAST => 1 << 2,
408     # with the simplified phrase
409     AGITO => 1 << 3,
410     # check every N-gram key
411     NOIDF => 1 << 4,
412     # check N-gram keys skipping by one
413     SIMPLE => 1 << 10,
414     };
415    
416 dpavlin 16 sub set_options {
417     my $self = shift;
418     my $option = shift;
419     confess "unknown option" unless ($options->{$option});
420     $self->{options} ||= $options->{$option};
421 dpavlin 4 }
422    
423 dpavlin 42
424 dpavlin 18 =head2 phrase
425    
426     Return search phrase.
427    
428     print $cond->phrase;
429    
430     =cut
431    
432     sub phrase {
433     my $self = shift;
434     return $self->{phrase};
435     }
436    
437 dpavlin 42
438 dpavlin 19 =head2 order
439 dpavlin 18
440 dpavlin 19 Return search result order.
441    
442     print $cond->order;
443    
444     =cut
445    
446     sub order {
447     my $self = shift;
448     return $self->{order};
449     }
450    
451 dpavlin 42
452 dpavlin 19 =head2 attrs
453    
454     Return search result attrs.
455    
456     my @cond_attrs = $cond->attrs;
457    
458     =cut
459    
460     sub attrs {
461     my $self = shift;
462     #croak "attrs return array, not scalar" if (! wantarray);
463     return @{ $self->{attrs} };
464     }
465    
466 dpavlin 42
467 dpavlin 19 =head2 max
468    
469     Return maximum number of results.
470    
471     print $cond->max;
472    
473     C<-1> is returned for unitialized value, C<0> is unlimited.
474    
475     =cut
476    
477     sub max {
478     my $self = shift;
479     return $self->{max};
480     }
481    
482 dpavlin 42
483 dpavlin 19 =head2 options
484    
485     Return options for this condition.
486    
487     print $cond->options;
488    
489     Options are returned in numerical form.
490    
491     =cut
492    
493     sub options {
494     my $self = shift;
495     return $self->{options};
496     }
497    
498    
499 dpavlin 20 package Search::Estraier::ResultDocument;
500    
501 dpavlin 24 use Carp qw/croak/;
502 dpavlin 20
503 dpavlin 24 #use Search::Estraier;
504     #our @ISA = qw/Search::Estraier/;
505 dpavlin 20
506     =head1 Search::Estraier::ResultDocument
507    
508     =head2 new
509    
510 dpavlin 23 my $rdoc = new Search::HyperEstraier::ResultDocument(
511 dpavlin 20 uri => 'http://localhost/document/uri/42',
512     attrs => {
513     foo => 1,
514     bar => 2,
515     },
516     snippet => 'this is a text of snippet'
517     keywords => 'this\tare\tkeywords'
518     );
519    
520     =cut
521    
522     sub new {
523     my $class = shift;
524     my $self = {@_};
525     bless($self, $class);
526    
527     foreach my $f (qw/uri attrs snippet keywords/) {
528     croak "missing $f for ResultDocument" unless defined($self->{$f});
529     }
530    
531     $self ? return $self : return undef;
532     }
533    
534 dpavlin 42
535 dpavlin 23 =head2 uri
536 dpavlin 20
537 dpavlin 23 Return URI of result document
538 dpavlin 20
539 dpavlin 23 print $rdoc->uri;
540    
541     =cut
542    
543     sub uri {
544     my $self = shift;
545     return $self->{uri};
546     }
547    
548    
549     =head2 attr_names
550    
551     Returns array with attribute names from result document object.
552    
553     my @attrs = $rdoc->attr_names;
554    
555     =cut
556    
557     sub attr_names {
558     my $self = shift;
559     croak "attr_names return array, not scalar" if (! wantarray);
560     return sort keys %{ $self->{attrs} };
561     }
562    
563 dpavlin 42
564 dpavlin 23 =head2 attr
565    
566     Returns value of an attribute.
567    
568     my $value = $rdoc->attr( 'attribute' );
569    
570     =cut
571    
572     sub attr {
573     my $self = shift;
574     my $name = shift || return;
575     return $self->{attrs}->{ $name };
576     }
577    
578 dpavlin 42
579 dpavlin 23 =head2 snippet
580    
581     Return snippet from result document
582    
583     print $rdoc->snippet;
584    
585     =cut
586    
587     sub snippet {
588     my $self = shift;
589     return $self->{snippet};
590     }
591    
592 dpavlin 42
593 dpavlin 23 =head2 keywords
594    
595     Return keywords from result document
596    
597     print $rdoc->keywords;
598    
599     =cut
600    
601     sub keywords {
602     my $self = shift;
603     return $self->{keywords};
604     }
605    
606    
607 dpavlin 25 package Search::Estraier::NodeResult;
608    
609     use Carp qw/croak/;
610    
611     #use Search::Estraier;
612     #our @ISA = qw/Search::Estraier/;
613    
614     =head1 Search::Estraier::NodeResult
615    
616     =head2 new
617    
618     my $res = new Search::HyperEstraier::NodeResult(
619     docs => @array_of_rdocs,
620     hits => %hash_with_hints,
621     );
622    
623     =cut
624    
625     sub new {
626     my $class = shift;
627     my $self = {@_};
628     bless($self, $class);
629    
630     foreach my $f (qw/docs hints/) {
631     croak "missing $f for ResultDocument" unless defined($self->{$f});
632     }
633    
634     $self ? return $self : return undef;
635     }
636    
637 dpavlin 42
638 dpavlin 25 =head2 doc_num
639    
640     Return number of documents
641    
642     print $res->doc_num;
643    
644     =cut
645    
646     sub doc_num {
647     my $self = shift;
648 dpavlin 53 return $#{$self->{docs}} + 1;
649 dpavlin 25 }
650    
651 dpavlin 42
652 dpavlin 25 =head2 get_doc
653    
654     Return single document
655    
656     my $doc = $res->get_doc( 42 );
657    
658     Returns undef if document doesn't exist.
659    
660     =cut
661    
662     sub get_doc {
663     my $self = shift;
664     my $num = shift;
665 dpavlin 43 croak "expect number as argument, not '$num'" unless ($num =~ m/^\d+$/);
666 dpavlin 25 return undef if ($num < 0 || $num > $self->{docs});
667     return $self->{docs}->[$num];
668     }
669    
670 dpavlin 42
671 dpavlin 25 =head2 hint
672    
673     Return specific hint from results.
674    
675     print $rec->hint( 'VERSION' );
676    
677     Possible hints are: C<VERSION>, C<NODE>, C<HIT>, C<HINT#n>, C<DOCNUM>, C<WORDNUM>,
678     C<TIME>, C<LINK#n>, C<VIEW>.
679    
680     =cut
681    
682     sub hint {
683     my $self = shift;
684     my $key = shift || return;
685     return $self->{hints}->{$key};
686     }
687    
688    
689 dpavlin 27 package Search::Estraier::Node;
690    
691 dpavlin 44 use Carp qw/carp croak confess/;
692 dpavlin 33 use URI;
693 dpavlin 36 use MIME::Base64;
694 dpavlin 33 use IO::Socket::INET;
695 dpavlin 49 use URI::Escape qw/uri_escape/;
696 dpavlin 29
697 dpavlin 27 =head1 Search::Estraier::Node
698    
699     =head2 new
700    
701     my $node = new Search::HyperEstraier::Node;
702    
703     =cut
704    
705     sub new {
706     my $class = shift;
707     my $self = {
708     pxport => -1,
709 dpavlin 33 timeout => 0, # this used to be -1
710 dpavlin 27 dnum => -1,
711     wnum => -1,
712     size => -1.0,
713     wwidth => 480,
714     hwidth => 96,
715     awidth => 96,
716     status => -1,
717     };
718     bless($self, $class);
719    
720 dpavlin 39 if (@_) {
721 dpavlin 41 $self->{debug} = shift;
722 dpavlin 39 warn "## Node debug on\n";
723     }
724    
725 dpavlin 27 $self ? return $self : return undef;
726     }
727    
728 dpavlin 42
729 dpavlin 29 =head2 set_url
730    
731     Specify URL to node server
732    
733     $node->set_url('http://localhost:1978');
734    
735     =cut
736    
737     sub set_url {
738     my $self = shift;
739     $self->{url} = shift;
740     }
741    
742 dpavlin 42
743 dpavlin 29 =head2 set_proxy
744    
745     Specify proxy server to connect to node server
746    
747     $node->set_proxy('proxy.example.com', 8080);
748    
749     =cut
750    
751     sub set_proxy {
752     my $self = shift;
753     my ($host,$port) = @_;
754 dpavlin 43 croak "proxy port must be number, not '$port'" unless ($port =~ m/^\d+$/);
755 dpavlin 29 $self->{pxhost} = $host;
756     $self->{pxport} = $port;
757     }
758    
759 dpavlin 42
760 dpavlin 30 =head2 set_timeout
761    
762     Specify timeout of connection in seconds
763    
764     $node->set_timeout( 15 );
765    
766     =cut
767    
768     sub set_timeout {
769     my $self = shift;
770     my $sec = shift;
771 dpavlin 43 croak "timeout must be number, not '$sec'" unless ($sec =~ m/^\d+$/);
772 dpavlin 30 $self->{timeout} = $sec;
773     }
774    
775 dpavlin 42
776 dpavlin 31 =head2 set_auth
777    
778     Specify name and password for authentication to node server.
779    
780     $node->set_auth('clint','eastwood');
781    
782     =cut
783    
784     sub set_auth {
785     my $self = shift;
786     my ($login,$passwd) = @_;
787 dpavlin 40 my $basic_auth = encode_base64( "$login:$passwd" );
788     chomp($basic_auth);
789     $self->{auth} = $basic_auth;
790 dpavlin 31 }
791    
792 dpavlin 42
793 dpavlin 32 =head2 status
794    
795     Return status code of last request.
796    
797 dpavlin 40 print $node->status;
798 dpavlin 32
799     C<-1> means connection failure.
800    
801     =cut
802    
803     sub status {
804     my $self = shift;
805     return $self->{status};
806     }
807    
808 dpavlin 42
809 dpavlin 40 =head2 put_doc
810    
811 dpavlin 41 Add a document
812 dpavlin 40
813 dpavlin 41 $node->put_doc( $document_draft ) or die "can't add document";
814    
815     Return true on success or false on failture.
816    
817 dpavlin 40 =cut
818    
819     sub put_doc {
820     my $self = shift;
821     my $doc = shift || return;
822 dpavlin 47 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
823 dpavlin 41 $self->shuttle_url( $self->{url} . '/put_doc',
824     'text/x-estraier-draft',
825     $doc->dump_draft,
826     undef
827     ) == 200;
828 dpavlin 40 }
829    
830 dpavlin 41
831     =head2 out_doc
832    
833     Remove a document
834    
835     $node->out_doc( document_id ) or "can't remove document";
836    
837     Return true on success or false on failture.
838    
839     =cut
840    
841     sub out_doc {
842     my $self = shift;
843     my $id = shift || return;
844     return unless ($self->{url});
845 dpavlin 43 croak "id must be number, not '$id'" unless ($id =~ m/^\d+$/);
846 dpavlin 41 $self->shuttle_url( $self->{url} . '/out_doc',
847     'application/x-www-form-urlencoded',
848     "id=$id",
849     undef
850     ) == 200;
851     }
852    
853    
854     =head2 out_doc_by_uri
855    
856     Remove a registrated document using it's uri
857    
858 dpavlin 45 $node->out_doc_by_uri( 'file:///document/uri/42' ) or "can't remove document";
859 dpavlin 41
860     Return true on success or false on failture.
861    
862     =cut
863    
864     sub out_doc_by_uri {
865     my $self = shift;
866     my $uri = shift || return;
867     return unless ($self->{url});
868     $self->shuttle_url( $self->{url} . '/out_doc',
869     'application/x-www-form-urlencoded',
870 dpavlin 50 "uri=" . uri_escape($uri),
871 dpavlin 41 undef
872     ) == 200;
873     }
874    
875 dpavlin 42
876     =head2 edit_doc
877    
878     Edit attributes of a document
879    
880     $node->edit_doc( $document_draft ) or die "can't edit document";
881    
882     Return true on success or false on failture.
883    
884     =cut
885    
886     sub edit_doc {
887     my $self = shift;
888     my $doc = shift || return;
889 dpavlin 47 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
890 dpavlin 42 $self->shuttle_url( $self->{url} . '/edit_doc',
891     'text/x-estraier-draft',
892     $doc->dump_draft,
893     undef
894     ) == 200;
895     }
896    
897    
898 dpavlin 43 =head2 get_doc
899    
900     Retreive document
901    
902     my $doc = $node->get_doc( document_id ) or die "can't get document";
903    
904     Return true on success or false on failture.
905    
906     =cut
907    
908     sub get_doc {
909     my $self = shift;
910     my $id = shift || return;
911     return $self->_fetch_doc( id => $id );
912     }
913    
914 dpavlin 44
915 dpavlin 43 =head2 get_doc_by_uri
916    
917     Retreive document
918    
919 dpavlin 45 my $doc = $node->get_doc_by_uri( 'file:///document/uri/42' ) or die "can't get document";
920 dpavlin 43
921     Return true on success or false on failture.
922    
923     =cut
924    
925     sub get_doc_by_uri {
926     my $self = shift;
927     my $uri = shift || return;
928     return $self->_fetch_doc( uri => $uri );
929     }
930    
931 dpavlin 44
932 dpavlin 49 =head2 get_doc_attr
933    
934     Retrieve the value of an atribute from object
935    
936     my $val = $node->get_doc_attr( document_id, 'attribute_name' ) or
937     die "can't get document attribute";
938    
939     =cut
940    
941     sub get_doc_attr {
942     my $self = shift;
943     my ($id,$name) = @_;
944     return unless ($id && $name);
945     return $self->_fetch_doc( id => $id, attr => $name );
946     }
947    
948    
949     =head2 get_doc_attr_by_uri
950    
951     Retrieve the value of an atribute from object
952    
953     my $val = $node->get_doc_attr_by_uri( document_id, 'attribute_name' ) or
954     die "can't get document attribute";
955    
956     =cut
957    
958     sub get_doc_attr_by_uri {
959     my $self = shift;
960     my ($uri,$name) = @_;
961     return unless ($uri && $name);
962     return $self->_fetch_doc( uri => $uri, attr => $name );
963     }
964    
965    
966 dpavlin 44 =head2 etch_doc
967    
968     Exctract document keywords
969    
970     my $keywords = $node->etch_doc( document_id ) or die "can't etch document";
971    
972     =cut
973    
974 dpavlin 49 sub etch_doc {
975 dpavlin 44 my $self = shift;
976     my $id = shift || return;
977     return $self->_fetch_doc( id => $id, etch => 1 );
978     }
979    
980     =head2 etch_doc_by_uri
981    
982     Retreive document
983    
984 dpavlin 45 my $keywords = $node->etch_doc_by_uri( 'file:///document/uri/42' ) or die "can't etch document";
985 dpavlin 44
986     Return true on success or false on failture.
987    
988     =cut
989    
990     sub etch_doc_by_uri {
991     my $self = shift;
992     my $uri = shift || return;
993     return $self->_fetch_doc( uri => $uri, etch => 1 );
994     }
995    
996    
997 dpavlin 45 =head2 uri_to_id
998    
999     Get ID of document specified by URI
1000    
1001     my $id = $node->uri_to_id( 'file:///document/uri/42' );
1002    
1003     =cut
1004    
1005     sub uri_to_id {
1006     my $self = shift;
1007     my $uri = shift || return;
1008     return $self->_fetch_doc( uri => $uri, path => '/uri_to_id', chomp_resbody => 1 );
1009     }
1010    
1011    
1012 dpavlin 43 =head2 _fetch_doc
1013    
1014 dpavlin 44 Private function used for implementing of C<get_doc>, C<get_doc_by_uri>,
1015     C<etch_doc>, C<etch_doc_by_uri>.
1016 dpavlin 43
1017 dpavlin 45 # this will decode received draft into Search::Estraier::Document object
1018     my $doc = $node->_fetch_doc( id => 42 );
1019     my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42' );
1020 dpavlin 43
1021 dpavlin 45 # to extract keywords, add etch
1022     my $doc = $node->_fetch_doc( id => 42, etch => 1 );
1023     my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', etch => 1 );
1024    
1025 dpavlin 49 # to get document attrubute add attr
1026     my $doc = $node->_fetch_doc( id => 42, attr => '@mdate' );
1027     my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', attr => '@mdate' );
1028    
1029 dpavlin 45 # more general form which allows implementation of
1030     # uri_to_id
1031     my $id = $node->_fetch_doc(
1032     uri => 'file:///document/uri/42',
1033     path => '/uri_to_id',
1034     chomp_resbody => 1
1035     );
1036    
1037 dpavlin 43 =cut
1038    
1039     sub _fetch_doc {
1040     my $self = shift;
1041 dpavlin 44 my $a = {@_};
1042     return unless ( ($a->{id} || $a->{uri}) && $self->{url} );
1043    
1044     my ($arg, $resbody);
1045    
1046 dpavlin 45 my $path = $a->{path} || '/get_doc';
1047 dpavlin 44 $path = '/etch_doc' if ($a->{etch});
1048    
1049     if ($a->{id}) {
1050     croak "id must be numberm not '$a->{id}'" unless ($a->{id} =~ m/^\d+$/);
1051     $arg = 'id=' . $a->{id};
1052     } elsif ($a->{uri}) {
1053 dpavlin 50 $arg = 'uri=' . uri_escape($a->{uri});
1054 dpavlin 44 } else {
1055     confess "unhandled argument. Need id or uri.";
1056 dpavlin 43 }
1057 dpavlin 44
1058 dpavlin 49 if ($a->{attr}) {
1059     $path = '/get_doc_attr';
1060     $arg .= '&attr=' . uri_escape($a->{attr});
1061     $a->{chomp_resbody} = 1;
1062     }
1063    
1064 dpavlin 44 my $rv = $self->shuttle_url( $self->{url} . $path,
1065 dpavlin 43 'application/x-www-form-urlencoded',
1066 dpavlin 44 $arg,
1067 dpavlin 45 \$resbody,
1068 dpavlin 43 );
1069 dpavlin 44
1070 dpavlin 43 return if ($rv != 200);
1071 dpavlin 44
1072     if ($a->{etch}) {
1073     $self->{kwords} = {};
1074     return +{} unless ($resbody);
1075     foreach my $l (split(/\n/, $resbody)) {
1076     my ($k,$v) = split(/\t/, $l, 2);
1077     $self->{kwords}->{$k} = $v if ($v);
1078     }
1079     return $self->{kwords};
1080 dpavlin 45 } elsif ($a->{chomp_resbody}) {
1081     return unless (defined($resbody));
1082     chomp($resbody);
1083     return $resbody;
1084 dpavlin 44 } else {
1085     return new Search::Estraier::Document($resbody);
1086     }
1087 dpavlin 43 }
1088    
1089    
1090 dpavlin 48 =head2 name
1091 dpavlin 43
1092 dpavlin 48 my $node_name = $node->name;
1093 dpavlin 43
1094 dpavlin 48 =cut
1095    
1096     sub name {
1097     my $self = shift;
1098     $self->set_info unless ($self->{name});
1099     return $self->{name};
1100     }
1101    
1102    
1103     =head2 label
1104    
1105     my $node_label = $node->label;
1106    
1107     =cut
1108    
1109     sub label {
1110     my $self = shift;
1111     $self->set_info unless ($self->{label});
1112     return $self->{label};
1113     }
1114    
1115    
1116     =head2 doc_num
1117    
1118     my $documents_in_node = $node->doc_num;
1119    
1120     =cut
1121    
1122     sub doc_num {
1123     my $self = shift;
1124     $self->set_info if ($self->{dnum} < 0);
1125     return $self->{dnum};
1126     }
1127    
1128    
1129     =head2 word_num
1130    
1131     my $words_in_node = $node->word_num;
1132    
1133     =cut
1134    
1135     sub word_num {
1136     my $self = shift;
1137     $self->set_info if ($self->{wnum} < 0);
1138     return $self->{wnum};
1139     }
1140    
1141    
1142     =head2 size
1143    
1144     my $node_size = $node->size;
1145    
1146     =cut
1147    
1148     sub size {
1149     my $self = shift;
1150     $self->set_info if ($self->{size} < 0);
1151     return $self->{size};
1152     }
1153    
1154    
1155 dpavlin 51 =head2 search
1156 dpavlin 48
1157 dpavlin 51 Search documents which match condition
1158    
1159     my $nres = $node->search( $cond, $depth );
1160    
1161     C<$cond> is C<Search::Estraier::Condition> object, while <$depth> specifies
1162     depth for meta search.
1163    
1164     Function results C<Search::Estraier::NodeResult> object.
1165    
1166     =cut
1167    
1168     sub search {
1169     my $self = shift;
1170     my ($cond, $depth) = @_;
1171     return unless ($cond && defined($depth) && $self->{url});
1172     croak "cond mush be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1173     croak "depth needs number, not '$depth'" unless ($depth =~ m/^\d+$/);
1174    
1175 dpavlin 52 my $resbody;
1176 dpavlin 51
1177 dpavlin 52 my $rv = $self->shuttle_url( $self->{url} . '/search',
1178 dpavlin 53 'application/x-www-form-urlencoded',
1179 dpavlin 52 $self->cond_to_query( $cond ),
1180     \$resbody,
1181     );
1182     return if ($rv != 200);
1183    
1184     my (@docs, $hints);
1185    
1186     my @lines = split(/\n/, $resbody);
1187     return unless (@lines);
1188    
1189     my $border = $lines[0];
1190     my $isend = 0;
1191     my $lnum = 1;
1192    
1193     while ( $lnum <= $#lines ) {
1194     my $line = $lines[$lnum];
1195     $lnum++;
1196    
1197     #warn "## $line\n";
1198     if ($line && $line =~ m/^\Q$border\E(:END)*$/) {
1199     $isend = $1;
1200     last;
1201     }
1202    
1203     if ($line =~ /\t/) {
1204     my ($k,$v) = split(/\t/, $line, 2);
1205     $hints->{$k} = $v;
1206     }
1207     }
1208    
1209     my $snum = $lnum;
1210    
1211     while( ! $isend && $lnum <= $#lines ) {
1212     my $line = $lines[$lnum];
1213 dpavlin 53 #warn "# $lnum: $line\n";
1214 dpavlin 52 $lnum++;
1215    
1216     if ($line && $line =~ m/^\Q$border\E/) {
1217     if ($lnum > $snum) {
1218     my $rdattrs;
1219     my $rdvector;
1220     my $rdsnippet;
1221    
1222     my $rlnum = $snum;
1223     while ($rlnum < $lnum - 1 ) {
1224     #my $rdline = $self->_s($lines[$rlnum]);
1225     my $rdline = $lines[$rlnum];
1226     $rlnum++;
1227     last unless ($rdline);
1228     if ($rdline =~ /^%/) {
1229     $rdvector = $1 if ($rdline =~ /^%VECTOR\t(.+)$/);
1230 dpavlin 53 } elsif($rdline =~ /=/) {
1231     $rdattrs->{$1} = $2 if ($rdline =~ /^(.+)=(.+)$/);
1232 dpavlin 52 } else {
1233 dpavlin 53 confess "invalid format of response";
1234 dpavlin 52 }
1235     }
1236     while($rlnum < $lnum - 1) {
1237     my $rdline = $lines[$rlnum];
1238     $rlnum++;
1239     $rdsnippet .= "$rdline\n";
1240     }
1241 dpavlin 53 #warn Dumper($rdvector, $rdattrs, $rdsnippet);
1242 dpavlin 52 if (my $rduri = $rdattrs->{'@uri'}) {
1243     push @docs, new Search::Estraier::ResultDocument(
1244     uri => $rduri,
1245     attrs => $rdattrs,
1246     snippet => $rdsnippet,
1247     keywords => $rdvector,
1248     );
1249     }
1250     }
1251     $snum = $lnum;
1252     #warn "### $line\n";
1253     $isend = 1 if ($line =~ /:END$/);
1254     }
1255    
1256     }
1257    
1258     if (! $isend) {
1259     warn "received result doesn't have :END\n$resbody";
1260     return;
1261     }
1262    
1263 dpavlin 53 #warn Dumper(\@docs, $hints);
1264    
1265 dpavlin 52 return new Search::Estraier::NodeResult( docs => \@docs, hints => $hints );
1266 dpavlin 51 }
1267    
1268    
1269     =head2 cond_to_query
1270    
1271     my $args = $node->cond_to_query( $cond );
1272    
1273     =cut
1274    
1275     sub cond_to_query {
1276     my $self = shift;
1277    
1278     my $cond = shift || return;
1279     croak "condition must be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1280    
1281     my @args;
1282    
1283     if (my $phrase = $cond->phrase) {
1284     push @args, 'phrase=' . uri_escape($phrase);
1285     }
1286    
1287     if (my @attrs = $cond->attrs) {
1288     for my $i ( 0 .. $#attrs ) {
1289     push @args,'attr' . ($i+1) . '=' . uri_escape( $attrs[$i] );
1290     }
1291     }
1292    
1293     if (my $order = $cond->order) {
1294     push @args, 'order=' . uri_escape($order);
1295     }
1296    
1297     if (my $max = $cond->max) {
1298     push @args, 'max=' . $max;
1299     } else {
1300     push @args, 'max=' . (1 << 30);
1301     }
1302    
1303     if (my $options = $cond->options) {
1304     push @args, 'options=' . $options;
1305     }
1306    
1307     push @args, 'depth=' . $self->{depth} if ($self->{depth});
1308     push @args, 'wwidth=' . $self->{wwidth};
1309     push @args, 'hwidth=' . $self->{hwidth};
1310     push @args, 'awidth=' . $self->{awidth};
1311    
1312     return join('&', @args);
1313     }
1314    
1315    
1316 dpavlin 33 =head2 shuttle_url
1317 dpavlin 32
1318 dpavlin 33 This is method which uses C<IO::Socket::INET> to communicate with Hyper Estraier node
1319     master.
1320 dpavlin 2
1321 dpavlin 52 my $rv = shuttle_url( $url, $content_type, $req_body, \$resbody );
1322 dpavlin 2
1323 dpavlin 33 C<$resheads> and C<$resbody> booleans controll if response headers and/or response
1324     body will be saved within object.
1325 dpavlin 2
1326     =cut
1327    
1328 dpavlin 33 sub shuttle_url {
1329     my $self = shift;
1330 dpavlin 2
1331 dpavlin 33 my ($url, $content_type, $reqbody, $resbody) = @_;
1332 dpavlin 2
1333 dpavlin 40 $self->{status} = -1;
1334 dpavlin 33
1335 dpavlin 41 warn "## $url\n" if ($self->{debug});
1336 dpavlin 36
1337 dpavlin 33 $url = new URI($url);
1338 dpavlin 37 if (
1339     !$url || !$url->scheme || !$url->scheme eq 'http' ||
1340     !$url->host || !$url->port || $url->port < 1
1341     ) {
1342     carp "can't parse $url\n";
1343     return -1;
1344     }
1345 dpavlin 33
1346     my ($host,$port,$query) = ($url->host, $url->port, $url->path);
1347    
1348     if ($self->{pxhost}) {
1349     ($host,$port) = ($self->{pxhost}, $self->{pxport});
1350     $query = "http://$host:$port/$query";
1351 dpavlin 2 }
1352    
1353 dpavlin 37 $query .= '?' . $url->query if ($url->query && ! $reqbody);
1354 dpavlin 2
1355 dpavlin 37 my $headers;
1356    
1357     if ($reqbody) {
1358     $headers .= "POST $query HTTP/1.0\r\n";
1359     } else {
1360     $headers .= "GET $query HTTP/1.0\r\n";
1361     }
1362    
1363 dpavlin 40 $headers .= "Host: " . $url->host . ":" . $url->port . "\r\n";
1364 dpavlin 37 $headers .= "Connection: close\r\n";
1365     $headers .= "User-Agent: Search-Estraier/$Search::Estraier::VERSION\r\n";
1366 dpavlin 40 $headers .= "Content-Type: $content_type\r\n";
1367 dpavlin 37 $headers .= "Authorization: Basic $self->{auth}\r\n";
1368     my $len = 0;
1369     {
1370     use bytes;
1371     $len = length($reqbody) if ($reqbody);
1372     }
1373     $headers .= "Content-Length: $len\r\n";
1374     $headers .= "\r\n";
1375    
1376 dpavlin 33 my $sock = IO::Socket::INET->new(
1377     PeerAddr => $host,
1378     PeerPort => $port,
1379     Proto => 'tcp',
1380     Timeout => $self->{timeout} || 90,
1381 dpavlin 37 );
1382 dpavlin 2
1383 dpavlin 37 if (! $sock) {
1384     carp "can't open socket to $host:$port";
1385     return -1;
1386 dpavlin 33 }
1387 dpavlin 2
1388 dpavlin 40 warn $headers if ($self->{debug});
1389 dpavlin 39
1390 dpavlin 37 print $sock $headers or
1391     carp "can't send headers to network:\n$headers\n" and return -1;
1392    
1393     if ($reqbody) {
1394 dpavlin 41 warn "$reqbody\n" if ($self->{debug});
1395 dpavlin 40 print $sock $reqbody or
1396 dpavlin 37 carp "can't send request body to network:\n$$reqbody\n" and return -1;
1397 dpavlin 33 }
1398 dpavlin 2
1399 dpavlin 33 my $line = <$sock>;
1400     chomp($line);
1401     my ($schema, $res_status, undef) = split(/ */, $line, 3);
1402     return if ($schema !~ /^HTTP/ || ! $res_status);
1403 dpavlin 2
1404 dpavlin 40 $self->{status} = $res_status;
1405 dpavlin 39 warn "## response status: $res_status\n" if ($self->{debug});
1406 dpavlin 2
1407 dpavlin 33 # skip rest of headers
1408 dpavlin 38 $line = <$sock>;
1409     while ($line) {
1410 dpavlin 33 $line = <$sock>;
1411 dpavlin 38 $line =~ s/[\r\n]+$//;
1412 dpavlin 40 warn "## ", $line || 'NULL', " ##\n" if ($self->{debug});
1413 dpavlin 38 };
1414 dpavlin 2
1415 dpavlin 33 # read body
1416 dpavlin 38 $len = 0;
1417 dpavlin 33 do {
1418     $len = read($sock, my $buf, 8192);
1419     $$resbody .= $buf if ($resbody);
1420     } while ($len);
1421    
1422 dpavlin 40 warn "## response body:\n$$resbody\n" if ($resbody && $self->{debug});
1423 dpavlin 39
1424 dpavlin 40 return $self->{status};
1425 dpavlin 2 }
1426    
1427 dpavlin 48
1428     =head2 set_info
1429    
1430     Set information for node
1431    
1432     $node->set_info;
1433    
1434     =cut
1435    
1436     sub set_info {
1437     my $self = shift;
1438    
1439     $self->{status} = -1;
1440     return unless ($self->{url});
1441    
1442     my $resbody;
1443     my $rv = $self->shuttle_url( $self->{url} . '/inform',
1444     'text/plain',
1445     undef,
1446     \$resbody,
1447     );
1448    
1449     return if ($rv != 200 || !$resbody);
1450    
1451     chomp($resbody);
1452    
1453     ( $self->{name}, $self->{label}, $self->{dnum}, $self->{wnum}, $self->{size} ) =
1454     split(/\t/, $resbody, 5);
1455    
1456     }
1457    
1458 dpavlin 2 ###
1459    
1460     =head1 EXPORT
1461    
1462     Nothing.
1463    
1464     =head1 SEE ALSO
1465    
1466     L<http://hyperestraier.sourceforge.net/>
1467    
1468     Hyper Estraier Ruby interface on which this module is based.
1469    
1470     =head1 AUTHOR
1471    
1472     Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
1473    
1474    
1475     =head1 COPYRIGHT AND LICENSE
1476    
1477 dpavlin 15 Copyright (C) 2005-2006 by Dobrica Pavlinusic
1478 dpavlin 2
1479     This library is free software; you can redistribute it and/or modify
1480     it under the GPL v2 or later.
1481    
1482     =cut
1483    
1484     1;

  ViewVC Help
Powered by ViewVC 1.1.26