/[Search-Estraier]/trunk/lib/Search/Estraier.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/Search/Estraier.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 48 - (hide annotations)
Fri Jan 6 02:07:10 2006 UTC (18 years, 3 months ago) by dpavlin
Original Path: trunk/Estraier.pm
File size: 21239 byte(s)
added name, label, doc_num, word_num and size properties for which I had to
implement set_info.
1 dpavlin 2 package Search::Estraier;
2    
3     use 5.008;
4     use strict;
5     use warnings;
6    
7     our $VERSION = '0.00';
8    
9     =head1 NAME
10    
11     Search::Estraier - pure perl module to use Hyper Estraier search engine
12    
13     =head1 SYNOPSIS
14    
15     use Search::Estraier;
16     my $est = new Search::Estraier();
17    
18     =head1 DESCRIPTION
19    
20     This module is implementation of node API of Hyper Estraier. Since it's
21     perl-only module with dependencies only on standard perl modules, it will
22     run on all platforms on which perl runs. It doesn't require compilation
23     or Hyper Estraier development files on target machine.
24    
25     It is implemented as multiple packages which closly resamble Ruby
26     implementation. It also includes methods to manage nodes.
27    
28     =cut
29    
30 dpavlin 42 =head1 Inheritable common methods
31    
32     This methods should really move somewhere else.
33    
34 dpavlin 15 =head2 _s
35    
36     Remove multiple whitespaces from string, as well as whitespaces at beginning or end
37    
38     my $text = $self->_s(" this is a text ");
39     $text = 'this is a text';
40    
41     =cut
42    
43     sub _s {
44     my $text = $_[1] || return;
45     $text =~ s/\s\s+/ /gs;
46     $text =~ s/^\s+//;
47     $text =~ s/\s+$//;
48     return $text;
49     }
50    
51 dpavlin 2 package Search::Estraier::Document;
52    
53 dpavlin 9 use Carp qw/croak confess/;
54 dpavlin 7
55 dpavlin 15 use Search::Estraier;
56     our @ISA = qw/Search::Estraier/;
57    
58 dpavlin 2 =head1 Search::Estraier::Document
59    
60 dpavlin 14 This class implements Document which is collection of attributes
61     (key=value), vectors (also key value) display text and hidden text.
62    
63 dpavlin 42
64 dpavlin 2 =head2 new
65    
66 dpavlin 14 Create new document, empty or from draft.
67    
68 dpavlin 2 my $doc = new Search::HyperEstraier::Document;
69 dpavlin 14 my $doc2 = new Search::HyperEstraier::Document( $draft );
70 dpavlin 2
71     =cut
72    
73     sub new {
74     my $class = shift;
75 dpavlin 14 my $self = {};
76 dpavlin 2 bless($self, $class);
77    
78 dpavlin 6 $self->{id} = -1;
79    
80 dpavlin 14 my $draft = shift;
81    
82     if ($draft) {
83     my $in_text = 0;
84     foreach my $line (split(/\n/, $draft)) {
85    
86     if ($in_text) {
87     if ($line =~ /^\t/) {
88     push @{ $self->{htexts} }, substr($line, 1);
89     } else {
90     push @{ $self->{dtexts} }, $line;
91     }
92     next;
93     }
94    
95     if ($line =~ m/^%VECTOR\t(.+)$/) {
96     my @fields = split(/\t/, $1);
97     for my $i ( 0 .. ($#fields - 1) ) {
98     $self->{kwords}->{ $fields[ $i ] } = $fields[ $i + 1 ];
99     $i++;
100     }
101     next;
102     } elsif ($line =~ m/^%/) {
103     # What is this? comment?
104     #warn "$line\n";
105     next;
106     } elsif ($line =~ m/^$/) {
107     $in_text = 1;
108     next;
109     } elsif ($line =~ m/^(.+)=(.+)$/) {
110     $self->{attrs}->{ $1 } = $2;
111     next;
112     }
113    
114     warn "draft ignored: $line\n";
115     }
116     }
117    
118 dpavlin 2 $self ? return $self : return undef;
119     }
120    
121 dpavlin 4
122 dpavlin 2 =head2 add_attr
123    
124 dpavlin 6 Add an attribute.
125    
126 dpavlin 2 $doc->add_attr( name => 'value' );
127    
128 dpavlin 9 Delete attribute using
129 dpavlin 5
130     $doc->add_attr( name => undef );
131    
132 dpavlin 2 =cut
133    
134     sub add_attr {
135     my $self = shift;
136     my $attrs = {@_};
137    
138     while (my ($name, $value) = each %{ $attrs }) {
139 dpavlin 9 if (! defined($value)) {
140 dpavlin 15 delete( $self->{attrs}->{ $self->_s($name) } );
141 dpavlin 9 } else {
142 dpavlin 15 $self->{attrs}->{ $self->_s($name) } = $self->_s($value);
143 dpavlin 9 }
144 dpavlin 2 }
145 dpavlin 8
146     return 1;
147 dpavlin 2 }
148    
149 dpavlin 5
150     =head2 add_text
151    
152 dpavlin 6 Add a sentence of text.
153    
154 dpavlin 5 $doc->add_text('this is example text to display');
155    
156     =cut
157    
158     sub add_text {
159     my $self = shift;
160     my $text = shift;
161     return unless defined($text);
162    
163 dpavlin 15 push @{ $self->{dtexts} }, $self->_s($text);
164 dpavlin 5 }
165    
166    
167     =head2 add_hidden_text
168    
169 dpavlin 6 Add a hidden sentence.
170    
171 dpavlin 5 $doc->add_hidden_text('this is example text just for search');
172    
173     =cut
174    
175     sub add_hidden_text {
176     my $self = shift;
177     my $text = shift;
178     return unless defined($text);
179    
180 dpavlin 15 push @{ $self->{htexts} }, $self->_s($text);
181 dpavlin 5 }
182    
183 dpavlin 42
184 dpavlin 6 =head2 id
185    
186     Get the ID number of document. If the object has never been registred, C<-1> is returned.
187    
188     print $doc->id;
189    
190     =cut
191    
192     sub id {
193     my $self = shift;
194     return $self->{id};
195     }
196    
197 dpavlin 42
198 dpavlin 7 =head2 attr_names
199    
200 dpavlin 9 Returns array with attribute names from document object.
201 dpavlin 7
202     my @attrs = $doc->attr_names;
203    
204     =cut
205    
206     sub attr_names {
207     my $self = shift;
208 dpavlin 9 croak "attr_names return array, not scalar" if (! wantarray);
209 dpavlin 7 return sort keys %{ $self->{attrs} };
210     }
211    
212 dpavlin 8
213     =head2 attr
214    
215 dpavlin 9 Returns value of an attribute.
216 dpavlin 8
217     my $value = $doc->attr( 'attribute' );
218    
219     =cut
220    
221     sub attr {
222     my $self = shift;
223     my $name = shift;
224    
225     return $self->{'attrs'}->{ $name };
226     }
227    
228 dpavlin 9
229     =head2 texts
230    
231     Returns array with text sentences.
232    
233     my @texts = $doc->texts;
234    
235     =cut
236    
237     sub texts {
238     my $self = shift;
239 dpavlin 12 confess "texts return array, not scalar" if (! wantarray);
240 dpavlin 11 return @{ $self->{dtexts} };
241 dpavlin 9 }
242    
243 dpavlin 42
244 dpavlin 12 =head2 cat_texts
245    
246     Return whole text as single scalar.
247    
248     my $text = $doc->cat_texts;
249    
250     =cut
251    
252     sub cat_texts {
253     my $self = shift;
254     return join(' ',@{ $self->{dtexts} });
255     }
256    
257 dpavlin 42
258 dpavlin 5 =head2 dump_draft
259    
260 dpavlin 13 Dump draft data from document object.
261    
262 dpavlin 5 print $doc->dump_draft;
263    
264     =cut
265    
266     sub dump_draft {
267 dpavlin 13 my $self = shift;
268     my $draft;
269    
270     foreach my $attr_name (sort keys %{ $self->{attrs} }) {
271     $draft .= $attr_name . '=' . $self->{attrs}->{$attr_name} . "\n";
272     }
273    
274     if ($self->{kwords}) {
275     $draft .= '%%VECTOR';
276     while (my ($key, $value) = each %{ $self->{kwords} }) {
277     $draft .= "\t$key\t$value";
278     }
279     $draft .= "\n";
280     }
281    
282     $draft .= "\n";
283    
284 dpavlin 40 $draft .= join("\n", @{ $self->{dtexts} }) . "\n" if ($self->{dtexts});
285     $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n" if ($self->{htexts});
286 dpavlin 13
287     return $draft;
288 dpavlin 5 }
289    
290 dpavlin 42
291 dpavlin 4 =head2 delete
292 dpavlin 2
293 dpavlin 4 Empty document object
294 dpavlin 2
295 dpavlin 4 $doc->delete;
296    
297 dpavlin 15 This function is addition to original Ruby API, and since it was included in C wrappers it's here as a
298     convinience. Document objects which go out of scope will be destroyed
299     automatically.
300    
301 dpavlin 4 =cut
302    
303     sub delete {
304     my $self = shift;
305    
306 dpavlin 14 foreach my $data (qw/attrs dtexts stexts kwords/) {
307 dpavlin 5 delete($self->{$data});
308     }
309 dpavlin 4
310 dpavlin 10 $self->{id} = -1;
311    
312 dpavlin 4 return 1;
313     }
314    
315    
316    
317 dpavlin 15 package Search::Estraier::Condition;
318 dpavlin 4
319 dpavlin 16 use Carp qw/confess croak/;
320    
321 dpavlin 15 use Search::Estraier;
322     our @ISA = qw/Search::Estraier/;
323 dpavlin 4
324 dpavlin 16 =head1 Search::Estraier::Condition
325    
326     =head2 new
327    
328     my $cond = new Search::HyperEstraier::Condition;
329    
330     =cut
331    
332     sub new {
333     my $class = shift;
334     my $self = {};
335     bless($self, $class);
336    
337 dpavlin 19 $self->{max} = -1;
338     $self->{options} = 0;
339    
340 dpavlin 16 $self ? return $self : return undef;
341     }
342    
343 dpavlin 42
344 dpavlin 16 =head2 set_phrase
345    
346     $cond->set_phrase('search phrase');
347    
348     =cut
349    
350     sub set_phrase {
351     my $self = shift;
352     $self->{phrase} = $self->_s( shift );
353     }
354    
355 dpavlin 42
356 dpavlin 16 =head2 add_attr
357    
358     $cond->add_attr('@URI STRINC /~dpavlin/');
359    
360     =cut
361    
362     sub add_attr {
363     my $self = shift;
364     my $attr = shift || return;
365     push @{ $self->{attrs} }, $self->_s( $attr );
366     }
367    
368 dpavlin 42
369 dpavlin 16 =head2 set_order
370    
371     $cond->set_order('@mdate NUMD');
372    
373     =cut
374    
375     sub set_order {
376     my $self = shift;
377     $self->{order} = shift;
378     }
379    
380 dpavlin 42
381 dpavlin 16 =head2 set_max
382    
383     $cond->set_max(42);
384    
385     =cut
386    
387     sub set_max {
388     my $self = shift;
389     my $max = shift;
390 dpavlin 43 croak "set_max needs number, not '$max'" unless ($max =~ m/^\d+$/);
391 dpavlin 16 $self->{max} = $max;
392     }
393    
394 dpavlin 42
395 dpavlin 16 =head2 set_options
396    
397     $cond->set_options( SURE => 1 );
398    
399     =cut
400    
401 dpavlin 15 my $options = {
402     # check N-gram keys skipping by three
403     SURE => 1 << 0,
404     # check N-gram keys skipping by two
405     USUAL => 1 << 1,
406     # without TF-IDF tuning
407     FAST => 1 << 2,
408     # with the simplified phrase
409     AGITO => 1 << 3,
410     # check every N-gram key
411     NOIDF => 1 << 4,
412     # check N-gram keys skipping by one
413     SIMPLE => 1 << 10,
414     };
415    
416 dpavlin 16 sub set_options {
417     my $self = shift;
418     my $option = shift;
419     confess "unknown option" unless ($options->{$option});
420     $self->{options} ||= $options->{$option};
421 dpavlin 4 }
422    
423 dpavlin 42
424 dpavlin 18 =head2 phrase
425    
426     Return search phrase.
427    
428     print $cond->phrase;
429    
430     =cut
431    
432     sub phrase {
433     my $self = shift;
434     return $self->{phrase};
435     }
436    
437 dpavlin 42
438 dpavlin 19 =head2 order
439 dpavlin 18
440 dpavlin 19 Return search result order.
441    
442     print $cond->order;
443    
444     =cut
445    
446     sub order {
447     my $self = shift;
448     return $self->{order};
449     }
450    
451 dpavlin 42
452 dpavlin 19 =head2 attrs
453    
454     Return search result attrs.
455    
456     my @cond_attrs = $cond->attrs;
457    
458     =cut
459    
460     sub attrs {
461     my $self = shift;
462     #croak "attrs return array, not scalar" if (! wantarray);
463     return @{ $self->{attrs} };
464     }
465    
466 dpavlin 42
467 dpavlin 19 =head2 max
468    
469     Return maximum number of results.
470    
471     print $cond->max;
472    
473     C<-1> is returned for unitialized value, C<0> is unlimited.
474    
475     =cut
476    
477     sub max {
478     my $self = shift;
479     return $self->{max};
480     }
481    
482 dpavlin 42
483 dpavlin 19 =head2 options
484    
485     Return options for this condition.
486    
487     print $cond->options;
488    
489     Options are returned in numerical form.
490    
491     =cut
492    
493     sub options {
494     my $self = shift;
495     return $self->{options};
496     }
497    
498    
499 dpavlin 20 package Search::Estraier::ResultDocument;
500    
501 dpavlin 24 use Carp qw/croak/;
502 dpavlin 20
503 dpavlin 24 #use Search::Estraier;
504     #our @ISA = qw/Search::Estraier/;
505 dpavlin 20
506     =head1 Search::Estraier::ResultDocument
507    
508     =head2 new
509    
510 dpavlin 23 my $rdoc = new Search::HyperEstraier::ResultDocument(
511 dpavlin 20 uri => 'http://localhost/document/uri/42',
512     attrs => {
513     foo => 1,
514     bar => 2,
515     },
516     snippet => 'this is a text of snippet'
517     keywords => 'this\tare\tkeywords'
518     );
519    
520     =cut
521    
522     sub new {
523     my $class = shift;
524     my $self = {@_};
525     bless($self, $class);
526    
527     foreach my $f (qw/uri attrs snippet keywords/) {
528     croak "missing $f for ResultDocument" unless defined($self->{$f});
529     }
530    
531     $self ? return $self : return undef;
532     }
533    
534 dpavlin 42
535 dpavlin 23 =head2 uri
536 dpavlin 20
537 dpavlin 23 Return URI of result document
538 dpavlin 20
539 dpavlin 23 print $rdoc->uri;
540    
541     =cut
542    
543     sub uri {
544     my $self = shift;
545     return $self->{uri};
546     }
547    
548    
549     =head2 attr_names
550    
551     Returns array with attribute names from result document object.
552    
553     my @attrs = $rdoc->attr_names;
554    
555     =cut
556    
557     sub attr_names {
558     my $self = shift;
559     croak "attr_names return array, not scalar" if (! wantarray);
560     return sort keys %{ $self->{attrs} };
561     }
562    
563 dpavlin 42
564 dpavlin 23 =head2 attr
565    
566     Returns value of an attribute.
567    
568     my $value = $rdoc->attr( 'attribute' );
569    
570     =cut
571    
572     sub attr {
573     my $self = shift;
574     my $name = shift || return;
575     return $self->{attrs}->{ $name };
576     }
577    
578 dpavlin 42
579 dpavlin 23 =head2 snippet
580    
581     Return snippet from result document
582    
583     print $rdoc->snippet;
584    
585     =cut
586    
587     sub snippet {
588     my $self = shift;
589     return $self->{snippet};
590     }
591    
592 dpavlin 42
593 dpavlin 23 =head2 keywords
594    
595     Return keywords from result document
596    
597     print $rdoc->keywords;
598    
599     =cut
600    
601     sub keywords {
602     my $self = shift;
603     return $self->{keywords};
604     }
605    
606    
607 dpavlin 25 package Search::Estraier::NodeResult;
608    
609     use Carp qw/croak/;
610    
611     #use Search::Estraier;
612     #our @ISA = qw/Search::Estraier/;
613    
614     =head1 Search::Estraier::NodeResult
615    
616     =head2 new
617    
618     my $res = new Search::HyperEstraier::NodeResult(
619     docs => @array_of_rdocs,
620     hits => %hash_with_hints,
621     );
622    
623     =cut
624    
625     sub new {
626     my $class = shift;
627     my $self = {@_};
628     bless($self, $class);
629    
630     foreach my $f (qw/docs hints/) {
631     croak "missing $f for ResultDocument" unless defined($self->{$f});
632     }
633    
634     $self ? return $self : return undef;
635     }
636    
637 dpavlin 42
638 dpavlin 25 =head2 doc_num
639    
640     Return number of documents
641    
642     print $res->doc_num;
643    
644     =cut
645    
646     sub doc_num {
647     my $self = shift;
648     return $#{$self->{docs}};
649     }
650    
651 dpavlin 42
652 dpavlin 25 =head2 get_doc
653    
654     Return single document
655    
656     my $doc = $res->get_doc( 42 );
657    
658     Returns undef if document doesn't exist.
659    
660     =cut
661    
662     sub get_doc {
663     my $self = shift;
664     my $num = shift;
665 dpavlin 43 croak "expect number as argument, not '$num'" unless ($num =~ m/^\d+$/);
666 dpavlin 25 return undef if ($num < 0 || $num > $self->{docs});
667     return $self->{docs}->[$num];
668     }
669    
670 dpavlin 42
671 dpavlin 25 =head2 hint
672    
673     Return specific hint from results.
674    
675     print $rec->hint( 'VERSION' );
676    
677     Possible hints are: C<VERSION>, C<NODE>, C<HIT>, C<HINT#n>, C<DOCNUM>, C<WORDNUM>,
678     C<TIME>, C<LINK#n>, C<VIEW>.
679    
680     =cut
681    
682     sub hint {
683     my $self = shift;
684     my $key = shift || return;
685     return $self->{hints}->{$key};
686     }
687    
688    
689 dpavlin 27 package Search::Estraier::Node;
690    
691 dpavlin 44 use Carp qw/carp croak confess/;
692 dpavlin 33 use URI;
693 dpavlin 36 use MIME::Base64;
694 dpavlin 33 use IO::Socket::INET;
695 dpavlin 29
696 dpavlin 27 =head1 Search::Estraier::Node
697    
698     =head2 new
699    
700     my $node = new Search::HyperEstraier::Node;
701    
702     =cut
703    
704     sub new {
705     my $class = shift;
706     my $self = {
707     pxport => -1,
708 dpavlin 33 timeout => 0, # this used to be -1
709 dpavlin 27 dnum => -1,
710     wnum => -1,
711     size => -1.0,
712     wwidth => 480,
713     hwidth => 96,
714     awidth => 96,
715     status => -1,
716     };
717     bless($self, $class);
718    
719 dpavlin 39 if (@_) {
720 dpavlin 41 $self->{debug} = shift;
721 dpavlin 39 warn "## Node debug on\n";
722     }
723    
724 dpavlin 27 $self ? return $self : return undef;
725     }
726    
727 dpavlin 42
728 dpavlin 29 =head2 set_url
729    
730     Specify URL to node server
731    
732     $node->set_url('http://localhost:1978');
733    
734     =cut
735    
736     sub set_url {
737     my $self = shift;
738     $self->{url} = shift;
739     }
740    
741 dpavlin 42
742 dpavlin 29 =head2 set_proxy
743    
744     Specify proxy server to connect to node server
745    
746     $node->set_proxy('proxy.example.com', 8080);
747    
748     =cut
749    
750     sub set_proxy {
751     my $self = shift;
752     my ($host,$port) = @_;
753 dpavlin 43 croak "proxy port must be number, not '$port'" unless ($port =~ m/^\d+$/);
754 dpavlin 29 $self->{pxhost} = $host;
755     $self->{pxport} = $port;
756     }
757    
758 dpavlin 42
759 dpavlin 30 =head2 set_timeout
760    
761     Specify timeout of connection in seconds
762    
763     $node->set_timeout( 15 );
764    
765     =cut
766    
767     sub set_timeout {
768     my $self = shift;
769     my $sec = shift;
770 dpavlin 43 croak "timeout must be number, not '$sec'" unless ($sec =~ m/^\d+$/);
771 dpavlin 30 $self->{timeout} = $sec;
772     }
773    
774 dpavlin 42
775 dpavlin 31 =head2 set_auth
776    
777     Specify name and password for authentication to node server.
778    
779     $node->set_auth('clint','eastwood');
780    
781     =cut
782    
783     sub set_auth {
784     my $self = shift;
785     my ($login,$passwd) = @_;
786 dpavlin 40 my $basic_auth = encode_base64( "$login:$passwd" );
787     chomp($basic_auth);
788     $self->{auth} = $basic_auth;
789 dpavlin 31 }
790    
791 dpavlin 42
792 dpavlin 32 =head2 status
793    
794     Return status code of last request.
795    
796 dpavlin 40 print $node->status;
797 dpavlin 32
798     C<-1> means connection failure.
799    
800     =cut
801    
802     sub status {
803     my $self = shift;
804     return $self->{status};
805     }
806    
807 dpavlin 42
808 dpavlin 40 =head2 put_doc
809    
810 dpavlin 41 Add a document
811 dpavlin 40
812 dpavlin 41 $node->put_doc( $document_draft ) or die "can't add document";
813    
814     Return true on success or false on failture.
815    
816 dpavlin 40 =cut
817    
818     sub put_doc {
819     my $self = shift;
820     my $doc = shift || return;
821 dpavlin 47 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
822 dpavlin 41 $self->shuttle_url( $self->{url} . '/put_doc',
823     'text/x-estraier-draft',
824     $doc->dump_draft,
825     undef
826     ) == 200;
827 dpavlin 40 }
828    
829 dpavlin 41
830     =head2 out_doc
831    
832     Remove a document
833    
834     $node->out_doc( document_id ) or "can't remove document";
835    
836     Return true on success or false on failture.
837    
838     =cut
839    
840     sub out_doc {
841     my $self = shift;
842     my $id = shift || return;
843     return unless ($self->{url});
844 dpavlin 43 croak "id must be number, not '$id'" unless ($id =~ m/^\d+$/);
845 dpavlin 41 $self->shuttle_url( $self->{url} . '/out_doc',
846     'application/x-www-form-urlencoded',
847     "id=$id",
848     undef
849     ) == 200;
850     }
851    
852    
853     =head2 out_doc_by_uri
854    
855     Remove a registrated document using it's uri
856    
857 dpavlin 45 $node->out_doc_by_uri( 'file:///document/uri/42' ) or "can't remove document";
858 dpavlin 41
859     Return true on success or false on failture.
860    
861     =cut
862    
863     sub out_doc_by_uri {
864     my $self = shift;
865     my $uri = shift || return;
866     return unless ($self->{url});
867     $self->shuttle_url( $self->{url} . '/out_doc',
868     'application/x-www-form-urlencoded',
869     "uri=$uri",
870     undef
871     ) == 200;
872     }
873    
874 dpavlin 42
875     =head2 edit_doc
876    
877     Edit attributes of a document
878    
879     $node->edit_doc( $document_draft ) or die "can't edit document";
880    
881     Return true on success or false on failture.
882    
883     =cut
884    
885     sub edit_doc {
886     my $self = shift;
887     my $doc = shift || return;
888 dpavlin 47 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
889 dpavlin 42 $self->shuttle_url( $self->{url} . '/edit_doc',
890     'text/x-estraier-draft',
891     $doc->dump_draft,
892     undef
893     ) == 200;
894     }
895    
896    
897 dpavlin 43 =head2 get_doc
898    
899     Retreive document
900    
901     my $doc = $node->get_doc( document_id ) or die "can't get document";
902    
903     Return true on success or false on failture.
904    
905     =cut
906    
907     sub get_doc {
908     my $self = shift;
909     my $id = shift || return;
910     return $self->_fetch_doc( id => $id );
911     }
912    
913 dpavlin 44
914 dpavlin 43 =head2 get_doc_by_uri
915    
916     Retreive document
917    
918 dpavlin 45 my $doc = $node->get_doc_by_uri( 'file:///document/uri/42' ) or die "can't get document";
919 dpavlin 43
920     Return true on success or false on failture.
921    
922     =cut
923    
924     sub get_doc_by_uri {
925     my $self = shift;
926     my $uri = shift || return;
927     return $self->_fetch_doc( uri => $uri );
928     }
929    
930 dpavlin 44
931     =head2 etch_doc
932    
933     Exctract document keywords
934    
935     my $keywords = $node->etch_doc( document_id ) or die "can't etch document";
936    
937     =cut
938    
939     sub erch_doc {
940     my $self = shift;
941     my $id = shift || return;
942     return $self->_fetch_doc( id => $id, etch => 1 );
943     }
944    
945     =head2 etch_doc_by_uri
946    
947     Retreive document
948    
949 dpavlin 45 my $keywords = $node->etch_doc_by_uri( 'file:///document/uri/42' ) or die "can't etch document";
950 dpavlin 44
951     Return true on success or false on failture.
952    
953     =cut
954    
955     sub etch_doc_by_uri {
956     my $self = shift;
957     my $uri = shift || return;
958     return $self->_fetch_doc( uri => $uri, etch => 1 );
959     }
960    
961    
962 dpavlin 45 =head2 uri_to_id
963    
964     Get ID of document specified by URI
965    
966     my $id = $node->uri_to_id( 'file:///document/uri/42' );
967    
968     =cut
969    
970     sub uri_to_id {
971     my $self = shift;
972     my $uri = shift || return;
973     return $self->_fetch_doc( uri => $uri, path => '/uri_to_id', chomp_resbody => 1 );
974     }
975    
976    
977 dpavlin 43 =head2 _fetch_doc
978    
979 dpavlin 44 Private function used for implementing of C<get_doc>, C<get_doc_by_uri>,
980     C<etch_doc>, C<etch_doc_by_uri>.
981 dpavlin 43
982 dpavlin 45 # this will decode received draft into Search::Estraier::Document object
983     my $doc = $node->_fetch_doc( id => 42 );
984     my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42' );
985 dpavlin 43
986 dpavlin 45 # to extract keywords, add etch
987     my $doc = $node->_fetch_doc( id => 42, etch => 1 );
988     my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', etch => 1 );
989    
990     # more general form which allows implementation of
991     # uri_to_id
992     my $id = $node->_fetch_doc(
993     uri => 'file:///document/uri/42',
994     path => '/uri_to_id',
995     chomp_resbody => 1
996     );
997    
998 dpavlin 43 =cut
999    
1000     sub _fetch_doc {
1001     my $self = shift;
1002 dpavlin 44 my $a = {@_};
1003     return unless ( ($a->{id} || $a->{uri}) && $self->{url} );
1004    
1005     my ($arg, $resbody);
1006    
1007 dpavlin 45 my $path = $a->{path} || '/get_doc';
1008 dpavlin 44 $path = '/etch_doc' if ($a->{etch});
1009    
1010     if ($a->{id}) {
1011     croak "id must be numberm not '$a->{id}'" unless ($a->{id} =~ m/^\d+$/);
1012     $arg = 'id=' . $a->{id};
1013     } elsif ($a->{uri}) {
1014     $arg = 'uri=' . $a->{uri};
1015     } else {
1016     confess "unhandled argument. Need id or uri.";
1017 dpavlin 43 }
1018 dpavlin 44
1019     my $rv = $self->shuttle_url( $self->{url} . $path,
1020 dpavlin 43 'application/x-www-form-urlencoded',
1021 dpavlin 44 $arg,
1022 dpavlin 45 \$resbody,
1023 dpavlin 43 );
1024 dpavlin 44
1025 dpavlin 43 return if ($rv != 200);
1026 dpavlin 44
1027     if ($a->{etch}) {
1028     $self->{kwords} = {};
1029     return +{} unless ($resbody);
1030     foreach my $l (split(/\n/, $resbody)) {
1031     my ($k,$v) = split(/\t/, $l, 2);
1032     $self->{kwords}->{$k} = $v if ($v);
1033     }
1034     return $self->{kwords};
1035 dpavlin 45 } elsif ($a->{chomp_resbody}) {
1036     return unless (defined($resbody));
1037     chomp($resbody);
1038     return $resbody;
1039 dpavlin 44 } else {
1040     return new Search::Estraier::Document($resbody);
1041     }
1042 dpavlin 43 }
1043    
1044    
1045 dpavlin 48 =head2 name
1046 dpavlin 43
1047 dpavlin 48 my $node_name = $node->name;
1048 dpavlin 43
1049 dpavlin 48 =cut
1050    
1051     sub name {
1052     my $self = shift;
1053     $self->set_info unless ($self->{name});
1054     return $self->{name};
1055     }
1056    
1057    
1058     =head2 label
1059    
1060     my $node_label = $node->label;
1061    
1062     =cut
1063    
1064     sub label {
1065     my $self = shift;
1066     $self->set_info unless ($self->{label});
1067     return $self->{label};
1068     }
1069    
1070    
1071     =head2 doc_num
1072    
1073     my $documents_in_node = $node->doc_num;
1074    
1075     =cut
1076    
1077     sub doc_num {
1078     my $self = shift;
1079     $self->set_info if ($self->{dnum} < 0);
1080     return $self->{dnum};
1081     }
1082    
1083    
1084     =head2 word_num
1085    
1086     my $words_in_node = $node->word_num;
1087    
1088     =cut
1089    
1090     sub word_num {
1091     my $self = shift;
1092     $self->set_info if ($self->{wnum} < 0);
1093     return $self->{wnum};
1094     }
1095    
1096    
1097     =head2 size
1098    
1099     my $node_size = $node->size;
1100    
1101     =cut
1102    
1103     sub size {
1104     my $self = shift;
1105     $self->set_info if ($self->{size} < 0);
1106     return $self->{size};
1107     }
1108    
1109    
1110    
1111 dpavlin 33 =head2 shuttle_url
1112 dpavlin 32
1113 dpavlin 33 This is method which uses C<IO::Socket::INET> to communicate with Hyper Estraier node
1114     master.
1115 dpavlin 2
1116 dpavlin 33 my $rv = shuttle_url( $url, $content_type, \$req_body, \$resbody );
1117 dpavlin 2
1118 dpavlin 33 C<$resheads> and C<$resbody> booleans controll if response headers and/or response
1119     body will be saved within object.
1120 dpavlin 2
1121     =cut
1122    
1123 dpavlin 33 sub shuttle_url {
1124     my $self = shift;
1125 dpavlin 2
1126 dpavlin 33 my ($url, $content_type, $reqbody, $resbody) = @_;
1127 dpavlin 2
1128 dpavlin 40 $self->{status} = -1;
1129 dpavlin 33
1130 dpavlin 41 warn "## $url\n" if ($self->{debug});
1131 dpavlin 36
1132 dpavlin 33 $url = new URI($url);
1133 dpavlin 37 if (
1134     !$url || !$url->scheme || !$url->scheme eq 'http' ||
1135     !$url->host || !$url->port || $url->port < 1
1136     ) {
1137     carp "can't parse $url\n";
1138     return -1;
1139     }
1140 dpavlin 33
1141     my ($host,$port,$query) = ($url->host, $url->port, $url->path);
1142    
1143     if ($self->{pxhost}) {
1144     ($host,$port) = ($self->{pxhost}, $self->{pxport});
1145     $query = "http://$host:$port/$query";
1146 dpavlin 2 }
1147    
1148 dpavlin 37 $query .= '?' . $url->query if ($url->query && ! $reqbody);
1149 dpavlin 2
1150 dpavlin 37 my $headers;
1151    
1152     if ($reqbody) {
1153     $headers .= "POST $query HTTP/1.0\r\n";
1154     } else {
1155     $headers .= "GET $query HTTP/1.0\r\n";
1156     }
1157    
1158 dpavlin 40 $headers .= "Host: " . $url->host . ":" . $url->port . "\r\n";
1159 dpavlin 37 $headers .= "Connection: close\r\n";
1160     $headers .= "User-Agent: Search-Estraier/$Search::Estraier::VERSION\r\n";
1161 dpavlin 40 $headers .= "Content-Type: $content_type\r\n";
1162 dpavlin 37 $headers .= "Authorization: Basic $self->{auth}\r\n";
1163     my $len = 0;
1164     {
1165     use bytes;
1166     $len = length($reqbody) if ($reqbody);
1167     }
1168     $headers .= "Content-Length: $len\r\n";
1169     $headers .= "\r\n";
1170    
1171 dpavlin 33 my $sock = IO::Socket::INET->new(
1172     PeerAddr => $host,
1173     PeerPort => $port,
1174     Proto => 'tcp',
1175     Timeout => $self->{timeout} || 90,
1176 dpavlin 37 );
1177 dpavlin 2
1178 dpavlin 37 if (! $sock) {
1179     carp "can't open socket to $host:$port";
1180     return -1;
1181 dpavlin 33 }
1182 dpavlin 2
1183 dpavlin 40 warn $headers if ($self->{debug});
1184 dpavlin 39
1185 dpavlin 37 print $sock $headers or
1186     carp "can't send headers to network:\n$headers\n" and return -1;
1187    
1188     if ($reqbody) {
1189 dpavlin 41 warn "$reqbody\n" if ($self->{debug});
1190 dpavlin 40 print $sock $reqbody or
1191 dpavlin 37 carp "can't send request body to network:\n$$reqbody\n" and return -1;
1192 dpavlin 33 }
1193 dpavlin 2
1194 dpavlin 33 my $line = <$sock>;
1195     chomp($line);
1196     my ($schema, $res_status, undef) = split(/ */, $line, 3);
1197     return if ($schema !~ /^HTTP/ || ! $res_status);
1198 dpavlin 2
1199 dpavlin 40 $self->{status} = $res_status;
1200 dpavlin 39 warn "## response status: $res_status\n" if ($self->{debug});
1201 dpavlin 2
1202 dpavlin 33 # skip rest of headers
1203 dpavlin 38 $line = <$sock>;
1204     while ($line) {
1205 dpavlin 33 $line = <$sock>;
1206 dpavlin 38 $line =~ s/[\r\n]+$//;
1207 dpavlin 40 warn "## ", $line || 'NULL', " ##\n" if ($self->{debug});
1208 dpavlin 38 };
1209 dpavlin 2
1210 dpavlin 33 # read body
1211 dpavlin 38 $len = 0;
1212 dpavlin 33 do {
1213     $len = read($sock, my $buf, 8192);
1214     $$resbody .= $buf if ($resbody);
1215     } while ($len);
1216    
1217 dpavlin 40 warn "## response body:\n$$resbody\n" if ($resbody && $self->{debug});
1218 dpavlin 39
1219 dpavlin 40 return $self->{status};
1220 dpavlin 2 }
1221    
1222 dpavlin 48
1223     =head2 set_info
1224    
1225     Set information for node
1226    
1227     $node->set_info;
1228    
1229     =cut
1230    
1231     sub set_info {
1232     my $self = shift;
1233    
1234     $self->{status} = -1;
1235     return unless ($self->{url});
1236    
1237     my $resbody;
1238     my $rv = $self->shuttle_url( $self->{url} . '/inform',
1239     'text/plain',
1240     undef,
1241     \$resbody,
1242     );
1243    
1244     return if ($rv != 200 || !$resbody);
1245    
1246     chomp($resbody);
1247    
1248     ( $self->{name}, $self->{label}, $self->{dnum}, $self->{wnum}, $self->{size} ) =
1249     split(/\t/, $resbody, 5);
1250    
1251     }
1252    
1253 dpavlin 2 ###
1254    
1255     =head1 EXPORT
1256    
1257     Nothing.
1258    
1259     =head1 SEE ALSO
1260    
1261     L<http://hyperestraier.sourceforge.net/>
1262    
1263     Hyper Estraier Ruby interface on which this module is based.
1264    
1265     =head1 AUTHOR
1266    
1267     Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
1268    
1269    
1270     =head1 COPYRIGHT AND LICENSE
1271    
1272 dpavlin 15 Copyright (C) 2005-2006 by Dobrica Pavlinusic
1273 dpavlin 2
1274     This library is free software; you can redistribute it and/or modify
1275     it under the GPL v2 or later.
1276    
1277     =cut
1278    
1279     1;

  ViewVC Help
Powered by ViewVC 1.1.26