/[Search-Estraier]/trunk/lib/Search/Estraier.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/Search/Estraier.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 63 - (hide annotations)
Sat Jan 7 16:19:31 2006 UTC (18 years, 3 months ago) by dpavlin
Original Path: trunk/Estraier.pm
File size: 27234 byte(s)
fix warning
1 dpavlin 2 package Search::Estraier;
2    
3     use 5.008;
4     use strict;
5     use warnings;
6    
7 dpavlin 59 our $VERSION = '0.01';
8 dpavlin 2
9     =head1 NAME
10    
11     Search::Estraier - pure perl module to use Hyper Estraier search engine
12    
13     =head1 SYNOPSIS
14    
15     use Search::Estraier;
16     my $est = new Search::Estraier();
17    
18     =head1 DESCRIPTION
19    
20     This module is implementation of node API of Hyper Estraier. Since it's
21     perl-only module with dependencies only on standard perl modules, it will
22     run on all platforms on which perl runs. It doesn't require compilation
23     or Hyper Estraier development files on target machine.
24    
25     It is implemented as multiple packages which closly resamble Ruby
26     implementation. It also includes methods to manage nodes.
27    
28     =cut
29    
30 dpavlin 42 =head1 Inheritable common methods
31    
32     This methods should really move somewhere else.
33    
34 dpavlin 15 =head2 _s
35    
36     Remove multiple whitespaces from string, as well as whitespaces at beginning or end
37    
38     my $text = $self->_s(" this is a text ");
39     $text = 'this is a text';
40    
41     =cut
42    
43     sub _s {
44     my $text = $_[1] || return;
45     $text =~ s/\s\s+/ /gs;
46     $text =~ s/^\s+//;
47     $text =~ s/\s+$//;
48     return $text;
49     }
50    
51 dpavlin 2 package Search::Estraier::Document;
52    
53 dpavlin 9 use Carp qw/croak confess/;
54 dpavlin 7
55 dpavlin 15 use Search::Estraier;
56     our @ISA = qw/Search::Estraier/;
57    
58 dpavlin 2 =head1 Search::Estraier::Document
59    
60 dpavlin 14 This class implements Document which is collection of attributes
61     (key=value), vectors (also key value) display text and hidden text.
62    
63 dpavlin 42
64 dpavlin 2 =head2 new
65    
66 dpavlin 14 Create new document, empty or from draft.
67    
68 dpavlin 2 my $doc = new Search::HyperEstraier::Document;
69 dpavlin 14 my $doc2 = new Search::HyperEstraier::Document( $draft );
70 dpavlin 2
71     =cut
72    
73     sub new {
74     my $class = shift;
75 dpavlin 14 my $self = {};
76 dpavlin 2 bless($self, $class);
77    
78 dpavlin 6 $self->{id} = -1;
79    
80 dpavlin 14 my $draft = shift;
81    
82     if ($draft) {
83     my $in_text = 0;
84     foreach my $line (split(/\n/, $draft)) {
85    
86     if ($in_text) {
87     if ($line =~ /^\t/) {
88     push @{ $self->{htexts} }, substr($line, 1);
89     } else {
90     push @{ $self->{dtexts} }, $line;
91     }
92     next;
93     }
94    
95     if ($line =~ m/^%VECTOR\t(.+)$/) {
96     my @fields = split(/\t/, $1);
97     for my $i ( 0 .. ($#fields - 1) ) {
98     $self->{kwords}->{ $fields[ $i ] } = $fields[ $i + 1 ];
99     $i++;
100     }
101     next;
102     } elsif ($line =~ m/^%/) {
103     # What is this? comment?
104     #warn "$line\n";
105     next;
106     } elsif ($line =~ m/^$/) {
107     $in_text = 1;
108     next;
109     } elsif ($line =~ m/^(.+)=(.+)$/) {
110     $self->{attrs}->{ $1 } = $2;
111     next;
112     }
113    
114     warn "draft ignored: $line\n";
115     }
116     }
117    
118 dpavlin 2 $self ? return $self : return undef;
119     }
120    
121 dpavlin 4
122 dpavlin 2 =head2 add_attr
123    
124 dpavlin 6 Add an attribute.
125    
126 dpavlin 2 $doc->add_attr( name => 'value' );
127    
128 dpavlin 9 Delete attribute using
129 dpavlin 5
130     $doc->add_attr( name => undef );
131    
132 dpavlin 2 =cut
133    
134     sub add_attr {
135     my $self = shift;
136     my $attrs = {@_};
137    
138     while (my ($name, $value) = each %{ $attrs }) {
139 dpavlin 9 if (! defined($value)) {
140 dpavlin 15 delete( $self->{attrs}->{ $self->_s($name) } );
141 dpavlin 9 } else {
142 dpavlin 15 $self->{attrs}->{ $self->_s($name) } = $self->_s($value);
143 dpavlin 9 }
144 dpavlin 2 }
145 dpavlin 8
146     return 1;
147 dpavlin 2 }
148    
149 dpavlin 5
150     =head2 add_text
151    
152 dpavlin 6 Add a sentence of text.
153    
154 dpavlin 5 $doc->add_text('this is example text to display');
155    
156     =cut
157    
158     sub add_text {
159     my $self = shift;
160     my $text = shift;
161     return unless defined($text);
162    
163 dpavlin 15 push @{ $self->{dtexts} }, $self->_s($text);
164 dpavlin 5 }
165    
166    
167     =head2 add_hidden_text
168    
169 dpavlin 6 Add a hidden sentence.
170    
171 dpavlin 5 $doc->add_hidden_text('this is example text just for search');
172    
173     =cut
174    
175     sub add_hidden_text {
176     my $self = shift;
177     my $text = shift;
178     return unless defined($text);
179    
180 dpavlin 15 push @{ $self->{htexts} }, $self->_s($text);
181 dpavlin 5 }
182    
183 dpavlin 42
184 dpavlin 6 =head2 id
185    
186     Get the ID number of document. If the object has never been registred, C<-1> is returned.
187    
188     print $doc->id;
189    
190     =cut
191    
192     sub id {
193     my $self = shift;
194     return $self->{id};
195     }
196    
197 dpavlin 42
198 dpavlin 7 =head2 attr_names
199    
200 dpavlin 9 Returns array with attribute names from document object.
201 dpavlin 7
202     my @attrs = $doc->attr_names;
203    
204     =cut
205    
206     sub attr_names {
207     my $self = shift;
208 dpavlin 60 return unless ($self->{attrs});
209     #croak "attr_names return array, not scalar" if (! wantarray);
210 dpavlin 7 return sort keys %{ $self->{attrs} };
211     }
212    
213 dpavlin 8
214     =head2 attr
215    
216 dpavlin 9 Returns value of an attribute.
217 dpavlin 8
218     my $value = $doc->attr( 'attribute' );
219    
220     =cut
221    
222     sub attr {
223     my $self = shift;
224     my $name = shift;
225 dpavlin 60 return unless (defined($name) && $self->{attrs});
226     return $self->{attrs}->{ $name };
227 dpavlin 8 }
228    
229 dpavlin 9
230     =head2 texts
231    
232     Returns array with text sentences.
233    
234     my @texts = $doc->texts;
235    
236     =cut
237    
238     sub texts {
239     my $self = shift;
240 dpavlin 60 #confess "texts return array, not scalar" if (! wantarray);
241     return @{ $self->{dtexts} } if ($self->{dtexts});
242 dpavlin 9 }
243    
244 dpavlin 42
245 dpavlin 12 =head2 cat_texts
246    
247     Return whole text as single scalar.
248    
249     my $text = $doc->cat_texts;
250    
251     =cut
252    
253     sub cat_texts {
254     my $self = shift;
255 dpavlin 60 return join(' ',@{ $self->{dtexts} }) if ($self->{dtexts});
256 dpavlin 12 }
257    
258 dpavlin 42
259 dpavlin 5 =head2 dump_draft
260    
261 dpavlin 13 Dump draft data from document object.
262    
263 dpavlin 5 print $doc->dump_draft;
264    
265     =cut
266    
267     sub dump_draft {
268 dpavlin 13 my $self = shift;
269     my $draft;
270    
271     foreach my $attr_name (sort keys %{ $self->{attrs} }) {
272     $draft .= $attr_name . '=' . $self->{attrs}->{$attr_name} . "\n";
273     }
274    
275     if ($self->{kwords}) {
276     $draft .= '%%VECTOR';
277     while (my ($key, $value) = each %{ $self->{kwords} }) {
278     $draft .= "\t$key\t$value";
279     }
280     $draft .= "\n";
281     }
282    
283     $draft .= "\n";
284    
285 dpavlin 40 $draft .= join("\n", @{ $self->{dtexts} }) . "\n" if ($self->{dtexts});
286     $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n" if ($self->{htexts});
287 dpavlin 13
288     return $draft;
289 dpavlin 5 }
290    
291 dpavlin 42
292 dpavlin 4 =head2 delete
293 dpavlin 2
294 dpavlin 4 Empty document object
295 dpavlin 2
296 dpavlin 4 $doc->delete;
297    
298 dpavlin 15 This function is addition to original Ruby API, and since it was included in C wrappers it's here as a
299     convinience. Document objects which go out of scope will be destroyed
300     automatically.
301    
302 dpavlin 4 =cut
303    
304     sub delete {
305     my $self = shift;
306    
307 dpavlin 14 foreach my $data (qw/attrs dtexts stexts kwords/) {
308 dpavlin 5 delete($self->{$data});
309     }
310 dpavlin 4
311 dpavlin 10 $self->{id} = -1;
312    
313 dpavlin 4 return 1;
314     }
315    
316    
317    
318 dpavlin 15 package Search::Estraier::Condition;
319 dpavlin 4
320 dpavlin 16 use Carp qw/confess croak/;
321    
322 dpavlin 15 use Search::Estraier;
323     our @ISA = qw/Search::Estraier/;
324 dpavlin 4
325 dpavlin 16 =head1 Search::Estraier::Condition
326    
327     =head2 new
328    
329     my $cond = new Search::HyperEstraier::Condition;
330    
331     =cut
332    
333     sub new {
334     my $class = shift;
335     my $self = {};
336     bless($self, $class);
337    
338 dpavlin 19 $self->{max} = -1;
339     $self->{options} = 0;
340    
341 dpavlin 16 $self ? return $self : return undef;
342     }
343    
344 dpavlin 42
345 dpavlin 16 =head2 set_phrase
346    
347     $cond->set_phrase('search phrase');
348    
349     =cut
350    
351     sub set_phrase {
352     my $self = shift;
353     $self->{phrase} = $self->_s( shift );
354     }
355    
356 dpavlin 42
357 dpavlin 16 =head2 add_attr
358    
359     $cond->add_attr('@URI STRINC /~dpavlin/');
360    
361     =cut
362    
363     sub add_attr {
364     my $self = shift;
365     my $attr = shift || return;
366     push @{ $self->{attrs} }, $self->_s( $attr );
367     }
368    
369 dpavlin 42
370 dpavlin 16 =head2 set_order
371    
372     $cond->set_order('@mdate NUMD');
373    
374     =cut
375    
376     sub set_order {
377     my $self = shift;
378     $self->{order} = shift;
379     }
380    
381 dpavlin 42
382 dpavlin 16 =head2 set_max
383    
384     $cond->set_max(42);
385    
386     =cut
387    
388     sub set_max {
389     my $self = shift;
390     my $max = shift;
391 dpavlin 43 croak "set_max needs number, not '$max'" unless ($max =~ m/^\d+$/);
392 dpavlin 16 $self->{max} = $max;
393     }
394    
395 dpavlin 42
396 dpavlin 16 =head2 set_options
397    
398     $cond->set_options( SURE => 1 );
399    
400     =cut
401    
402 dpavlin 15 my $options = {
403     # check N-gram keys skipping by three
404     SURE => 1 << 0,
405     # check N-gram keys skipping by two
406     USUAL => 1 << 1,
407     # without TF-IDF tuning
408     FAST => 1 << 2,
409     # with the simplified phrase
410     AGITO => 1 << 3,
411     # check every N-gram key
412     NOIDF => 1 << 4,
413     # check N-gram keys skipping by one
414     SIMPLE => 1 << 10,
415     };
416    
417 dpavlin 16 sub set_options {
418     my $self = shift;
419     my $option = shift;
420     confess "unknown option" unless ($options->{$option});
421     $self->{options} ||= $options->{$option};
422 dpavlin 4 }
423    
424 dpavlin 42
425 dpavlin 18 =head2 phrase
426    
427     Return search phrase.
428    
429     print $cond->phrase;
430    
431     =cut
432    
433     sub phrase {
434     my $self = shift;
435     return $self->{phrase};
436     }
437    
438 dpavlin 42
439 dpavlin 19 =head2 order
440 dpavlin 18
441 dpavlin 19 Return search result order.
442    
443     print $cond->order;
444    
445     =cut
446    
447     sub order {
448     my $self = shift;
449     return $self->{order};
450     }
451    
452 dpavlin 42
453 dpavlin 19 =head2 attrs
454    
455     Return search result attrs.
456    
457     my @cond_attrs = $cond->attrs;
458    
459     =cut
460    
461     sub attrs {
462     my $self = shift;
463     #croak "attrs return array, not scalar" if (! wantarray);
464 dpavlin 60 return @{ $self->{attrs} } if ($self->{attrs});
465 dpavlin 19 }
466    
467 dpavlin 42
468 dpavlin 19 =head2 max
469    
470     Return maximum number of results.
471    
472     print $cond->max;
473    
474     C<-1> is returned for unitialized value, C<0> is unlimited.
475    
476     =cut
477    
478     sub max {
479     my $self = shift;
480     return $self->{max};
481     }
482    
483 dpavlin 42
484 dpavlin 19 =head2 options
485    
486     Return options for this condition.
487    
488     print $cond->options;
489    
490     Options are returned in numerical form.
491    
492     =cut
493    
494     sub options {
495     my $self = shift;
496     return $self->{options};
497     }
498    
499    
500 dpavlin 20 package Search::Estraier::ResultDocument;
501    
502 dpavlin 24 use Carp qw/croak/;
503 dpavlin 20
504 dpavlin 24 #use Search::Estraier;
505     #our @ISA = qw/Search::Estraier/;
506 dpavlin 20
507     =head1 Search::Estraier::ResultDocument
508    
509     =head2 new
510    
511 dpavlin 23 my $rdoc = new Search::HyperEstraier::ResultDocument(
512 dpavlin 20 uri => 'http://localhost/document/uri/42',
513     attrs => {
514     foo => 1,
515     bar => 2,
516     },
517     snippet => 'this is a text of snippet'
518     keywords => 'this\tare\tkeywords'
519     );
520    
521     =cut
522    
523     sub new {
524     my $class = shift;
525     my $self = {@_};
526     bless($self, $class);
527    
528 dpavlin 62 croak "missing uri for ResultDocument" unless defined($self->{uri});
529 dpavlin 20
530     $self ? return $self : return undef;
531     }
532    
533 dpavlin 42
534 dpavlin 23 =head2 uri
535 dpavlin 20
536 dpavlin 23 Return URI of result document
537 dpavlin 20
538 dpavlin 23 print $rdoc->uri;
539    
540     =cut
541    
542     sub uri {
543     my $self = shift;
544     return $self->{uri};
545     }
546    
547    
548     =head2 attr_names
549    
550     Returns array with attribute names from result document object.
551    
552     my @attrs = $rdoc->attr_names;
553    
554     =cut
555    
556     sub attr_names {
557     my $self = shift;
558     croak "attr_names return array, not scalar" if (! wantarray);
559     return sort keys %{ $self->{attrs} };
560     }
561    
562 dpavlin 42
563 dpavlin 23 =head2 attr
564    
565     Returns value of an attribute.
566    
567     my $value = $rdoc->attr( 'attribute' );
568    
569     =cut
570    
571     sub attr {
572     my $self = shift;
573     my $name = shift || return;
574     return $self->{attrs}->{ $name };
575     }
576    
577 dpavlin 42
578 dpavlin 23 =head2 snippet
579    
580     Return snippet from result document
581    
582     print $rdoc->snippet;
583    
584     =cut
585    
586     sub snippet {
587     my $self = shift;
588     return $self->{snippet};
589     }
590    
591 dpavlin 42
592 dpavlin 23 =head2 keywords
593    
594     Return keywords from result document
595    
596     print $rdoc->keywords;
597    
598     =cut
599    
600     sub keywords {
601     my $self = shift;
602     return $self->{keywords};
603     }
604    
605    
606 dpavlin 25 package Search::Estraier::NodeResult;
607    
608     use Carp qw/croak/;
609    
610     #use Search::Estraier;
611     #our @ISA = qw/Search::Estraier/;
612    
613     =head1 Search::Estraier::NodeResult
614    
615     =head2 new
616    
617     my $res = new Search::HyperEstraier::NodeResult(
618     docs => @array_of_rdocs,
619     hits => %hash_with_hints,
620     );
621    
622     =cut
623    
624     sub new {
625     my $class = shift;
626     my $self = {@_};
627     bless($self, $class);
628    
629     foreach my $f (qw/docs hints/) {
630     croak "missing $f for ResultDocument" unless defined($self->{$f});
631     }
632    
633     $self ? return $self : return undef;
634     }
635    
636 dpavlin 42
637 dpavlin 25 =head2 doc_num
638    
639     Return number of documents
640    
641     print $res->doc_num;
642    
643     =cut
644    
645     sub doc_num {
646     my $self = shift;
647 dpavlin 53 return $#{$self->{docs}} + 1;
648 dpavlin 25 }
649    
650 dpavlin 42
651 dpavlin 25 =head2 get_doc
652    
653     Return single document
654    
655     my $doc = $res->get_doc( 42 );
656    
657     Returns undef if document doesn't exist.
658    
659     =cut
660    
661     sub get_doc {
662     my $self = shift;
663     my $num = shift;
664 dpavlin 43 croak "expect number as argument, not '$num'" unless ($num =~ m/^\d+$/);
665 dpavlin 25 return undef if ($num < 0 || $num > $self->{docs});
666     return $self->{docs}->[$num];
667     }
668    
669 dpavlin 42
670 dpavlin 25 =head2 hint
671    
672     Return specific hint from results.
673    
674     print $rec->hint( 'VERSION' );
675    
676     Possible hints are: C<VERSION>, C<NODE>, C<HIT>, C<HINT#n>, C<DOCNUM>, C<WORDNUM>,
677     C<TIME>, C<LINK#n>, C<VIEW>.
678    
679     =cut
680    
681     sub hint {
682     my $self = shift;
683     my $key = shift || return;
684     return $self->{hints}->{$key};
685     }
686    
687    
688 dpavlin 27 package Search::Estraier::Node;
689    
690 dpavlin 44 use Carp qw/carp croak confess/;
691 dpavlin 33 use URI;
692 dpavlin 36 use MIME::Base64;
693 dpavlin 33 use IO::Socket::INET;
694 dpavlin 49 use URI::Escape qw/uri_escape/;
695 dpavlin 29
696 dpavlin 27 =head1 Search::Estraier::Node
697    
698     =head2 new
699    
700     my $node = new Search::HyperEstraier::Node;
701    
702     =cut
703    
704     sub new {
705     my $class = shift;
706     my $self = {
707     pxport => -1,
708 dpavlin 33 timeout => 0, # this used to be -1
709 dpavlin 27 dnum => -1,
710     wnum => -1,
711     size => -1.0,
712     wwidth => 480,
713     hwidth => 96,
714     awidth => 96,
715     status => -1,
716     };
717     bless($self, $class);
718    
719 dpavlin 57 my $args = {@_};
720 dpavlin 39
721 dpavlin 57 $self->{debug} = $args->{debug};
722     warn "## Node debug on\n" if ($self->{debug});
723    
724 dpavlin 27 $self ? return $self : return undef;
725     }
726    
727 dpavlin 42
728 dpavlin 29 =head2 set_url
729    
730     Specify URL to node server
731    
732     $node->set_url('http://localhost:1978');
733    
734     =cut
735    
736     sub set_url {
737     my $self = shift;
738     $self->{url} = shift;
739     }
740    
741 dpavlin 42
742 dpavlin 29 =head2 set_proxy
743    
744     Specify proxy server to connect to node server
745    
746     $node->set_proxy('proxy.example.com', 8080);
747    
748     =cut
749    
750     sub set_proxy {
751     my $self = shift;
752     my ($host,$port) = @_;
753 dpavlin 43 croak "proxy port must be number, not '$port'" unless ($port =~ m/^\d+$/);
754 dpavlin 29 $self->{pxhost} = $host;
755     $self->{pxport} = $port;
756     }
757    
758 dpavlin 42
759 dpavlin 30 =head2 set_timeout
760    
761     Specify timeout of connection in seconds
762    
763     $node->set_timeout( 15 );
764    
765     =cut
766    
767     sub set_timeout {
768     my $self = shift;
769     my $sec = shift;
770 dpavlin 43 croak "timeout must be number, not '$sec'" unless ($sec =~ m/^\d+$/);
771 dpavlin 30 $self->{timeout} = $sec;
772     }
773    
774 dpavlin 42
775 dpavlin 31 =head2 set_auth
776    
777     Specify name and password for authentication to node server.
778    
779     $node->set_auth('clint','eastwood');
780    
781     =cut
782    
783     sub set_auth {
784     my $self = shift;
785     my ($login,$passwd) = @_;
786 dpavlin 40 my $basic_auth = encode_base64( "$login:$passwd" );
787     chomp($basic_auth);
788     $self->{auth} = $basic_auth;
789 dpavlin 31 }
790    
791 dpavlin 42
792 dpavlin 32 =head2 status
793    
794     Return status code of last request.
795    
796 dpavlin 40 print $node->status;
797 dpavlin 32
798     C<-1> means connection failure.
799    
800     =cut
801    
802     sub status {
803     my $self = shift;
804     return $self->{status};
805     }
806    
807 dpavlin 42
808 dpavlin 40 =head2 put_doc
809    
810 dpavlin 41 Add a document
811 dpavlin 40
812 dpavlin 41 $node->put_doc( $document_draft ) or die "can't add document";
813    
814     Return true on success or false on failture.
815    
816 dpavlin 40 =cut
817    
818     sub put_doc {
819     my $self = shift;
820     my $doc = shift || return;
821 dpavlin 47 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
822 dpavlin 41 $self->shuttle_url( $self->{url} . '/put_doc',
823     'text/x-estraier-draft',
824     $doc->dump_draft,
825     undef
826     ) == 200;
827 dpavlin 40 }
828    
829 dpavlin 41
830     =head2 out_doc
831    
832     Remove a document
833    
834     $node->out_doc( document_id ) or "can't remove document";
835    
836     Return true on success or false on failture.
837    
838     =cut
839    
840     sub out_doc {
841     my $self = shift;
842     my $id = shift || return;
843     return unless ($self->{url});
844 dpavlin 43 croak "id must be number, not '$id'" unless ($id =~ m/^\d+$/);
845 dpavlin 41 $self->shuttle_url( $self->{url} . '/out_doc',
846     'application/x-www-form-urlencoded',
847     "id=$id",
848     undef
849     ) == 200;
850     }
851    
852    
853     =head2 out_doc_by_uri
854    
855     Remove a registrated document using it's uri
856    
857 dpavlin 45 $node->out_doc_by_uri( 'file:///document/uri/42' ) or "can't remove document";
858 dpavlin 41
859     Return true on success or false on failture.
860    
861     =cut
862    
863     sub out_doc_by_uri {
864     my $self = shift;
865     my $uri = shift || return;
866     return unless ($self->{url});
867     $self->shuttle_url( $self->{url} . '/out_doc',
868     'application/x-www-form-urlencoded',
869 dpavlin 50 "uri=" . uri_escape($uri),
870 dpavlin 41 undef
871     ) == 200;
872     }
873    
874 dpavlin 42
875     =head2 edit_doc
876    
877     Edit attributes of a document
878    
879     $node->edit_doc( $document_draft ) or die "can't edit document";
880    
881     Return true on success or false on failture.
882    
883     =cut
884    
885     sub edit_doc {
886     my $self = shift;
887     my $doc = shift || return;
888 dpavlin 47 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
889 dpavlin 42 $self->shuttle_url( $self->{url} . '/edit_doc',
890     'text/x-estraier-draft',
891     $doc->dump_draft,
892     undef
893     ) == 200;
894     }
895    
896    
897 dpavlin 43 =head2 get_doc
898    
899     Retreive document
900    
901     my $doc = $node->get_doc( document_id ) or die "can't get document";
902    
903     Return true on success or false on failture.
904    
905     =cut
906    
907     sub get_doc {
908     my $self = shift;
909     my $id = shift || return;
910     return $self->_fetch_doc( id => $id );
911     }
912    
913 dpavlin 44
914 dpavlin 43 =head2 get_doc_by_uri
915    
916     Retreive document
917    
918 dpavlin 45 my $doc = $node->get_doc_by_uri( 'file:///document/uri/42' ) or die "can't get document";
919 dpavlin 43
920     Return true on success or false on failture.
921    
922     =cut
923    
924     sub get_doc_by_uri {
925     my $self = shift;
926     my $uri = shift || return;
927     return $self->_fetch_doc( uri => $uri );
928     }
929    
930 dpavlin 44
931 dpavlin 49 =head2 get_doc_attr
932    
933     Retrieve the value of an atribute from object
934    
935     my $val = $node->get_doc_attr( document_id, 'attribute_name' ) or
936     die "can't get document attribute";
937    
938     =cut
939    
940     sub get_doc_attr {
941     my $self = shift;
942     my ($id,$name) = @_;
943     return unless ($id && $name);
944     return $self->_fetch_doc( id => $id, attr => $name );
945     }
946    
947    
948     =head2 get_doc_attr_by_uri
949    
950     Retrieve the value of an atribute from object
951    
952     my $val = $node->get_doc_attr_by_uri( document_id, 'attribute_name' ) or
953     die "can't get document attribute";
954    
955     =cut
956    
957     sub get_doc_attr_by_uri {
958     my $self = shift;
959     my ($uri,$name) = @_;
960     return unless ($uri && $name);
961     return $self->_fetch_doc( uri => $uri, attr => $name );
962     }
963    
964    
965 dpavlin 44 =head2 etch_doc
966    
967     Exctract document keywords
968    
969     my $keywords = $node->etch_doc( document_id ) or die "can't etch document";
970    
971     =cut
972    
973 dpavlin 49 sub etch_doc {
974 dpavlin 44 my $self = shift;
975     my $id = shift || return;
976     return $self->_fetch_doc( id => $id, etch => 1 );
977     }
978    
979     =head2 etch_doc_by_uri
980    
981     Retreive document
982    
983 dpavlin 45 my $keywords = $node->etch_doc_by_uri( 'file:///document/uri/42' ) or die "can't etch document";
984 dpavlin 44
985     Return true on success or false on failture.
986    
987     =cut
988    
989     sub etch_doc_by_uri {
990     my $self = shift;
991     my $uri = shift || return;
992     return $self->_fetch_doc( uri => $uri, etch => 1 );
993     }
994    
995    
996 dpavlin 45 =head2 uri_to_id
997    
998     Get ID of document specified by URI
999    
1000     my $id = $node->uri_to_id( 'file:///document/uri/42' );
1001    
1002     =cut
1003    
1004     sub uri_to_id {
1005     my $self = shift;
1006     my $uri = shift || return;
1007     return $self->_fetch_doc( uri => $uri, path => '/uri_to_id', chomp_resbody => 1 );
1008     }
1009    
1010    
1011 dpavlin 43 =head2 _fetch_doc
1012    
1013 dpavlin 44 Private function used for implementing of C<get_doc>, C<get_doc_by_uri>,
1014     C<etch_doc>, C<etch_doc_by_uri>.
1015 dpavlin 43
1016 dpavlin 45 # this will decode received draft into Search::Estraier::Document object
1017     my $doc = $node->_fetch_doc( id => 42 );
1018     my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42' );
1019 dpavlin 43
1020 dpavlin 45 # to extract keywords, add etch
1021     my $doc = $node->_fetch_doc( id => 42, etch => 1 );
1022     my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', etch => 1 );
1023    
1024 dpavlin 49 # to get document attrubute add attr
1025     my $doc = $node->_fetch_doc( id => 42, attr => '@mdate' );
1026     my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', attr => '@mdate' );
1027    
1028 dpavlin 45 # more general form which allows implementation of
1029     # uri_to_id
1030     my $id = $node->_fetch_doc(
1031     uri => 'file:///document/uri/42',
1032     path => '/uri_to_id',
1033     chomp_resbody => 1
1034     );
1035    
1036 dpavlin 43 =cut
1037    
1038     sub _fetch_doc {
1039     my $self = shift;
1040 dpavlin 44 my $a = {@_};
1041     return unless ( ($a->{id} || $a->{uri}) && $self->{url} );
1042    
1043     my ($arg, $resbody);
1044    
1045 dpavlin 45 my $path = $a->{path} || '/get_doc';
1046 dpavlin 44 $path = '/etch_doc' if ($a->{etch});
1047    
1048     if ($a->{id}) {
1049     croak "id must be numberm not '$a->{id}'" unless ($a->{id} =~ m/^\d+$/);
1050     $arg = 'id=' . $a->{id};
1051     } elsif ($a->{uri}) {
1052 dpavlin 50 $arg = 'uri=' . uri_escape($a->{uri});
1053 dpavlin 44 } else {
1054     confess "unhandled argument. Need id or uri.";
1055 dpavlin 43 }
1056 dpavlin 44
1057 dpavlin 49 if ($a->{attr}) {
1058     $path = '/get_doc_attr';
1059     $arg .= '&attr=' . uri_escape($a->{attr});
1060     $a->{chomp_resbody} = 1;
1061     }
1062    
1063 dpavlin 44 my $rv = $self->shuttle_url( $self->{url} . $path,
1064 dpavlin 43 'application/x-www-form-urlencoded',
1065 dpavlin 44 $arg,
1066 dpavlin 45 \$resbody,
1067 dpavlin 43 );
1068 dpavlin 44
1069 dpavlin 43 return if ($rv != 200);
1070 dpavlin 44
1071     if ($a->{etch}) {
1072     $self->{kwords} = {};
1073     return +{} unless ($resbody);
1074     foreach my $l (split(/\n/, $resbody)) {
1075     my ($k,$v) = split(/\t/, $l, 2);
1076     $self->{kwords}->{$k} = $v if ($v);
1077     }
1078     return $self->{kwords};
1079 dpavlin 45 } elsif ($a->{chomp_resbody}) {
1080     return unless (defined($resbody));
1081     chomp($resbody);
1082     return $resbody;
1083 dpavlin 44 } else {
1084     return new Search::Estraier::Document($resbody);
1085     }
1086 dpavlin 43 }
1087    
1088    
1089 dpavlin 48 =head2 name
1090 dpavlin 43
1091 dpavlin 48 my $node_name = $node->name;
1092 dpavlin 43
1093 dpavlin 48 =cut
1094    
1095     sub name {
1096     my $self = shift;
1097 dpavlin 55 $self->_set_info unless ($self->{name});
1098 dpavlin 48 return $self->{name};
1099     }
1100    
1101    
1102     =head2 label
1103    
1104     my $node_label = $node->label;
1105    
1106     =cut
1107    
1108     sub label {
1109     my $self = shift;
1110 dpavlin 55 $self->_set_info unless ($self->{label});
1111 dpavlin 48 return $self->{label};
1112     }
1113    
1114    
1115     =head2 doc_num
1116    
1117     my $documents_in_node = $node->doc_num;
1118    
1119     =cut
1120    
1121     sub doc_num {
1122     my $self = shift;
1123 dpavlin 55 $self->_set_info if ($self->{dnum} < 0);
1124 dpavlin 48 return $self->{dnum};
1125     }
1126    
1127    
1128     =head2 word_num
1129    
1130     my $words_in_node = $node->word_num;
1131    
1132     =cut
1133    
1134     sub word_num {
1135     my $self = shift;
1136 dpavlin 55 $self->_set_info if ($self->{wnum} < 0);
1137 dpavlin 48 return $self->{wnum};
1138     }
1139    
1140    
1141     =head2 size
1142    
1143     my $node_size = $node->size;
1144    
1145     =cut
1146    
1147     sub size {
1148     my $self = shift;
1149 dpavlin 55 $self->_set_info if ($self->{size} < 0);
1150 dpavlin 48 return $self->{size};
1151     }
1152    
1153    
1154 dpavlin 51 =head2 search
1155 dpavlin 48
1156 dpavlin 51 Search documents which match condition
1157    
1158     my $nres = $node->search( $cond, $depth );
1159    
1160     C<$cond> is C<Search::Estraier::Condition> object, while <$depth> specifies
1161     depth for meta search.
1162    
1163     Function results C<Search::Estraier::NodeResult> object.
1164    
1165     =cut
1166    
1167     sub search {
1168     my $self = shift;
1169     my ($cond, $depth) = @_;
1170     return unless ($cond && defined($depth) && $self->{url});
1171     croak "cond mush be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1172     croak "depth needs number, not '$depth'" unless ($depth =~ m/^\d+$/);
1173    
1174 dpavlin 52 my $resbody;
1175 dpavlin 51
1176 dpavlin 52 my $rv = $self->shuttle_url( $self->{url} . '/search',
1177 dpavlin 53 'application/x-www-form-urlencoded',
1178 dpavlin 61 $self->cond_to_query( $cond, $depth ),
1179 dpavlin 52 \$resbody,
1180     );
1181     return if ($rv != 200);
1182    
1183     my (@docs, $hints);
1184    
1185     my @lines = split(/\n/, $resbody);
1186     return unless (@lines);
1187    
1188     my $border = $lines[0];
1189     my $isend = 0;
1190     my $lnum = 1;
1191    
1192     while ( $lnum <= $#lines ) {
1193     my $line = $lines[$lnum];
1194     $lnum++;
1195    
1196     #warn "## $line\n";
1197     if ($line && $line =~ m/^\Q$border\E(:END)*$/) {
1198     $isend = $1;
1199     last;
1200     }
1201    
1202     if ($line =~ /\t/) {
1203     my ($k,$v) = split(/\t/, $line, 2);
1204     $hints->{$k} = $v;
1205     }
1206     }
1207    
1208     my $snum = $lnum;
1209    
1210     while( ! $isend && $lnum <= $#lines ) {
1211     my $line = $lines[$lnum];
1212 dpavlin 53 #warn "# $lnum: $line\n";
1213 dpavlin 52 $lnum++;
1214    
1215     if ($line && $line =~ m/^\Q$border\E/) {
1216     if ($lnum > $snum) {
1217     my $rdattrs;
1218     my $rdvector;
1219     my $rdsnippet;
1220    
1221     my $rlnum = $snum;
1222     while ($rlnum < $lnum - 1 ) {
1223     #my $rdline = $self->_s($lines[$rlnum]);
1224     my $rdline = $lines[$rlnum];
1225     $rlnum++;
1226     last unless ($rdline);
1227     if ($rdline =~ /^%/) {
1228     $rdvector = $1 if ($rdline =~ /^%VECTOR\t(.+)$/);
1229 dpavlin 53 } elsif($rdline =~ /=/) {
1230     $rdattrs->{$1} = $2 if ($rdline =~ /^(.+)=(.+)$/);
1231 dpavlin 52 } else {
1232 dpavlin 53 confess "invalid format of response";
1233 dpavlin 52 }
1234     }
1235     while($rlnum < $lnum - 1) {
1236     my $rdline = $lines[$rlnum];
1237     $rlnum++;
1238     $rdsnippet .= "$rdline\n";
1239     }
1240 dpavlin 53 #warn Dumper($rdvector, $rdattrs, $rdsnippet);
1241 dpavlin 52 if (my $rduri = $rdattrs->{'@uri'}) {
1242     push @docs, new Search::Estraier::ResultDocument(
1243     uri => $rduri,
1244     attrs => $rdattrs,
1245     snippet => $rdsnippet,
1246     keywords => $rdvector,
1247     );
1248     }
1249     }
1250     $snum = $lnum;
1251     #warn "### $line\n";
1252     $isend = 1 if ($line =~ /:END$/);
1253     }
1254    
1255     }
1256    
1257     if (! $isend) {
1258     warn "received result doesn't have :END\n$resbody";
1259     return;
1260     }
1261    
1262 dpavlin 53 #warn Dumper(\@docs, $hints);
1263    
1264 dpavlin 52 return new Search::Estraier::NodeResult( docs => \@docs, hints => $hints );
1265 dpavlin 51 }
1266    
1267    
1268     =head2 cond_to_query
1269    
1270 dpavlin 55 Return URI encoded string generated from Search::Estraier::Condition
1271    
1272 dpavlin 61 my $args = $node->cond_to_query( $cond, $depth );
1273 dpavlin 51
1274     =cut
1275    
1276     sub cond_to_query {
1277     my $self = shift;
1278    
1279     my $cond = shift || return;
1280     croak "condition must be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1281 dpavlin 61 my $depth = shift;
1282 dpavlin 51
1283     my @args;
1284    
1285     if (my $phrase = $cond->phrase) {
1286     push @args, 'phrase=' . uri_escape($phrase);
1287     }
1288    
1289     if (my @attrs = $cond->attrs) {
1290     for my $i ( 0 .. $#attrs ) {
1291 dpavlin 63 push @args,'attr' . ($i+1) . '=' . uri_escape( $attrs[$i] ) if ($attrs[$i]);
1292 dpavlin 51 }
1293     }
1294    
1295     if (my $order = $cond->order) {
1296     push @args, 'order=' . uri_escape($order);
1297     }
1298    
1299     if (my $max = $cond->max) {
1300     push @args, 'max=' . $max;
1301     } else {
1302     push @args, 'max=' . (1 << 30);
1303     }
1304    
1305     if (my $options = $cond->options) {
1306     push @args, 'options=' . $options;
1307     }
1308    
1309 dpavlin 61 push @args, 'depth=' . $depth if ($depth);
1310 dpavlin 51 push @args, 'wwidth=' . $self->{wwidth};
1311     push @args, 'hwidth=' . $self->{hwidth};
1312     push @args, 'awidth=' . $self->{awidth};
1313    
1314     return join('&', @args);
1315     }
1316    
1317    
1318 dpavlin 33 =head2 shuttle_url
1319 dpavlin 32
1320 dpavlin 33 This is method which uses C<IO::Socket::INET> to communicate with Hyper Estraier node
1321     master.
1322 dpavlin 2
1323 dpavlin 52 my $rv = shuttle_url( $url, $content_type, $req_body, \$resbody );
1324 dpavlin 2
1325 dpavlin 33 C<$resheads> and C<$resbody> booleans controll if response headers and/or response
1326     body will be saved within object.
1327 dpavlin 2
1328     =cut
1329    
1330 dpavlin 59 use LWP::UserAgent;
1331    
1332 dpavlin 33 sub shuttle_url {
1333     my $self = shift;
1334 dpavlin 2
1335 dpavlin 33 my ($url, $content_type, $reqbody, $resbody) = @_;
1336 dpavlin 2
1337 dpavlin 40 $self->{status} = -1;
1338 dpavlin 33
1339 dpavlin 41 warn "## $url\n" if ($self->{debug});
1340 dpavlin 36
1341 dpavlin 33 $url = new URI($url);
1342 dpavlin 37 if (
1343     !$url || !$url->scheme || !$url->scheme eq 'http' ||
1344     !$url->host || !$url->port || $url->port < 1
1345     ) {
1346     carp "can't parse $url\n";
1347     return -1;
1348     }
1349 dpavlin 33
1350 dpavlin 59 my $ua = LWP::UserAgent->new;
1351     $ua->agent( "Search-Estraier/$Search::Estraier::VERSION" );
1352 dpavlin 33
1353 dpavlin 59 my $req;
1354 dpavlin 37 if ($reqbody) {
1355 dpavlin 59 $req = HTTP::Request->new(POST => $url);
1356 dpavlin 37 } else {
1357 dpavlin 59 $req = HTTP::Request->new(GET => $url);
1358 dpavlin 37 }
1359    
1360 dpavlin 59 $req->headers->header( 'Host' => $url->host . ":" . $url->port );
1361     $req->headers->header( 'Connection', 'close' );
1362     $req->headers->header( 'Authorization', 'Basic ' . $self->{auth} );
1363     $req->content_type( $content_type );
1364 dpavlin 37
1365 dpavlin 59 warn $req->headers->as_string,"\n" if ($self->{debug});
1366 dpavlin 2
1367 dpavlin 37 if ($reqbody) {
1368 dpavlin 41 warn "$reqbody\n" if ($self->{debug});
1369 dpavlin 59 $req->content( $reqbody );
1370 dpavlin 33 }
1371 dpavlin 2
1372 dpavlin 59 my $res = $ua->request($req) || croak "can't make request to $url: $!";
1373 dpavlin 2
1374 dpavlin 59 warn "## response status: ",$res->status_line,"\n" if ($self->{debug});
1375 dpavlin 2
1376 dpavlin 59 return -1 if (! $res->is_success);
1377 dpavlin 2
1378 dpavlin 59 ($self->{status}, $self->{status_message}) = split(/\s+/, $res->status_line, 2);
1379 dpavlin 33
1380 dpavlin 59 $$resbody .= $res->content;
1381    
1382 dpavlin 40 warn "## response body:\n$$resbody\n" if ($resbody && $self->{debug});
1383 dpavlin 39
1384 dpavlin 40 return $self->{status};
1385 dpavlin 2 }
1386    
1387 dpavlin 48
1388 dpavlin 55 =head2 set_snippet_width
1389 dpavlin 48
1390 dpavlin 55 Set width of snippets in results
1391    
1392     $node->set_snippet_width( $wwidth, $hwidth, $awidth );
1393    
1394     C<$wwidth> specifies whole width of snippet. It's C<480> by default. If it's C<0> snippet
1395     is not sent with results. If it is negative, whole document text is sent instead of snippet.
1396    
1397     C<$hwidth> specified width of strings from beginning of string. Default
1398     value is C<96>. Negative or zero value keep previous value.
1399    
1400     C<$awidth> specifies width of strings around each highlighted word. It's C<96> by default.
1401     If negative of zero value is provided previous value is kept unchanged.
1402    
1403     =cut
1404    
1405     sub set_snippet_width {
1406     my $self = shift;
1407    
1408     my ($wwidth, $hwidth, $awidth) = @_;
1409     $self->{wwidth} = $wwidth;
1410     $self->{hwidth} = $hwidth if ($hwidth >= 0);
1411     $self->{awidth} = $awidth if ($awidth >= 0);
1412     }
1413    
1414    
1415 dpavlin 56 =head2 set_user
1416 dpavlin 55
1417 dpavlin 56 Manage users of node
1418    
1419     $node->set_user( 'name', $mode );
1420    
1421     C<$mode> can be one of:
1422    
1423     =over 4
1424    
1425     =item 0
1426    
1427     delete account
1428    
1429     =item 1
1430    
1431     set administrative right for user
1432    
1433     =item 2
1434    
1435     set user account as guest
1436    
1437     =back
1438    
1439     Return true on success, otherwise false.
1440    
1441     =cut
1442    
1443     sub set_user {
1444     my $self = shift;
1445     my ($name, $mode) = @_;
1446    
1447     return unless ($self->{url});
1448     croak "mode must be number, not '$mode'" unless ($mode =~ m/^\d+$/);
1449    
1450     $self->shuttle_url( $self->{url} . '/_set_user',
1451     'text/plain',
1452     'name=' . uri_escape($name) . '&mode=' . $mode,
1453     undef
1454     ) == 200;
1455     }
1456    
1457    
1458 dpavlin 57 =head2 set_link
1459    
1460     Manage node links
1461    
1462     $node->set_link('http://localhost:1978/node/another', 'another node label', $credit);
1463    
1464     If C<$credit> is negative, link is removed.
1465    
1466     =cut
1467    
1468     sub set_link {
1469     my $self = shift;
1470     my ($url, $label, $credit) = @_;
1471    
1472     return unless ($self->{url});
1473     croak "mode credit be number, not '$credit'" unless ($credit =~ m/^\d+$/);
1474    
1475     my $reqbody = 'url=' . uri_escape($url) . '&label=' . uri_escape($label);
1476     $reqbody .= '&credit=' . $credit if ($credit > 0);
1477    
1478     $self->shuttle_url( $self->{url} . '/_set_link',
1479     'text/plain',
1480     $reqbody,
1481     undef
1482     ) == 200;
1483     }
1484    
1485    
1486 dpavlin 55 =head1 PRIVATE METHODS
1487    
1488     You could call those directly, but you don't have to. I hope.
1489    
1490     =head2 _set_info
1491    
1492 dpavlin 48 Set information for node
1493    
1494 dpavlin 55 $node->_set_info;
1495 dpavlin 48
1496     =cut
1497    
1498 dpavlin 55 sub _set_info {
1499 dpavlin 48 my $self = shift;
1500    
1501     $self->{status} = -1;
1502     return unless ($self->{url});
1503    
1504     my $resbody;
1505     my $rv = $self->shuttle_url( $self->{url} . '/inform',
1506     'text/plain',
1507     undef,
1508     \$resbody,
1509     );
1510    
1511     return if ($rv != 200 || !$resbody);
1512    
1513 dpavlin 58 # it seems that response can have multiple line endings
1514     $resbody =~ s/[\r\n]+$//;
1515 dpavlin 48
1516     ( $self->{name}, $self->{label}, $self->{dnum}, $self->{wnum}, $self->{size} ) =
1517     split(/\t/, $resbody, 5);
1518    
1519     }
1520    
1521 dpavlin 2 ###
1522    
1523     =head1 EXPORT
1524    
1525     Nothing.
1526    
1527     =head1 SEE ALSO
1528    
1529     L<http://hyperestraier.sourceforge.net/>
1530    
1531     Hyper Estraier Ruby interface on which this module is based.
1532    
1533     =head1 AUTHOR
1534    
1535     Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
1536    
1537    
1538     =head1 COPYRIGHT AND LICENSE
1539    
1540 dpavlin 15 Copyright (C) 2005-2006 by Dobrica Pavlinusic
1541 dpavlin 2
1542     This library is free software; you can redistribute it and/or modify
1543     it under the GPL v2 or later.
1544    
1545     =cut
1546    
1547     1;

  ViewVC Help
Powered by ViewVC 1.1.26