/[Search-Estraier]/trunk/Estraier.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/Estraier.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 60 - (hide annotations)
Sat Jan 7 00:00:15 2006 UTC (18 years, 3 months ago) by dpavlin
File size: 27250 byte(s)
added few checks to better handle empty documents, array return is not enforced any more.
1 dpavlin 2 package Search::Estraier;
2    
3     use 5.008;
4     use strict;
5     use warnings;
6    
7 dpavlin 59 our $VERSION = '0.01';
8 dpavlin 2
9     =head1 NAME
10    
11     Search::Estraier - pure perl module to use Hyper Estraier search engine
12    
13     =head1 SYNOPSIS
14    
15     use Search::Estraier;
16     my $est = new Search::Estraier();
17    
18     =head1 DESCRIPTION
19    
20     This module is implementation of node API of Hyper Estraier. Since it's
21     perl-only module with dependencies only on standard perl modules, it will
22     run on all platforms on which perl runs. It doesn't require compilation
23     or Hyper Estraier development files on target machine.
24    
25     It is implemented as multiple packages which closly resamble Ruby
26     implementation. It also includes methods to manage nodes.
27    
28     =cut
29    
30 dpavlin 42 =head1 Inheritable common methods
31    
32     This methods should really move somewhere else.
33    
34 dpavlin 15 =head2 _s
35    
36     Remove multiple whitespaces from string, as well as whitespaces at beginning or end
37    
38     my $text = $self->_s(" this is a text ");
39     $text = 'this is a text';
40    
41     =cut
42    
43     sub _s {
44     my $text = $_[1] || return;
45     $text =~ s/\s\s+/ /gs;
46     $text =~ s/^\s+//;
47     $text =~ s/\s+$//;
48     return $text;
49     }
50    
51 dpavlin 2 package Search::Estraier::Document;
52    
53 dpavlin 9 use Carp qw/croak confess/;
54 dpavlin 7
55 dpavlin 15 use Search::Estraier;
56     our @ISA = qw/Search::Estraier/;
57    
58 dpavlin 2 =head1 Search::Estraier::Document
59    
60 dpavlin 14 This class implements Document which is collection of attributes
61     (key=value), vectors (also key value) display text and hidden text.
62    
63 dpavlin 42
64 dpavlin 2 =head2 new
65    
66 dpavlin 14 Create new document, empty or from draft.
67    
68 dpavlin 2 my $doc = new Search::HyperEstraier::Document;
69 dpavlin 14 my $doc2 = new Search::HyperEstraier::Document( $draft );
70 dpavlin 2
71     =cut
72    
73     sub new {
74     my $class = shift;
75 dpavlin 14 my $self = {};
76 dpavlin 2 bless($self, $class);
77    
78 dpavlin 6 $self->{id} = -1;
79    
80 dpavlin 14 my $draft = shift;
81    
82     if ($draft) {
83     my $in_text = 0;
84     foreach my $line (split(/\n/, $draft)) {
85    
86     if ($in_text) {
87     if ($line =~ /^\t/) {
88     push @{ $self->{htexts} }, substr($line, 1);
89     } else {
90     push @{ $self->{dtexts} }, $line;
91     }
92     next;
93     }
94    
95     if ($line =~ m/^%VECTOR\t(.+)$/) {
96     my @fields = split(/\t/, $1);
97     for my $i ( 0 .. ($#fields - 1) ) {
98     $self->{kwords}->{ $fields[ $i ] } = $fields[ $i + 1 ];
99     $i++;
100     }
101     next;
102     } elsif ($line =~ m/^%/) {
103     # What is this? comment?
104     #warn "$line\n";
105     next;
106     } elsif ($line =~ m/^$/) {
107     $in_text = 1;
108     next;
109     } elsif ($line =~ m/^(.+)=(.+)$/) {
110     $self->{attrs}->{ $1 } = $2;
111     next;
112     }
113    
114     warn "draft ignored: $line\n";
115     }
116     }
117    
118 dpavlin 2 $self ? return $self : return undef;
119     }
120    
121 dpavlin 4
122 dpavlin 2 =head2 add_attr
123    
124 dpavlin 6 Add an attribute.
125    
126 dpavlin 2 $doc->add_attr( name => 'value' );
127    
128 dpavlin 9 Delete attribute using
129 dpavlin 5
130     $doc->add_attr( name => undef );
131    
132 dpavlin 2 =cut
133    
134     sub add_attr {
135     my $self = shift;
136     my $attrs = {@_};
137    
138     while (my ($name, $value) = each %{ $attrs }) {
139 dpavlin 9 if (! defined($value)) {
140 dpavlin 15 delete( $self->{attrs}->{ $self->_s($name) } );
141 dpavlin 9 } else {
142 dpavlin 15 $self->{attrs}->{ $self->_s($name) } = $self->_s($value);
143 dpavlin 9 }
144 dpavlin 2 }
145 dpavlin 8
146     return 1;
147 dpavlin 2 }
148    
149 dpavlin 5
150     =head2 add_text
151    
152 dpavlin 6 Add a sentence of text.
153    
154 dpavlin 5 $doc->add_text('this is example text to display');
155    
156     =cut
157    
158     sub add_text {
159     my $self = shift;
160     my $text = shift;
161     return unless defined($text);
162    
163 dpavlin 15 push @{ $self->{dtexts} }, $self->_s($text);
164 dpavlin 5 }
165    
166    
167     =head2 add_hidden_text
168    
169 dpavlin 6 Add a hidden sentence.
170    
171 dpavlin 5 $doc->add_hidden_text('this is example text just for search');
172    
173     =cut
174    
175     sub add_hidden_text {
176     my $self = shift;
177     my $text = shift;
178     return unless defined($text);
179    
180 dpavlin 15 push @{ $self->{htexts} }, $self->_s($text);
181 dpavlin 5 }
182    
183 dpavlin 42
184 dpavlin 6 =head2 id
185    
186     Get the ID number of document. If the object has never been registred, C<-1> is returned.
187    
188     print $doc->id;
189    
190     =cut
191    
192     sub id {
193     my $self = shift;
194     return $self->{id};
195     }
196    
197 dpavlin 42
198 dpavlin 7 =head2 attr_names
199    
200 dpavlin 9 Returns array with attribute names from document object.
201 dpavlin 7
202     my @attrs = $doc->attr_names;
203    
204     =cut
205    
206     sub attr_names {
207     my $self = shift;
208 dpavlin 60 return unless ($self->{attrs});
209     #croak "attr_names return array, not scalar" if (! wantarray);
210 dpavlin 7 return sort keys %{ $self->{attrs} };
211     }
212    
213 dpavlin 8
214     =head2 attr
215    
216 dpavlin 9 Returns value of an attribute.
217 dpavlin 8
218     my $value = $doc->attr( 'attribute' );
219    
220     =cut
221    
222     sub attr {
223     my $self = shift;
224     my $name = shift;
225 dpavlin 60 return unless (defined($name) && $self->{attrs});
226     return $self->{attrs}->{ $name };
227 dpavlin 8 }
228    
229 dpavlin 9
230     =head2 texts
231    
232     Returns array with text sentences.
233    
234     my @texts = $doc->texts;
235    
236     =cut
237    
238     sub texts {
239     my $self = shift;
240 dpavlin 60 #confess "texts return array, not scalar" if (! wantarray);
241     return @{ $self->{dtexts} } if ($self->{dtexts});
242 dpavlin 9 }
243    
244 dpavlin 42
245 dpavlin 12 =head2 cat_texts
246    
247     Return whole text as single scalar.
248    
249     my $text = $doc->cat_texts;
250    
251     =cut
252    
253     sub cat_texts {
254     my $self = shift;
255 dpavlin 60 return join(' ',@{ $self->{dtexts} }) if ($self->{dtexts});
256 dpavlin 12 }
257    
258 dpavlin 42
259 dpavlin 5 =head2 dump_draft
260    
261 dpavlin 13 Dump draft data from document object.
262    
263 dpavlin 5 print $doc->dump_draft;
264    
265     =cut
266    
267     sub dump_draft {
268 dpavlin 13 my $self = shift;
269     my $draft;
270    
271     foreach my $attr_name (sort keys %{ $self->{attrs} }) {
272     $draft .= $attr_name . '=' . $self->{attrs}->{$attr_name} . "\n";
273     }
274    
275     if ($self->{kwords}) {
276     $draft .= '%%VECTOR';
277     while (my ($key, $value) = each %{ $self->{kwords} }) {
278     $draft .= "\t$key\t$value";
279     }
280     $draft .= "\n";
281     }
282    
283     $draft .= "\n";
284    
285 dpavlin 40 $draft .= join("\n", @{ $self->{dtexts} }) . "\n" if ($self->{dtexts});
286     $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n" if ($self->{htexts});
287 dpavlin 13
288     return $draft;
289 dpavlin 5 }
290    
291 dpavlin 42
292 dpavlin 4 =head2 delete
293 dpavlin 2
294 dpavlin 4 Empty document object
295 dpavlin 2
296 dpavlin 4 $doc->delete;
297    
298 dpavlin 15 This function is addition to original Ruby API, and since it was included in C wrappers it's here as a
299     convinience. Document objects which go out of scope will be destroyed
300     automatically.
301    
302 dpavlin 4 =cut
303    
304     sub delete {
305     my $self = shift;
306    
307 dpavlin 14 foreach my $data (qw/attrs dtexts stexts kwords/) {
308 dpavlin 5 delete($self->{$data});
309     }
310 dpavlin 4
311 dpavlin 10 $self->{id} = -1;
312    
313 dpavlin 4 return 1;
314     }
315    
316    
317    
318 dpavlin 15 package Search::Estraier::Condition;
319 dpavlin 4
320 dpavlin 16 use Carp qw/confess croak/;
321    
322 dpavlin 15 use Search::Estraier;
323     our @ISA = qw/Search::Estraier/;
324 dpavlin 4
325 dpavlin 16 =head1 Search::Estraier::Condition
326    
327     =head2 new
328    
329     my $cond = new Search::HyperEstraier::Condition;
330    
331     =cut
332    
333     sub new {
334     my $class = shift;
335     my $self = {};
336     bless($self, $class);
337    
338 dpavlin 19 $self->{max} = -1;
339     $self->{options} = 0;
340    
341 dpavlin 16 $self ? return $self : return undef;
342     }
343    
344 dpavlin 42
345 dpavlin 16 =head2 set_phrase
346    
347     $cond->set_phrase('search phrase');
348    
349     =cut
350    
351     sub set_phrase {
352     my $self = shift;
353     $self->{phrase} = $self->_s( shift );
354     }
355    
356 dpavlin 42
357 dpavlin 16 =head2 add_attr
358    
359     $cond->add_attr('@URI STRINC /~dpavlin/');
360    
361     =cut
362    
363     sub add_attr {
364     my $self = shift;
365     my $attr = shift || return;
366     push @{ $self->{attrs} }, $self->_s( $attr );
367     }
368    
369 dpavlin 42
370 dpavlin 16 =head2 set_order
371    
372     $cond->set_order('@mdate NUMD');
373    
374     =cut
375    
376     sub set_order {
377     my $self = shift;
378     $self->{order} = shift;
379     }
380    
381 dpavlin 42
382 dpavlin 16 =head2 set_max
383    
384     $cond->set_max(42);
385    
386     =cut
387    
388     sub set_max {
389     my $self = shift;
390     my $max = shift;
391 dpavlin 43 croak "set_max needs number, not '$max'" unless ($max =~ m/^\d+$/);
392 dpavlin 16 $self->{max} = $max;
393     }
394    
395 dpavlin 42
396 dpavlin 16 =head2 set_options
397    
398     $cond->set_options( SURE => 1 );
399    
400     =cut
401    
402 dpavlin 15 my $options = {
403     # check N-gram keys skipping by three
404     SURE => 1 << 0,
405     # check N-gram keys skipping by two
406     USUAL => 1 << 1,
407     # without TF-IDF tuning
408     FAST => 1 << 2,
409     # with the simplified phrase
410     AGITO => 1 << 3,
411     # check every N-gram key
412     NOIDF => 1 << 4,
413     # check N-gram keys skipping by one
414     SIMPLE => 1 << 10,
415     };
416    
417 dpavlin 16 sub set_options {
418     my $self = shift;
419     my $option = shift;
420     confess "unknown option" unless ($options->{$option});
421     $self->{options} ||= $options->{$option};
422 dpavlin 4 }
423    
424 dpavlin 42
425 dpavlin 18 =head2 phrase
426    
427     Return search phrase.
428    
429     print $cond->phrase;
430    
431     =cut
432    
433     sub phrase {
434     my $self = shift;
435     return $self->{phrase};
436     }
437    
438 dpavlin 42
439 dpavlin 19 =head2 order
440 dpavlin 18
441 dpavlin 19 Return search result order.
442    
443     print $cond->order;
444    
445     =cut
446    
447     sub order {
448     my $self = shift;
449     return $self->{order};
450     }
451    
452 dpavlin 42
453 dpavlin 19 =head2 attrs
454    
455     Return search result attrs.
456    
457     my @cond_attrs = $cond->attrs;
458    
459     =cut
460    
461     sub attrs {
462     my $self = shift;
463     #croak "attrs return array, not scalar" if (! wantarray);
464 dpavlin 60 return @{ $self->{attrs} } if ($self->{attrs});
465 dpavlin 19 }
466    
467 dpavlin 42
468 dpavlin 19 =head2 max
469    
470     Return maximum number of results.
471    
472     print $cond->max;
473    
474     C<-1> is returned for unitialized value, C<0> is unlimited.
475    
476     =cut
477    
478     sub max {
479     my $self = shift;
480     return $self->{max};
481     }
482    
483 dpavlin 42
484 dpavlin 19 =head2 options
485    
486     Return options for this condition.
487    
488     print $cond->options;
489    
490     Options are returned in numerical form.
491    
492     =cut
493    
494     sub options {
495     my $self = shift;
496     return $self->{options};
497     }
498    
499    
500 dpavlin 20 package Search::Estraier::ResultDocument;
501    
502 dpavlin 24 use Carp qw/croak/;
503 dpavlin 20
504 dpavlin 24 #use Search::Estraier;
505     #our @ISA = qw/Search::Estraier/;
506 dpavlin 20
507     =head1 Search::Estraier::ResultDocument
508    
509     =head2 new
510    
511 dpavlin 23 my $rdoc = new Search::HyperEstraier::ResultDocument(
512 dpavlin 20 uri => 'http://localhost/document/uri/42',
513     attrs => {
514     foo => 1,
515     bar => 2,
516     },
517     snippet => 'this is a text of snippet'
518     keywords => 'this\tare\tkeywords'
519     );
520    
521     =cut
522    
523     sub new {
524     my $class = shift;
525     my $self = {@_};
526     bless($self, $class);
527    
528     foreach my $f (qw/uri attrs snippet keywords/) {
529     croak "missing $f for ResultDocument" unless defined($self->{$f});
530     }
531    
532     $self ? return $self : return undef;
533     }
534    
535 dpavlin 42
536 dpavlin 23 =head2 uri
537 dpavlin 20
538 dpavlin 23 Return URI of result document
539 dpavlin 20
540 dpavlin 23 print $rdoc->uri;
541    
542     =cut
543    
544     sub uri {
545     my $self = shift;
546     return $self->{uri};
547     }
548    
549    
550     =head2 attr_names
551    
552     Returns array with attribute names from result document object.
553    
554     my @attrs = $rdoc->attr_names;
555    
556     =cut
557    
558     sub attr_names {
559     my $self = shift;
560     croak "attr_names return array, not scalar" if (! wantarray);
561     return sort keys %{ $self->{attrs} };
562     }
563    
564 dpavlin 42
565 dpavlin 23 =head2 attr
566    
567     Returns value of an attribute.
568    
569     my $value = $rdoc->attr( 'attribute' );
570    
571     =cut
572    
573     sub attr {
574     my $self = shift;
575     my $name = shift || return;
576     return $self->{attrs}->{ $name };
577     }
578    
579 dpavlin 42
580 dpavlin 23 =head2 snippet
581    
582     Return snippet from result document
583    
584     print $rdoc->snippet;
585    
586     =cut
587    
588     sub snippet {
589     my $self = shift;
590     return $self->{snippet};
591     }
592    
593 dpavlin 42
594 dpavlin 23 =head2 keywords
595    
596     Return keywords from result document
597    
598     print $rdoc->keywords;
599    
600     =cut
601    
602     sub keywords {
603     my $self = shift;
604     return $self->{keywords};
605     }
606    
607    
608 dpavlin 25 package Search::Estraier::NodeResult;
609    
610     use Carp qw/croak/;
611    
612     #use Search::Estraier;
613     #our @ISA = qw/Search::Estraier/;
614    
615     =head1 Search::Estraier::NodeResult
616    
617     =head2 new
618    
619     my $res = new Search::HyperEstraier::NodeResult(
620     docs => @array_of_rdocs,
621     hits => %hash_with_hints,
622     );
623    
624     =cut
625    
626     sub new {
627     my $class = shift;
628     my $self = {@_};
629     bless($self, $class);
630    
631     foreach my $f (qw/docs hints/) {
632     croak "missing $f for ResultDocument" unless defined($self->{$f});
633     }
634    
635     $self ? return $self : return undef;
636     }
637    
638 dpavlin 42
639 dpavlin 25 =head2 doc_num
640    
641     Return number of documents
642    
643     print $res->doc_num;
644    
645     =cut
646    
647     sub doc_num {
648     my $self = shift;
649 dpavlin 53 return $#{$self->{docs}} + 1;
650 dpavlin 25 }
651    
652 dpavlin 42
653 dpavlin 25 =head2 get_doc
654    
655     Return single document
656    
657     my $doc = $res->get_doc( 42 );
658    
659     Returns undef if document doesn't exist.
660    
661     =cut
662    
663     sub get_doc {
664     my $self = shift;
665     my $num = shift;
666 dpavlin 43 croak "expect number as argument, not '$num'" unless ($num =~ m/^\d+$/);
667 dpavlin 25 return undef if ($num < 0 || $num > $self->{docs});
668     return $self->{docs}->[$num];
669     }
670    
671 dpavlin 42
672 dpavlin 25 =head2 hint
673    
674     Return specific hint from results.
675    
676     print $rec->hint( 'VERSION' );
677    
678     Possible hints are: C<VERSION>, C<NODE>, C<HIT>, C<HINT#n>, C<DOCNUM>, C<WORDNUM>,
679     C<TIME>, C<LINK#n>, C<VIEW>.
680    
681     =cut
682    
683     sub hint {
684     my $self = shift;
685     my $key = shift || return;
686     return $self->{hints}->{$key};
687     }
688    
689    
690 dpavlin 27 package Search::Estraier::Node;
691    
692 dpavlin 44 use Carp qw/carp croak confess/;
693 dpavlin 33 use URI;
694 dpavlin 36 use MIME::Base64;
695 dpavlin 33 use IO::Socket::INET;
696 dpavlin 49 use URI::Escape qw/uri_escape/;
697 dpavlin 29
698 dpavlin 27 =head1 Search::Estraier::Node
699    
700     =head2 new
701    
702     my $node = new Search::HyperEstraier::Node;
703    
704     =cut
705    
706     sub new {
707     my $class = shift;
708     my $self = {
709     pxport => -1,
710 dpavlin 33 timeout => 0, # this used to be -1
711 dpavlin 27 dnum => -1,
712     wnum => -1,
713     size => -1.0,
714     wwidth => 480,
715     hwidth => 96,
716     awidth => 96,
717     status => -1,
718     };
719     bless($self, $class);
720    
721 dpavlin 57 my $args = {@_};
722 dpavlin 39
723 dpavlin 57 $self->{debug} = $args->{debug};
724     warn "## Node debug on\n" if ($self->{debug});
725    
726 dpavlin 27 $self ? return $self : return undef;
727     }
728    
729 dpavlin 42
730 dpavlin 29 =head2 set_url
731    
732     Specify URL to node server
733    
734     $node->set_url('http://localhost:1978');
735    
736     =cut
737    
738     sub set_url {
739     my $self = shift;
740     $self->{url} = shift;
741     }
742    
743 dpavlin 42
744 dpavlin 29 =head2 set_proxy
745    
746     Specify proxy server to connect to node server
747    
748     $node->set_proxy('proxy.example.com', 8080);
749    
750     =cut
751    
752     sub set_proxy {
753     my $self = shift;
754     my ($host,$port) = @_;
755 dpavlin 43 croak "proxy port must be number, not '$port'" unless ($port =~ m/^\d+$/);
756 dpavlin 29 $self->{pxhost} = $host;
757     $self->{pxport} = $port;
758     }
759    
760 dpavlin 42
761 dpavlin 30 =head2 set_timeout
762    
763     Specify timeout of connection in seconds
764    
765     $node->set_timeout( 15 );
766    
767     =cut
768    
769     sub set_timeout {
770     my $self = shift;
771     my $sec = shift;
772 dpavlin 43 croak "timeout must be number, not '$sec'" unless ($sec =~ m/^\d+$/);
773 dpavlin 30 $self->{timeout} = $sec;
774     }
775    
776 dpavlin 42
777 dpavlin 31 =head2 set_auth
778    
779     Specify name and password for authentication to node server.
780    
781     $node->set_auth('clint','eastwood');
782    
783     =cut
784    
785     sub set_auth {
786     my $self = shift;
787     my ($login,$passwd) = @_;
788 dpavlin 40 my $basic_auth = encode_base64( "$login:$passwd" );
789     chomp($basic_auth);
790     $self->{auth} = $basic_auth;
791 dpavlin 31 }
792    
793 dpavlin 42
794 dpavlin 32 =head2 status
795    
796     Return status code of last request.
797    
798 dpavlin 40 print $node->status;
799 dpavlin 32
800     C<-1> means connection failure.
801    
802     =cut
803    
804     sub status {
805     my $self = shift;
806     return $self->{status};
807     }
808    
809 dpavlin 42
810 dpavlin 40 =head2 put_doc
811    
812 dpavlin 41 Add a document
813 dpavlin 40
814 dpavlin 41 $node->put_doc( $document_draft ) or die "can't add document";
815    
816     Return true on success or false on failture.
817    
818 dpavlin 40 =cut
819    
820     sub put_doc {
821     my $self = shift;
822     my $doc = shift || return;
823 dpavlin 47 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
824 dpavlin 41 $self->shuttle_url( $self->{url} . '/put_doc',
825     'text/x-estraier-draft',
826     $doc->dump_draft,
827     undef
828     ) == 200;
829 dpavlin 40 }
830    
831 dpavlin 41
832     =head2 out_doc
833    
834     Remove a document
835    
836     $node->out_doc( document_id ) or "can't remove document";
837    
838     Return true on success or false on failture.
839    
840     =cut
841    
842     sub out_doc {
843     my $self = shift;
844     my $id = shift || return;
845     return unless ($self->{url});
846 dpavlin 43 croak "id must be number, not '$id'" unless ($id =~ m/^\d+$/);
847 dpavlin 41 $self->shuttle_url( $self->{url} . '/out_doc',
848     'application/x-www-form-urlencoded',
849     "id=$id",
850     undef
851     ) == 200;
852     }
853    
854    
855     =head2 out_doc_by_uri
856    
857     Remove a registrated document using it's uri
858    
859 dpavlin 45 $node->out_doc_by_uri( 'file:///document/uri/42' ) or "can't remove document";
860 dpavlin 41
861     Return true on success or false on failture.
862    
863     =cut
864    
865     sub out_doc_by_uri {
866     my $self = shift;
867     my $uri = shift || return;
868     return unless ($self->{url});
869     $self->shuttle_url( $self->{url} . '/out_doc',
870     'application/x-www-form-urlencoded',
871 dpavlin 50 "uri=" . uri_escape($uri),
872 dpavlin 41 undef
873     ) == 200;
874     }
875    
876 dpavlin 42
877     =head2 edit_doc
878    
879     Edit attributes of a document
880    
881     $node->edit_doc( $document_draft ) or die "can't edit document";
882    
883     Return true on success or false on failture.
884    
885     =cut
886    
887     sub edit_doc {
888     my $self = shift;
889     my $doc = shift || return;
890 dpavlin 47 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
891 dpavlin 42 $self->shuttle_url( $self->{url} . '/edit_doc',
892     'text/x-estraier-draft',
893     $doc->dump_draft,
894     undef
895     ) == 200;
896     }
897    
898    
899 dpavlin 43 =head2 get_doc
900    
901     Retreive document
902    
903     my $doc = $node->get_doc( document_id ) or die "can't get document";
904    
905     Return true on success or false on failture.
906    
907     =cut
908    
909     sub get_doc {
910     my $self = shift;
911     my $id = shift || return;
912     return $self->_fetch_doc( id => $id );
913     }
914    
915 dpavlin 44
916 dpavlin 43 =head2 get_doc_by_uri
917    
918     Retreive document
919    
920 dpavlin 45 my $doc = $node->get_doc_by_uri( 'file:///document/uri/42' ) or die "can't get document";
921 dpavlin 43
922     Return true on success or false on failture.
923    
924     =cut
925    
926     sub get_doc_by_uri {
927     my $self = shift;
928     my $uri = shift || return;
929     return $self->_fetch_doc( uri => $uri );
930     }
931    
932 dpavlin 44
933 dpavlin 49 =head2 get_doc_attr
934    
935     Retrieve the value of an atribute from object
936    
937     my $val = $node->get_doc_attr( document_id, 'attribute_name' ) or
938     die "can't get document attribute";
939    
940     =cut
941    
942     sub get_doc_attr {
943     my $self = shift;
944     my ($id,$name) = @_;
945     return unless ($id && $name);
946     return $self->_fetch_doc( id => $id, attr => $name );
947     }
948    
949    
950     =head2 get_doc_attr_by_uri
951    
952     Retrieve the value of an atribute from object
953    
954     my $val = $node->get_doc_attr_by_uri( document_id, 'attribute_name' ) or
955     die "can't get document attribute";
956    
957     =cut
958    
959     sub get_doc_attr_by_uri {
960     my $self = shift;
961     my ($uri,$name) = @_;
962     return unless ($uri && $name);
963     return $self->_fetch_doc( uri => $uri, attr => $name );
964     }
965    
966    
967 dpavlin 44 =head2 etch_doc
968    
969     Exctract document keywords
970    
971     my $keywords = $node->etch_doc( document_id ) or die "can't etch document";
972    
973     =cut
974    
975 dpavlin 49 sub etch_doc {
976 dpavlin 44 my $self = shift;
977     my $id = shift || return;
978     return $self->_fetch_doc( id => $id, etch => 1 );
979     }
980    
981     =head2 etch_doc_by_uri
982    
983     Retreive document
984    
985 dpavlin 45 my $keywords = $node->etch_doc_by_uri( 'file:///document/uri/42' ) or die "can't etch document";
986 dpavlin 44
987     Return true on success or false on failture.
988    
989     =cut
990    
991     sub etch_doc_by_uri {
992     my $self = shift;
993     my $uri = shift || return;
994     return $self->_fetch_doc( uri => $uri, etch => 1 );
995     }
996    
997    
998 dpavlin 45 =head2 uri_to_id
999    
1000     Get ID of document specified by URI
1001    
1002     my $id = $node->uri_to_id( 'file:///document/uri/42' );
1003    
1004     =cut
1005    
1006     sub uri_to_id {
1007     my $self = shift;
1008     my $uri = shift || return;
1009     return $self->_fetch_doc( uri => $uri, path => '/uri_to_id', chomp_resbody => 1 );
1010     }
1011    
1012    
1013 dpavlin 43 =head2 _fetch_doc
1014    
1015 dpavlin 44 Private function used for implementing of C<get_doc>, C<get_doc_by_uri>,
1016     C<etch_doc>, C<etch_doc_by_uri>.
1017 dpavlin 43
1018 dpavlin 45 # this will decode received draft into Search::Estraier::Document object
1019     my $doc = $node->_fetch_doc( id => 42 );
1020     my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42' );
1021 dpavlin 43
1022 dpavlin 45 # to extract keywords, add etch
1023     my $doc = $node->_fetch_doc( id => 42, etch => 1 );
1024     my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', etch => 1 );
1025    
1026 dpavlin 49 # to get document attrubute add attr
1027     my $doc = $node->_fetch_doc( id => 42, attr => '@mdate' );
1028     my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', attr => '@mdate' );
1029    
1030 dpavlin 45 # more general form which allows implementation of
1031     # uri_to_id
1032     my $id = $node->_fetch_doc(
1033     uri => 'file:///document/uri/42',
1034     path => '/uri_to_id',
1035     chomp_resbody => 1
1036     );
1037    
1038 dpavlin 43 =cut
1039    
1040     sub _fetch_doc {
1041     my $self = shift;
1042 dpavlin 44 my $a = {@_};
1043     return unless ( ($a->{id} || $a->{uri}) && $self->{url} );
1044    
1045     my ($arg, $resbody);
1046    
1047 dpavlin 45 my $path = $a->{path} || '/get_doc';
1048 dpavlin 44 $path = '/etch_doc' if ($a->{etch});
1049    
1050     if ($a->{id}) {
1051     croak "id must be numberm not '$a->{id}'" unless ($a->{id} =~ m/^\d+$/);
1052     $arg = 'id=' . $a->{id};
1053     } elsif ($a->{uri}) {
1054 dpavlin 50 $arg = 'uri=' . uri_escape($a->{uri});
1055 dpavlin 44 } else {
1056     confess "unhandled argument. Need id or uri.";
1057 dpavlin 43 }
1058 dpavlin 44
1059 dpavlin 49 if ($a->{attr}) {
1060     $path = '/get_doc_attr';
1061     $arg .= '&attr=' . uri_escape($a->{attr});
1062     $a->{chomp_resbody} = 1;
1063     }
1064    
1065 dpavlin 44 my $rv = $self->shuttle_url( $self->{url} . $path,
1066 dpavlin 43 'application/x-www-form-urlencoded',
1067 dpavlin 44 $arg,
1068 dpavlin 45 \$resbody,
1069 dpavlin 43 );
1070 dpavlin 44
1071 dpavlin 43 return if ($rv != 200);
1072 dpavlin 44
1073     if ($a->{etch}) {
1074     $self->{kwords} = {};
1075     return +{} unless ($resbody);
1076     foreach my $l (split(/\n/, $resbody)) {
1077     my ($k,$v) = split(/\t/, $l, 2);
1078     $self->{kwords}->{$k} = $v if ($v);
1079     }
1080     return $self->{kwords};
1081 dpavlin 45 } elsif ($a->{chomp_resbody}) {
1082     return unless (defined($resbody));
1083     chomp($resbody);
1084     return $resbody;
1085 dpavlin 44 } else {
1086     return new Search::Estraier::Document($resbody);
1087     }
1088 dpavlin 43 }
1089    
1090    
1091 dpavlin 48 =head2 name
1092 dpavlin 43
1093 dpavlin 48 my $node_name = $node->name;
1094 dpavlin 43
1095 dpavlin 48 =cut
1096    
1097     sub name {
1098     my $self = shift;
1099 dpavlin 55 $self->_set_info unless ($self->{name});
1100 dpavlin 48 return $self->{name};
1101     }
1102    
1103    
1104     =head2 label
1105    
1106     my $node_label = $node->label;
1107    
1108     =cut
1109    
1110     sub label {
1111     my $self = shift;
1112 dpavlin 55 $self->_set_info unless ($self->{label});
1113 dpavlin 48 return $self->{label};
1114     }
1115    
1116    
1117     =head2 doc_num
1118    
1119     my $documents_in_node = $node->doc_num;
1120    
1121     =cut
1122    
1123     sub doc_num {
1124     my $self = shift;
1125 dpavlin 55 $self->_set_info if ($self->{dnum} < 0);
1126 dpavlin 48 return $self->{dnum};
1127     }
1128    
1129    
1130     =head2 word_num
1131    
1132     my $words_in_node = $node->word_num;
1133    
1134     =cut
1135    
1136     sub word_num {
1137     my $self = shift;
1138 dpavlin 55 $self->_set_info if ($self->{wnum} < 0);
1139 dpavlin 48 return $self->{wnum};
1140     }
1141    
1142    
1143     =head2 size
1144    
1145     my $node_size = $node->size;
1146    
1147     =cut
1148    
1149     sub size {
1150     my $self = shift;
1151 dpavlin 55 $self->_set_info if ($self->{size} < 0);
1152 dpavlin 48 return $self->{size};
1153     }
1154    
1155    
1156 dpavlin 51 =head2 search
1157 dpavlin 48
1158 dpavlin 51 Search documents which match condition
1159    
1160     my $nres = $node->search( $cond, $depth );
1161    
1162     C<$cond> is C<Search::Estraier::Condition> object, while <$depth> specifies
1163     depth for meta search.
1164    
1165     Function results C<Search::Estraier::NodeResult> object.
1166    
1167     =cut
1168    
1169     sub search {
1170     my $self = shift;
1171     my ($cond, $depth) = @_;
1172     return unless ($cond && defined($depth) && $self->{url});
1173     croak "cond mush be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1174     croak "depth needs number, not '$depth'" unless ($depth =~ m/^\d+$/);
1175    
1176 dpavlin 52 my $resbody;
1177 dpavlin 51
1178 dpavlin 52 my $rv = $self->shuttle_url( $self->{url} . '/search',
1179 dpavlin 53 'application/x-www-form-urlencoded',
1180 dpavlin 52 $self->cond_to_query( $cond ),
1181     \$resbody,
1182     );
1183     return if ($rv != 200);
1184    
1185     my (@docs, $hints);
1186    
1187     my @lines = split(/\n/, $resbody);
1188     return unless (@lines);
1189    
1190     my $border = $lines[0];
1191     my $isend = 0;
1192     my $lnum = 1;
1193    
1194     while ( $lnum <= $#lines ) {
1195     my $line = $lines[$lnum];
1196     $lnum++;
1197    
1198     #warn "## $line\n";
1199     if ($line && $line =~ m/^\Q$border\E(:END)*$/) {
1200     $isend = $1;
1201     last;
1202     }
1203    
1204     if ($line =~ /\t/) {
1205     my ($k,$v) = split(/\t/, $line, 2);
1206     $hints->{$k} = $v;
1207     }
1208     }
1209    
1210     my $snum = $lnum;
1211    
1212     while( ! $isend && $lnum <= $#lines ) {
1213     my $line = $lines[$lnum];
1214 dpavlin 53 #warn "# $lnum: $line\n";
1215 dpavlin 52 $lnum++;
1216    
1217     if ($line && $line =~ m/^\Q$border\E/) {
1218     if ($lnum > $snum) {
1219     my $rdattrs;
1220     my $rdvector;
1221     my $rdsnippet;
1222    
1223     my $rlnum = $snum;
1224     while ($rlnum < $lnum - 1 ) {
1225     #my $rdline = $self->_s($lines[$rlnum]);
1226     my $rdline = $lines[$rlnum];
1227     $rlnum++;
1228     last unless ($rdline);
1229     if ($rdline =~ /^%/) {
1230     $rdvector = $1 if ($rdline =~ /^%VECTOR\t(.+)$/);
1231 dpavlin 53 } elsif($rdline =~ /=/) {
1232     $rdattrs->{$1} = $2 if ($rdline =~ /^(.+)=(.+)$/);
1233 dpavlin 52 } else {
1234 dpavlin 53 confess "invalid format of response";
1235 dpavlin 52 }
1236     }
1237     while($rlnum < $lnum - 1) {
1238     my $rdline = $lines[$rlnum];
1239     $rlnum++;
1240     $rdsnippet .= "$rdline\n";
1241     }
1242 dpavlin 53 #warn Dumper($rdvector, $rdattrs, $rdsnippet);
1243 dpavlin 52 if (my $rduri = $rdattrs->{'@uri'}) {
1244     push @docs, new Search::Estraier::ResultDocument(
1245     uri => $rduri,
1246     attrs => $rdattrs,
1247     snippet => $rdsnippet,
1248     keywords => $rdvector,
1249     );
1250     }
1251     }
1252     $snum = $lnum;
1253     #warn "### $line\n";
1254     $isend = 1 if ($line =~ /:END$/);
1255     }
1256    
1257     }
1258    
1259     if (! $isend) {
1260     warn "received result doesn't have :END\n$resbody";
1261     return;
1262     }
1263    
1264 dpavlin 53 #warn Dumper(\@docs, $hints);
1265    
1266 dpavlin 52 return new Search::Estraier::NodeResult( docs => \@docs, hints => $hints );
1267 dpavlin 51 }
1268    
1269    
1270     =head2 cond_to_query
1271    
1272 dpavlin 55 Return URI encoded string generated from Search::Estraier::Condition
1273    
1274 dpavlin 51 my $args = $node->cond_to_query( $cond );
1275    
1276     =cut
1277    
1278     sub cond_to_query {
1279     my $self = shift;
1280    
1281     my $cond = shift || return;
1282     croak "condition must be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1283    
1284     my @args;
1285    
1286     if (my $phrase = $cond->phrase) {
1287     push @args, 'phrase=' . uri_escape($phrase);
1288     }
1289    
1290     if (my @attrs = $cond->attrs) {
1291     for my $i ( 0 .. $#attrs ) {
1292     push @args,'attr' . ($i+1) . '=' . uri_escape( $attrs[$i] );
1293     }
1294     }
1295    
1296     if (my $order = $cond->order) {
1297     push @args, 'order=' . uri_escape($order);
1298     }
1299    
1300     if (my $max = $cond->max) {
1301     push @args, 'max=' . $max;
1302     } else {
1303     push @args, 'max=' . (1 << 30);
1304     }
1305    
1306     if (my $options = $cond->options) {
1307     push @args, 'options=' . $options;
1308     }
1309    
1310     push @args, 'depth=' . $self->{depth} if ($self->{depth});
1311     push @args, 'wwidth=' . $self->{wwidth};
1312     push @args, 'hwidth=' . $self->{hwidth};
1313     push @args, 'awidth=' . $self->{awidth};
1314    
1315     return join('&', @args);
1316     }
1317    
1318    
1319 dpavlin 33 =head2 shuttle_url
1320 dpavlin 32
1321 dpavlin 33 This is method which uses C<IO::Socket::INET> to communicate with Hyper Estraier node
1322     master.
1323 dpavlin 2
1324 dpavlin 52 my $rv = shuttle_url( $url, $content_type, $req_body, \$resbody );
1325 dpavlin 2
1326 dpavlin 33 C<$resheads> and C<$resbody> booleans controll if response headers and/or response
1327     body will be saved within object.
1328 dpavlin 2
1329     =cut
1330    
1331 dpavlin 59 use LWP::UserAgent;
1332    
1333 dpavlin 33 sub shuttle_url {
1334     my $self = shift;
1335 dpavlin 2
1336 dpavlin 33 my ($url, $content_type, $reqbody, $resbody) = @_;
1337 dpavlin 2
1338 dpavlin 40 $self->{status} = -1;
1339 dpavlin 33
1340 dpavlin 41 warn "## $url\n" if ($self->{debug});
1341 dpavlin 36
1342 dpavlin 33 $url = new URI($url);
1343 dpavlin 37 if (
1344     !$url || !$url->scheme || !$url->scheme eq 'http' ||
1345     !$url->host || !$url->port || $url->port < 1
1346     ) {
1347     carp "can't parse $url\n";
1348     return -1;
1349     }
1350 dpavlin 33
1351 dpavlin 59 my $ua = LWP::UserAgent->new;
1352     $ua->agent( "Search-Estraier/$Search::Estraier::VERSION" );
1353 dpavlin 33
1354 dpavlin 59 my $req;
1355 dpavlin 37 if ($reqbody) {
1356 dpavlin 59 $req = HTTP::Request->new(POST => $url);
1357 dpavlin 37 } else {
1358 dpavlin 59 $req = HTTP::Request->new(GET => $url);
1359 dpavlin 37 }
1360    
1361 dpavlin 59 $req->headers->header( 'Host' => $url->host . ":" . $url->port );
1362     $req->headers->header( 'Connection', 'close' );
1363     $req->headers->header( 'Authorization', 'Basic ' . $self->{auth} );
1364     $req->content_type( $content_type );
1365 dpavlin 37
1366 dpavlin 59 warn $req->headers->as_string,"\n" if ($self->{debug});
1367 dpavlin 2
1368 dpavlin 37 if ($reqbody) {
1369 dpavlin 41 warn "$reqbody\n" if ($self->{debug});
1370 dpavlin 59 $req->content( $reqbody );
1371 dpavlin 33 }
1372 dpavlin 2
1373 dpavlin 59 my $res = $ua->request($req) || croak "can't make request to $url: $!";
1374 dpavlin 2
1375 dpavlin 59 warn "## response status: ",$res->status_line,"\n" if ($self->{debug});
1376 dpavlin 2
1377 dpavlin 59 return -1 if (! $res->is_success);
1378 dpavlin 2
1379 dpavlin 59 ($self->{status}, $self->{status_message}) = split(/\s+/, $res->status_line, 2);
1380 dpavlin 33
1381 dpavlin 59 $$resbody .= $res->content;
1382    
1383 dpavlin 40 warn "## response body:\n$$resbody\n" if ($resbody && $self->{debug});
1384 dpavlin 39
1385 dpavlin 40 return $self->{status};
1386 dpavlin 2 }
1387    
1388 dpavlin 48
1389 dpavlin 55 =head2 set_snippet_width
1390 dpavlin 48
1391 dpavlin 55 Set width of snippets in results
1392    
1393     $node->set_snippet_width( $wwidth, $hwidth, $awidth );
1394    
1395     C<$wwidth> specifies whole width of snippet. It's C<480> by default. If it's C<0> snippet
1396     is not sent with results. If it is negative, whole document text is sent instead of snippet.
1397    
1398     C<$hwidth> specified width of strings from beginning of string. Default
1399     value is C<96>. Negative or zero value keep previous value.
1400    
1401     C<$awidth> specifies width of strings around each highlighted word. It's C<96> by default.
1402     If negative of zero value is provided previous value is kept unchanged.
1403    
1404     =cut
1405    
1406     sub set_snippet_width {
1407     my $self = shift;
1408    
1409     my ($wwidth, $hwidth, $awidth) = @_;
1410     $self->{wwidth} = $wwidth;
1411     $self->{hwidth} = $hwidth if ($hwidth >= 0);
1412     $self->{awidth} = $awidth if ($awidth >= 0);
1413     }
1414    
1415    
1416 dpavlin 56 =head2 set_user
1417 dpavlin 55
1418 dpavlin 56 Manage users of node
1419    
1420     $node->set_user( 'name', $mode );
1421    
1422     C<$mode> can be one of:
1423    
1424     =over 4
1425    
1426     =item 0
1427    
1428     delete account
1429    
1430     =item 1
1431    
1432     set administrative right for user
1433    
1434     =item 2
1435    
1436     set user account as guest
1437    
1438     =back
1439    
1440     Return true on success, otherwise false.
1441    
1442     =cut
1443    
1444     sub set_user {
1445     my $self = shift;
1446     my ($name, $mode) = @_;
1447    
1448     return unless ($self->{url});
1449     croak "mode must be number, not '$mode'" unless ($mode =~ m/^\d+$/);
1450    
1451     $self->shuttle_url( $self->{url} . '/_set_user',
1452     'text/plain',
1453     'name=' . uri_escape($name) . '&mode=' . $mode,
1454     undef
1455     ) == 200;
1456     }
1457    
1458    
1459 dpavlin 57 =head2 set_link
1460    
1461     Manage node links
1462    
1463     $node->set_link('http://localhost:1978/node/another', 'another node label', $credit);
1464    
1465     If C<$credit> is negative, link is removed.
1466    
1467     =cut
1468    
1469     sub set_link {
1470     my $self = shift;
1471     my ($url, $label, $credit) = @_;
1472    
1473     return unless ($self->{url});
1474     croak "mode credit be number, not '$credit'" unless ($credit =~ m/^\d+$/);
1475    
1476     my $reqbody = 'url=' . uri_escape($url) . '&label=' . uri_escape($label);
1477     $reqbody .= '&credit=' . $credit if ($credit > 0);
1478    
1479     $self->shuttle_url( $self->{url} . '/_set_link',
1480     'text/plain',
1481     $reqbody,
1482     undef
1483     ) == 200;
1484     }
1485    
1486    
1487 dpavlin 55 =head1 PRIVATE METHODS
1488    
1489     You could call those directly, but you don't have to. I hope.
1490    
1491     =head2 _set_info
1492    
1493 dpavlin 48 Set information for node
1494    
1495 dpavlin 55 $node->_set_info;
1496 dpavlin 48
1497     =cut
1498    
1499 dpavlin 55 sub _set_info {
1500 dpavlin 48 my $self = shift;
1501    
1502     $self->{status} = -1;
1503     return unless ($self->{url});
1504    
1505     my $resbody;
1506     my $rv = $self->shuttle_url( $self->{url} . '/inform',
1507     'text/plain',
1508     undef,
1509     \$resbody,
1510     );
1511    
1512     return if ($rv != 200 || !$resbody);
1513    
1514 dpavlin 58 # it seems that response can have multiple line endings
1515     $resbody =~ s/[\r\n]+$//;
1516 dpavlin 48
1517     ( $self->{name}, $self->{label}, $self->{dnum}, $self->{wnum}, $self->{size} ) =
1518     split(/\t/, $resbody, 5);
1519    
1520     }
1521    
1522 dpavlin 2 ###
1523    
1524     =head1 EXPORT
1525    
1526     Nothing.
1527    
1528     =head1 SEE ALSO
1529    
1530     L<http://hyperestraier.sourceforge.net/>
1531    
1532     Hyper Estraier Ruby interface on which this module is based.
1533    
1534     =head1 AUTHOR
1535    
1536     Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
1537    
1538    
1539     =head1 COPYRIGHT AND LICENSE
1540    
1541 dpavlin 15 Copyright (C) 2005-2006 by Dobrica Pavlinusic
1542 dpavlin 2
1543     This library is free software; you can redistribute it and/or modify
1544     it under the GPL v2 or later.
1545    
1546     =cut
1547    
1548     1;

  ViewVC Help
Powered by ViewVC 1.1.26