/[Search-Estraier]/trunk/lib/Search/Estraier.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/Search/Estraier.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 61 - (hide annotations)
Sat Jan 7 01:21:28 2006 UTC (18 years, 3 months ago) by dpavlin
Original Path: trunk/Estraier.pm
File size: 27270 byte(s)
transfer depth to cond_to_query
1 dpavlin 2 package Search::Estraier;
2    
3     use 5.008;
4     use strict;
5     use warnings;
6    
7 dpavlin 59 our $VERSION = '0.01';
8 dpavlin 2
9     =head1 NAME
10    
11     Search::Estraier - pure perl module to use Hyper Estraier search engine
12    
13     =head1 SYNOPSIS
14    
15     use Search::Estraier;
16     my $est = new Search::Estraier();
17    
18     =head1 DESCRIPTION
19    
20     This module is implementation of node API of Hyper Estraier. Since it's
21     perl-only module with dependencies only on standard perl modules, it will
22     run on all platforms on which perl runs. It doesn't require compilation
23     or Hyper Estraier development files on target machine.
24    
25     It is implemented as multiple packages which closly resamble Ruby
26     implementation. It also includes methods to manage nodes.
27    
28     =cut
29    
30 dpavlin 42 =head1 Inheritable common methods
31    
32     This methods should really move somewhere else.
33    
34 dpavlin 15 =head2 _s
35    
36     Remove multiple whitespaces from string, as well as whitespaces at beginning or end
37    
38     my $text = $self->_s(" this is a text ");
39     $text = 'this is a text';
40    
41     =cut
42    
43     sub _s {
44     my $text = $_[1] || return;
45     $text =~ s/\s\s+/ /gs;
46     $text =~ s/^\s+//;
47     $text =~ s/\s+$//;
48     return $text;
49     }
50    
51 dpavlin 2 package Search::Estraier::Document;
52    
53 dpavlin 9 use Carp qw/croak confess/;
54 dpavlin 7
55 dpavlin 15 use Search::Estraier;
56     our @ISA = qw/Search::Estraier/;
57    
58 dpavlin 2 =head1 Search::Estraier::Document
59    
60 dpavlin 14 This class implements Document which is collection of attributes
61     (key=value), vectors (also key value) display text and hidden text.
62    
63 dpavlin 42
64 dpavlin 2 =head2 new
65    
66 dpavlin 14 Create new document, empty or from draft.
67    
68 dpavlin 2 my $doc = new Search::HyperEstraier::Document;
69 dpavlin 14 my $doc2 = new Search::HyperEstraier::Document( $draft );
70 dpavlin 2
71     =cut
72    
73     sub new {
74     my $class = shift;
75 dpavlin 14 my $self = {};
76 dpavlin 2 bless($self, $class);
77    
78 dpavlin 6 $self->{id} = -1;
79    
80 dpavlin 14 my $draft = shift;
81    
82     if ($draft) {
83     my $in_text = 0;
84     foreach my $line (split(/\n/, $draft)) {
85    
86     if ($in_text) {
87     if ($line =~ /^\t/) {
88     push @{ $self->{htexts} }, substr($line, 1);
89     } else {
90     push @{ $self->{dtexts} }, $line;
91     }
92     next;
93     }
94    
95     if ($line =~ m/^%VECTOR\t(.+)$/) {
96     my @fields = split(/\t/, $1);
97     for my $i ( 0 .. ($#fields - 1) ) {
98     $self->{kwords}->{ $fields[ $i ] } = $fields[ $i + 1 ];
99     $i++;
100     }
101     next;
102     } elsif ($line =~ m/^%/) {
103     # What is this? comment?
104     #warn "$line\n";
105     next;
106     } elsif ($line =~ m/^$/) {
107     $in_text = 1;
108     next;
109     } elsif ($line =~ m/^(.+)=(.+)$/) {
110     $self->{attrs}->{ $1 } = $2;
111     next;
112     }
113    
114     warn "draft ignored: $line\n";
115     }
116     }
117    
118 dpavlin 2 $self ? return $self : return undef;
119     }
120    
121 dpavlin 4
122 dpavlin 2 =head2 add_attr
123    
124 dpavlin 6 Add an attribute.
125    
126 dpavlin 2 $doc->add_attr( name => 'value' );
127    
128 dpavlin 9 Delete attribute using
129 dpavlin 5
130     $doc->add_attr( name => undef );
131    
132 dpavlin 2 =cut
133    
134     sub add_attr {
135     my $self = shift;
136     my $attrs = {@_};
137    
138     while (my ($name, $value) = each %{ $attrs }) {
139 dpavlin 9 if (! defined($value)) {
140 dpavlin 15 delete( $self->{attrs}->{ $self->_s($name) } );
141 dpavlin 9 } else {
142 dpavlin 15 $self->{attrs}->{ $self->_s($name) } = $self->_s($value);
143 dpavlin 9 }
144 dpavlin 2 }
145 dpavlin 8
146     return 1;
147 dpavlin 2 }
148    
149 dpavlin 5
150     =head2 add_text
151    
152 dpavlin 6 Add a sentence of text.
153    
154 dpavlin 5 $doc->add_text('this is example text to display');
155    
156     =cut
157    
158     sub add_text {
159     my $self = shift;
160     my $text = shift;
161     return unless defined($text);
162    
163 dpavlin 15 push @{ $self->{dtexts} }, $self->_s($text);
164 dpavlin 5 }
165    
166    
167     =head2 add_hidden_text
168    
169 dpavlin 6 Add a hidden sentence.
170    
171 dpavlin 5 $doc->add_hidden_text('this is example text just for search');
172    
173     =cut
174    
175     sub add_hidden_text {
176     my $self = shift;
177     my $text = shift;
178     return unless defined($text);
179    
180 dpavlin 15 push @{ $self->{htexts} }, $self->_s($text);
181 dpavlin 5 }
182    
183 dpavlin 42
184 dpavlin 6 =head2 id
185    
186     Get the ID number of document. If the object has never been registred, C<-1> is returned.
187    
188     print $doc->id;
189    
190     =cut
191    
192     sub id {
193     my $self = shift;
194     return $self->{id};
195     }
196    
197 dpavlin 42
198 dpavlin 7 =head2 attr_names
199    
200 dpavlin 9 Returns array with attribute names from document object.
201 dpavlin 7
202     my @attrs = $doc->attr_names;
203    
204     =cut
205    
206     sub attr_names {
207     my $self = shift;
208 dpavlin 60 return unless ($self->{attrs});
209     #croak "attr_names return array, not scalar" if (! wantarray);
210 dpavlin 7 return sort keys %{ $self->{attrs} };
211     }
212    
213 dpavlin 8
214     =head2 attr
215    
216 dpavlin 9 Returns value of an attribute.
217 dpavlin 8
218     my $value = $doc->attr( 'attribute' );
219    
220     =cut
221    
222     sub attr {
223     my $self = shift;
224     my $name = shift;
225 dpavlin 60 return unless (defined($name) && $self->{attrs});
226     return $self->{attrs}->{ $name };
227 dpavlin 8 }
228    
229 dpavlin 9
230     =head2 texts
231    
232     Returns array with text sentences.
233    
234     my @texts = $doc->texts;
235    
236     =cut
237    
238     sub texts {
239     my $self = shift;
240 dpavlin 60 #confess "texts return array, not scalar" if (! wantarray);
241     return @{ $self->{dtexts} } if ($self->{dtexts});
242 dpavlin 9 }
243    
244 dpavlin 42
245 dpavlin 12 =head2 cat_texts
246    
247     Return whole text as single scalar.
248    
249     my $text = $doc->cat_texts;
250    
251     =cut
252    
253     sub cat_texts {
254     my $self = shift;
255 dpavlin 60 return join(' ',@{ $self->{dtexts} }) if ($self->{dtexts});
256 dpavlin 12 }
257    
258 dpavlin 42
259 dpavlin 5 =head2 dump_draft
260    
261 dpavlin 13 Dump draft data from document object.
262    
263 dpavlin 5 print $doc->dump_draft;
264    
265     =cut
266    
267     sub dump_draft {
268 dpavlin 13 my $self = shift;
269     my $draft;
270    
271     foreach my $attr_name (sort keys %{ $self->{attrs} }) {
272     $draft .= $attr_name . '=' . $self->{attrs}->{$attr_name} . "\n";
273     }
274    
275     if ($self->{kwords}) {
276     $draft .= '%%VECTOR';
277     while (my ($key, $value) = each %{ $self->{kwords} }) {
278     $draft .= "\t$key\t$value";
279     }
280     $draft .= "\n";
281     }
282    
283     $draft .= "\n";
284    
285 dpavlin 40 $draft .= join("\n", @{ $self->{dtexts} }) . "\n" if ($self->{dtexts});
286     $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n" if ($self->{htexts});
287 dpavlin 13
288     return $draft;
289 dpavlin 5 }
290    
291 dpavlin 42
292 dpavlin 4 =head2 delete
293 dpavlin 2
294 dpavlin 4 Empty document object
295 dpavlin 2
296 dpavlin 4 $doc->delete;
297    
298 dpavlin 15 This function is addition to original Ruby API, and since it was included in C wrappers it's here as a
299     convinience. Document objects which go out of scope will be destroyed
300     automatically.
301    
302 dpavlin 4 =cut
303    
304     sub delete {
305     my $self = shift;
306    
307 dpavlin 14 foreach my $data (qw/attrs dtexts stexts kwords/) {
308 dpavlin 5 delete($self->{$data});
309     }
310 dpavlin 4
311 dpavlin 10 $self->{id} = -1;
312    
313 dpavlin 4 return 1;
314     }
315    
316    
317    
318 dpavlin 15 package Search::Estraier::Condition;
319 dpavlin 4
320 dpavlin 16 use Carp qw/confess croak/;
321    
322 dpavlin 15 use Search::Estraier;
323     our @ISA = qw/Search::Estraier/;
324 dpavlin 4
325 dpavlin 16 =head1 Search::Estraier::Condition
326    
327     =head2 new
328    
329     my $cond = new Search::HyperEstraier::Condition;
330    
331     =cut
332    
333     sub new {
334     my $class = shift;
335     my $self = {};
336     bless($self, $class);
337    
338 dpavlin 19 $self->{max} = -1;
339     $self->{options} = 0;
340    
341 dpavlin 16 $self ? return $self : return undef;
342     }
343    
344 dpavlin 42
345 dpavlin 16 =head2 set_phrase
346    
347     $cond->set_phrase('search phrase');
348    
349     =cut
350    
351     sub set_phrase {
352     my $self = shift;
353     $self->{phrase} = $self->_s( shift );
354     }
355    
356 dpavlin 42
357 dpavlin 16 =head2 add_attr
358    
359     $cond->add_attr('@URI STRINC /~dpavlin/');
360    
361     =cut
362    
363     sub add_attr {
364     my $self = shift;
365     my $attr = shift || return;
366     push @{ $self->{attrs} }, $self->_s( $attr );
367     }
368    
369 dpavlin 42
370 dpavlin 16 =head2 set_order
371    
372     $cond->set_order('@mdate NUMD');
373    
374     =cut
375    
376     sub set_order {
377     my $self = shift;
378     $self->{order} = shift;
379     }
380    
381 dpavlin 42
382 dpavlin 16 =head2 set_max
383    
384     $cond->set_max(42);
385    
386     =cut
387    
388     sub set_max {
389     my $self = shift;
390     my $max = shift;
391 dpavlin 43 croak "set_max needs number, not '$max'" unless ($max =~ m/^\d+$/);
392 dpavlin 16 $self->{max} = $max;
393     }
394    
395 dpavlin 42
396 dpavlin 16 =head2 set_options
397    
398     $cond->set_options( SURE => 1 );
399    
400     =cut
401    
402 dpavlin 15 my $options = {
403     # check N-gram keys skipping by three
404     SURE => 1 << 0,
405     # check N-gram keys skipping by two
406     USUAL => 1 << 1,
407     # without TF-IDF tuning
408     FAST => 1 << 2,
409     # with the simplified phrase
410     AGITO => 1 << 3,
411     # check every N-gram key
412     NOIDF => 1 << 4,
413     # check N-gram keys skipping by one
414     SIMPLE => 1 << 10,
415     };
416    
417 dpavlin 16 sub set_options {
418     my $self = shift;
419     my $option = shift;
420     confess "unknown option" unless ($options->{$option});
421     $self->{options} ||= $options->{$option};
422 dpavlin 4 }
423    
424 dpavlin 42
425 dpavlin 18 =head2 phrase
426    
427     Return search phrase.
428    
429     print $cond->phrase;
430    
431     =cut
432    
433     sub phrase {
434     my $self = shift;
435     return $self->{phrase};
436     }
437    
438 dpavlin 42
439 dpavlin 19 =head2 order
440 dpavlin 18
441 dpavlin 19 Return search result order.
442    
443     print $cond->order;
444    
445     =cut
446    
447     sub order {
448     my $self = shift;
449     return $self->{order};
450     }
451    
452 dpavlin 42
453 dpavlin 19 =head2 attrs
454    
455     Return search result attrs.
456    
457     my @cond_attrs = $cond->attrs;
458    
459     =cut
460    
461     sub attrs {
462     my $self = shift;
463     #croak "attrs return array, not scalar" if (! wantarray);
464 dpavlin 60 return @{ $self->{attrs} } if ($self->{attrs});
465 dpavlin 19 }
466    
467 dpavlin 42
468 dpavlin 19 =head2 max
469    
470     Return maximum number of results.
471    
472     print $cond->max;
473    
474     C<-1> is returned for unitialized value, C<0> is unlimited.
475    
476     =cut
477    
478     sub max {
479     my $self = shift;
480     return $self->{max};
481     }
482    
483 dpavlin 42
484 dpavlin 19 =head2 options
485    
486     Return options for this condition.
487    
488     print $cond->options;
489    
490     Options are returned in numerical form.
491    
492     =cut
493    
494     sub options {
495     my $self = shift;
496     return $self->{options};
497     }
498    
499    
500 dpavlin 20 package Search::Estraier::ResultDocument;
501    
502 dpavlin 24 use Carp qw/croak/;
503 dpavlin 20
504 dpavlin 24 #use Search::Estraier;
505     #our @ISA = qw/Search::Estraier/;
506 dpavlin 20
507     =head1 Search::Estraier::ResultDocument
508    
509     =head2 new
510    
511 dpavlin 23 my $rdoc = new Search::HyperEstraier::ResultDocument(
512 dpavlin 20 uri => 'http://localhost/document/uri/42',
513     attrs => {
514     foo => 1,
515     bar => 2,
516     },
517     snippet => 'this is a text of snippet'
518     keywords => 'this\tare\tkeywords'
519     );
520    
521     =cut
522    
523     sub new {
524     my $class = shift;
525     my $self = {@_};
526     bless($self, $class);
527    
528     foreach my $f (qw/uri attrs snippet keywords/) {
529     croak "missing $f for ResultDocument" unless defined($self->{$f});
530     }
531    
532     $self ? return $self : return undef;
533     }
534    
535 dpavlin 42
536 dpavlin 23 =head2 uri
537 dpavlin 20
538 dpavlin 23 Return URI of result document
539 dpavlin 20
540 dpavlin 23 print $rdoc->uri;
541    
542     =cut
543    
544     sub uri {
545     my $self = shift;
546     return $self->{uri};
547     }
548    
549    
550     =head2 attr_names
551    
552     Returns array with attribute names from result document object.
553    
554     my @attrs = $rdoc->attr_names;
555    
556     =cut
557    
558     sub attr_names {
559     my $self = shift;
560     croak "attr_names return array, not scalar" if (! wantarray);
561     return sort keys %{ $self->{attrs} };
562     }
563    
564 dpavlin 42
565 dpavlin 23 =head2 attr
566    
567     Returns value of an attribute.
568    
569     my $value = $rdoc->attr( 'attribute' );
570    
571     =cut
572    
573     sub attr {
574     my $self = shift;
575     my $name = shift || return;
576     return $self->{attrs}->{ $name };
577     }
578    
579 dpavlin 42
580 dpavlin 23 =head2 snippet
581    
582     Return snippet from result document
583    
584     print $rdoc->snippet;
585    
586     =cut
587    
588     sub snippet {
589     my $self = shift;
590     return $self->{snippet};
591     }
592    
593 dpavlin 42
594 dpavlin 23 =head2 keywords
595    
596     Return keywords from result document
597    
598     print $rdoc->keywords;
599    
600     =cut
601    
602     sub keywords {
603     my $self = shift;
604     return $self->{keywords};
605     }
606    
607    
608 dpavlin 25 package Search::Estraier::NodeResult;
609    
610     use Carp qw/croak/;
611    
612     #use Search::Estraier;
613     #our @ISA = qw/Search::Estraier/;
614    
615     =head1 Search::Estraier::NodeResult
616    
617     =head2 new
618    
619     my $res = new Search::HyperEstraier::NodeResult(
620     docs => @array_of_rdocs,
621     hits => %hash_with_hints,
622     );
623    
624     =cut
625    
626     sub new {
627     my $class = shift;
628     my $self = {@_};
629     bless($self, $class);
630    
631     foreach my $f (qw/docs hints/) {
632     croak "missing $f for ResultDocument" unless defined($self->{$f});
633     }
634    
635     $self ? return $self : return undef;
636     }
637    
638 dpavlin 42
639 dpavlin 25 =head2 doc_num
640    
641     Return number of documents
642    
643     print $res->doc_num;
644    
645     =cut
646    
647     sub doc_num {
648     my $self = shift;
649 dpavlin 53 return $#{$self->{docs}} + 1;
650 dpavlin 25 }
651    
652 dpavlin 42
653 dpavlin 25 =head2 get_doc
654    
655     Return single document
656    
657     my $doc = $res->get_doc( 42 );
658    
659     Returns undef if document doesn't exist.
660    
661     =cut
662    
663     sub get_doc {
664     my $self = shift;
665     my $num = shift;
666 dpavlin 43 croak "expect number as argument, not '$num'" unless ($num =~ m/^\d+$/);
667 dpavlin 25 return undef if ($num < 0 || $num > $self->{docs});
668     return $self->{docs}->[$num];
669     }
670    
671 dpavlin 42
672 dpavlin 25 =head2 hint
673    
674     Return specific hint from results.
675    
676     print $rec->hint( 'VERSION' );
677    
678     Possible hints are: C<VERSION>, C<NODE>, C<HIT>, C<HINT#n>, C<DOCNUM>, C<WORDNUM>,
679     C<TIME>, C<LINK#n>, C<VIEW>.
680    
681     =cut
682    
683     sub hint {
684     my $self = shift;
685     my $key = shift || return;
686     return $self->{hints}->{$key};
687     }
688    
689    
690 dpavlin 27 package Search::Estraier::Node;
691    
692 dpavlin 44 use Carp qw/carp croak confess/;
693 dpavlin 33 use URI;
694 dpavlin 36 use MIME::Base64;
695 dpavlin 33 use IO::Socket::INET;
696 dpavlin 49 use URI::Escape qw/uri_escape/;
697 dpavlin 29
698 dpavlin 27 =head1 Search::Estraier::Node
699    
700     =head2 new
701    
702     my $node = new Search::HyperEstraier::Node;
703    
704     =cut
705    
706     sub new {
707     my $class = shift;
708     my $self = {
709     pxport => -1,
710 dpavlin 33 timeout => 0, # this used to be -1
711 dpavlin 27 dnum => -1,
712     wnum => -1,
713     size => -1.0,
714     wwidth => 480,
715     hwidth => 96,
716     awidth => 96,
717     status => -1,
718     };
719     bless($self, $class);
720    
721 dpavlin 57 my $args = {@_};
722 dpavlin 39
723 dpavlin 57 $self->{debug} = $args->{debug};
724     warn "## Node debug on\n" if ($self->{debug});
725    
726 dpavlin 27 $self ? return $self : return undef;
727     }
728    
729 dpavlin 42
730 dpavlin 29 =head2 set_url
731    
732     Specify URL to node server
733    
734     $node->set_url('http://localhost:1978');
735    
736     =cut
737    
738     sub set_url {
739     my $self = shift;
740     $self->{url} = shift;
741     }
742    
743 dpavlin 42
744 dpavlin 29 =head2 set_proxy
745    
746     Specify proxy server to connect to node server
747    
748     $node->set_proxy('proxy.example.com', 8080);
749    
750     =cut
751    
752     sub set_proxy {
753     my $self = shift;
754     my ($host,$port) = @_;
755 dpavlin 43 croak "proxy port must be number, not '$port'" unless ($port =~ m/^\d+$/);
756 dpavlin 29 $self->{pxhost} = $host;
757     $self->{pxport} = $port;
758     }
759    
760 dpavlin 42
761 dpavlin 30 =head2 set_timeout
762    
763     Specify timeout of connection in seconds
764    
765     $node->set_timeout( 15 );
766    
767     =cut
768    
769     sub set_timeout {
770     my $self = shift;
771     my $sec = shift;
772 dpavlin 43 croak "timeout must be number, not '$sec'" unless ($sec =~ m/^\d+$/);
773 dpavlin 30 $self->{timeout} = $sec;
774     }
775    
776 dpavlin 42
777 dpavlin 31 =head2 set_auth
778    
779     Specify name and password for authentication to node server.
780    
781     $node->set_auth('clint','eastwood');
782    
783     =cut
784    
785     sub set_auth {
786     my $self = shift;
787     my ($login,$passwd) = @_;
788 dpavlin 40 my $basic_auth = encode_base64( "$login:$passwd" );
789     chomp($basic_auth);
790     $self->{auth} = $basic_auth;
791 dpavlin 31 }
792    
793 dpavlin 42
794 dpavlin 32 =head2 status
795    
796     Return status code of last request.
797    
798 dpavlin 40 print $node->status;
799 dpavlin 32
800     C<-1> means connection failure.
801    
802     =cut
803    
804     sub status {
805     my $self = shift;
806     return $self->{status};
807     }
808    
809 dpavlin 42
810 dpavlin 40 =head2 put_doc
811    
812 dpavlin 41 Add a document
813 dpavlin 40
814 dpavlin 41 $node->put_doc( $document_draft ) or die "can't add document";
815    
816     Return true on success or false on failture.
817    
818 dpavlin 40 =cut
819    
820     sub put_doc {
821     my $self = shift;
822     my $doc = shift || return;
823 dpavlin 47 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
824 dpavlin 41 $self->shuttle_url( $self->{url} . '/put_doc',
825     'text/x-estraier-draft',
826     $doc->dump_draft,
827     undef
828     ) == 200;
829 dpavlin 40 }
830    
831 dpavlin 41
832     =head2 out_doc
833    
834     Remove a document
835    
836     $node->out_doc( document_id ) or "can't remove document";
837    
838     Return true on success or false on failture.
839    
840     =cut
841    
842     sub out_doc {
843     my $self = shift;
844     my $id = shift || return;
845     return unless ($self->{url});
846 dpavlin 43 croak "id must be number, not '$id'" unless ($id =~ m/^\d+$/);
847 dpavlin 41 $self->shuttle_url( $self->{url} . '/out_doc',
848     'application/x-www-form-urlencoded',
849     "id=$id",
850     undef
851     ) == 200;
852     }
853    
854    
855     =head2 out_doc_by_uri
856    
857     Remove a registrated document using it's uri
858    
859 dpavlin 45 $node->out_doc_by_uri( 'file:///document/uri/42' ) or "can't remove document";
860 dpavlin 41
861     Return true on success or false on failture.
862    
863     =cut
864    
865     sub out_doc_by_uri {
866     my $self = shift;
867     my $uri = shift || return;
868     return unless ($self->{url});
869     $self->shuttle_url( $self->{url} . '/out_doc',
870     'application/x-www-form-urlencoded',
871 dpavlin 50 "uri=" . uri_escape($uri),
872 dpavlin 41 undef
873     ) == 200;
874     }
875    
876 dpavlin 42
877     =head2 edit_doc
878    
879     Edit attributes of a document
880    
881     $node->edit_doc( $document_draft ) or die "can't edit document";
882    
883     Return true on success or false on failture.
884    
885     =cut
886    
887     sub edit_doc {
888     my $self = shift;
889     my $doc = shift || return;
890 dpavlin 47 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
891 dpavlin 42 $self->shuttle_url( $self->{url} . '/edit_doc',
892     'text/x-estraier-draft',
893     $doc->dump_draft,
894     undef
895     ) == 200;
896     }
897    
898    
899 dpavlin 43 =head2 get_doc
900    
901     Retreive document
902    
903     my $doc = $node->get_doc( document_id ) or die "can't get document";
904    
905     Return true on success or false on failture.
906    
907     =cut
908    
909     sub get_doc {
910     my $self = shift;
911     my $id = shift || return;
912     return $self->_fetch_doc( id => $id );
913     }
914    
915 dpavlin 44
916 dpavlin 43 =head2 get_doc_by_uri
917    
918     Retreive document
919    
920 dpavlin 45 my $doc = $node->get_doc_by_uri( 'file:///document/uri/42' ) or die "can't get document";
921 dpavlin 43
922     Return true on success or false on failture.
923    
924     =cut
925    
926     sub get_doc_by_uri {
927     my $self = shift;
928     my $uri = shift || return;
929     return $self->_fetch_doc( uri => $uri );
930     }
931    
932 dpavlin 44
933 dpavlin 49 =head2 get_doc_attr
934    
935     Retrieve the value of an atribute from object
936    
937     my $val = $node->get_doc_attr( document_id, 'attribute_name' ) or
938     die "can't get document attribute";
939    
940     =cut
941    
942     sub get_doc_attr {
943     my $self = shift;
944     my ($id,$name) = @_;
945     return unless ($id && $name);
946     return $self->_fetch_doc( id => $id, attr => $name );
947     }
948    
949    
950     =head2 get_doc_attr_by_uri
951    
952     Retrieve the value of an atribute from object
953    
954     my $val = $node->get_doc_attr_by_uri( document_id, 'attribute_name' ) or
955     die "can't get document attribute";
956    
957     =cut
958    
959     sub get_doc_attr_by_uri {
960     my $self = shift;
961     my ($uri,$name) = @_;
962     return unless ($uri && $name);
963     return $self->_fetch_doc( uri => $uri, attr => $name );
964     }
965    
966    
967 dpavlin 44 =head2 etch_doc
968    
969     Exctract document keywords
970    
971     my $keywords = $node->etch_doc( document_id ) or die "can't etch document";
972    
973     =cut
974    
975 dpavlin 49 sub etch_doc {
976 dpavlin 44 my $self = shift;
977     my $id = shift || return;
978     return $self->_fetch_doc( id => $id, etch => 1 );
979     }
980    
981     =head2 etch_doc_by_uri
982    
983     Retreive document
984    
985 dpavlin 45 my $keywords = $node->etch_doc_by_uri( 'file:///document/uri/42' ) or die "can't etch document";
986 dpavlin 44
987     Return true on success or false on failture.
988    
989     =cut
990    
991     sub etch_doc_by_uri {
992     my $self = shift;
993     my $uri = shift || return;
994     return $self->_fetch_doc( uri => $uri, etch => 1 );
995     }
996    
997    
998 dpavlin 45 =head2 uri_to_id
999    
1000     Get ID of document specified by URI
1001    
1002     my $id = $node->uri_to_id( 'file:///document/uri/42' );
1003    
1004     =cut
1005    
1006     sub uri_to_id {
1007     my $self = shift;
1008     my $uri = shift || return;
1009     return $self->_fetch_doc( uri => $uri, path => '/uri_to_id', chomp_resbody => 1 );
1010     }
1011    
1012    
1013 dpavlin 43 =head2 _fetch_doc
1014    
1015 dpavlin 44 Private function used for implementing of C<get_doc>, C<get_doc_by_uri>,
1016     C<etch_doc>, C<etch_doc_by_uri>.
1017 dpavlin 43
1018 dpavlin 45 # this will decode received draft into Search::Estraier::Document object
1019     my $doc = $node->_fetch_doc( id => 42 );
1020     my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42' );
1021 dpavlin 43
1022 dpavlin 45 # to extract keywords, add etch
1023     my $doc = $node->_fetch_doc( id => 42, etch => 1 );
1024     my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', etch => 1 );
1025    
1026 dpavlin 49 # to get document attrubute add attr
1027     my $doc = $node->_fetch_doc( id => 42, attr => '@mdate' );
1028     my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', attr => '@mdate' );
1029    
1030 dpavlin 45 # more general form which allows implementation of
1031     # uri_to_id
1032     my $id = $node->_fetch_doc(
1033     uri => 'file:///document/uri/42',
1034     path => '/uri_to_id',
1035     chomp_resbody => 1
1036     );
1037    
1038 dpavlin 43 =cut
1039    
1040     sub _fetch_doc {
1041     my $self = shift;
1042 dpavlin 44 my $a = {@_};
1043     return unless ( ($a->{id} || $a->{uri}) && $self->{url} );
1044    
1045     my ($arg, $resbody);
1046    
1047 dpavlin 45 my $path = $a->{path} || '/get_doc';
1048 dpavlin 44 $path = '/etch_doc' if ($a->{etch});
1049    
1050     if ($a->{id}) {
1051     croak "id must be numberm not '$a->{id}'" unless ($a->{id} =~ m/^\d+$/);
1052     $arg = 'id=' . $a->{id};
1053     } elsif ($a->{uri}) {
1054 dpavlin 50 $arg = 'uri=' . uri_escape($a->{uri});
1055 dpavlin 44 } else {
1056     confess "unhandled argument. Need id or uri.";
1057 dpavlin 43 }
1058 dpavlin 44
1059 dpavlin 49 if ($a->{attr}) {
1060     $path = '/get_doc_attr';
1061     $arg .= '&attr=' . uri_escape($a->{attr});
1062     $a->{chomp_resbody} = 1;
1063     }
1064    
1065 dpavlin 44 my $rv = $self->shuttle_url( $self->{url} . $path,
1066 dpavlin 43 'application/x-www-form-urlencoded',
1067 dpavlin 44 $arg,
1068 dpavlin 45 \$resbody,
1069 dpavlin 43 );
1070 dpavlin 44
1071 dpavlin 43 return if ($rv != 200);
1072 dpavlin 44
1073     if ($a->{etch}) {
1074     $self->{kwords} = {};
1075     return +{} unless ($resbody);
1076     foreach my $l (split(/\n/, $resbody)) {
1077     my ($k,$v) = split(/\t/, $l, 2);
1078     $self->{kwords}->{$k} = $v if ($v);
1079     }
1080     return $self->{kwords};
1081 dpavlin 45 } elsif ($a->{chomp_resbody}) {
1082     return unless (defined($resbody));
1083     chomp($resbody);
1084     return $resbody;
1085 dpavlin 44 } else {
1086     return new Search::Estraier::Document($resbody);
1087     }
1088 dpavlin 43 }
1089    
1090    
1091 dpavlin 48 =head2 name
1092 dpavlin 43
1093 dpavlin 48 my $node_name = $node->name;
1094 dpavlin 43
1095 dpavlin 48 =cut
1096    
1097     sub name {
1098     my $self = shift;
1099 dpavlin 55 $self->_set_info unless ($self->{name});
1100 dpavlin 48 return $self->{name};
1101     }
1102    
1103    
1104     =head2 label
1105    
1106     my $node_label = $node->label;
1107    
1108     =cut
1109    
1110     sub label {
1111     my $self = shift;
1112 dpavlin 55 $self->_set_info unless ($self->{label});
1113 dpavlin 48 return $self->{label};
1114     }
1115    
1116    
1117     =head2 doc_num
1118    
1119     my $documents_in_node = $node->doc_num;
1120    
1121     =cut
1122    
1123     sub doc_num {
1124     my $self = shift;
1125 dpavlin 55 $self->_set_info if ($self->{dnum} < 0);
1126 dpavlin 48 return $self->{dnum};
1127     }
1128    
1129    
1130     =head2 word_num
1131    
1132     my $words_in_node = $node->word_num;
1133    
1134     =cut
1135    
1136     sub word_num {
1137     my $self = shift;
1138 dpavlin 55 $self->_set_info if ($self->{wnum} < 0);
1139 dpavlin 48 return $self->{wnum};
1140     }
1141    
1142    
1143     =head2 size
1144    
1145     my $node_size = $node->size;
1146    
1147     =cut
1148    
1149     sub size {
1150     my $self = shift;
1151 dpavlin 55 $self->_set_info if ($self->{size} < 0);
1152 dpavlin 48 return $self->{size};
1153     }
1154    
1155    
1156 dpavlin 51 =head2 search
1157 dpavlin 48
1158 dpavlin 51 Search documents which match condition
1159    
1160     my $nres = $node->search( $cond, $depth );
1161    
1162     C<$cond> is C<Search::Estraier::Condition> object, while <$depth> specifies
1163     depth for meta search.
1164    
1165     Function results C<Search::Estraier::NodeResult> object.
1166    
1167     =cut
1168    
1169     sub search {
1170     my $self = shift;
1171     my ($cond, $depth) = @_;
1172     return unless ($cond && defined($depth) && $self->{url});
1173     croak "cond mush be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1174     croak "depth needs number, not '$depth'" unless ($depth =~ m/^\d+$/);
1175    
1176 dpavlin 52 my $resbody;
1177 dpavlin 51
1178 dpavlin 52 my $rv = $self->shuttle_url( $self->{url} . '/search',
1179 dpavlin 53 'application/x-www-form-urlencoded',
1180 dpavlin 61 $self->cond_to_query( $cond, $depth ),
1181 dpavlin 52 \$resbody,
1182     );
1183     return if ($rv != 200);
1184    
1185     my (@docs, $hints);
1186    
1187     my @lines = split(/\n/, $resbody);
1188     return unless (@lines);
1189    
1190     my $border = $lines[0];
1191     my $isend = 0;
1192     my $lnum = 1;
1193    
1194     while ( $lnum <= $#lines ) {
1195     my $line = $lines[$lnum];
1196     $lnum++;
1197    
1198     #warn "## $line\n";
1199     if ($line && $line =~ m/^\Q$border\E(:END)*$/) {
1200     $isend = $1;
1201     last;
1202     }
1203    
1204     if ($line =~ /\t/) {
1205     my ($k,$v) = split(/\t/, $line, 2);
1206     $hints->{$k} = $v;
1207     }
1208     }
1209    
1210     my $snum = $lnum;
1211    
1212     while( ! $isend && $lnum <= $#lines ) {
1213     my $line = $lines[$lnum];
1214 dpavlin 53 #warn "# $lnum: $line\n";
1215 dpavlin 52 $lnum++;
1216    
1217     if ($line && $line =~ m/^\Q$border\E/) {
1218     if ($lnum > $snum) {
1219     my $rdattrs;
1220     my $rdvector;
1221     my $rdsnippet;
1222    
1223     my $rlnum = $snum;
1224     while ($rlnum < $lnum - 1 ) {
1225     #my $rdline = $self->_s($lines[$rlnum]);
1226     my $rdline = $lines[$rlnum];
1227     $rlnum++;
1228     last unless ($rdline);
1229     if ($rdline =~ /^%/) {
1230     $rdvector = $1 if ($rdline =~ /^%VECTOR\t(.+)$/);
1231 dpavlin 53 } elsif($rdline =~ /=/) {
1232     $rdattrs->{$1} = $2 if ($rdline =~ /^(.+)=(.+)$/);
1233 dpavlin 52 } else {
1234 dpavlin 53 confess "invalid format of response";
1235 dpavlin 52 }
1236     }
1237     while($rlnum < $lnum - 1) {
1238     my $rdline = $lines[$rlnum];
1239     $rlnum++;
1240     $rdsnippet .= "$rdline\n";
1241     }
1242 dpavlin 53 #warn Dumper($rdvector, $rdattrs, $rdsnippet);
1243 dpavlin 52 if (my $rduri = $rdattrs->{'@uri'}) {
1244     push @docs, new Search::Estraier::ResultDocument(
1245     uri => $rduri,
1246     attrs => $rdattrs,
1247     snippet => $rdsnippet,
1248     keywords => $rdvector,
1249     );
1250     }
1251     }
1252     $snum = $lnum;
1253     #warn "### $line\n";
1254     $isend = 1 if ($line =~ /:END$/);
1255     }
1256    
1257     }
1258    
1259     if (! $isend) {
1260     warn "received result doesn't have :END\n$resbody";
1261     return;
1262     }
1263    
1264 dpavlin 53 #warn Dumper(\@docs, $hints);
1265    
1266 dpavlin 52 return new Search::Estraier::NodeResult( docs => \@docs, hints => $hints );
1267 dpavlin 51 }
1268    
1269    
1270     =head2 cond_to_query
1271    
1272 dpavlin 55 Return URI encoded string generated from Search::Estraier::Condition
1273    
1274 dpavlin 61 my $args = $node->cond_to_query( $cond, $depth );
1275 dpavlin 51
1276     =cut
1277    
1278     sub cond_to_query {
1279     my $self = shift;
1280    
1281     my $cond = shift || return;
1282     croak "condition must be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1283 dpavlin 61 my $depth = shift;
1284 dpavlin 51
1285     my @args;
1286    
1287     if (my $phrase = $cond->phrase) {
1288     push @args, 'phrase=' . uri_escape($phrase);
1289     }
1290    
1291     if (my @attrs = $cond->attrs) {
1292     for my $i ( 0 .. $#attrs ) {
1293     push @args,'attr' . ($i+1) . '=' . uri_escape( $attrs[$i] );
1294     }
1295     }
1296    
1297     if (my $order = $cond->order) {
1298     push @args, 'order=' . uri_escape($order);
1299     }
1300    
1301     if (my $max = $cond->max) {
1302     push @args, 'max=' . $max;
1303     } else {
1304     push @args, 'max=' . (1 << 30);
1305     }
1306    
1307     if (my $options = $cond->options) {
1308     push @args, 'options=' . $options;
1309     }
1310    
1311 dpavlin 61 push @args, 'depth=' . $depth if ($depth);
1312 dpavlin 51 push @args, 'wwidth=' . $self->{wwidth};
1313     push @args, 'hwidth=' . $self->{hwidth};
1314     push @args, 'awidth=' . $self->{awidth};
1315    
1316     return join('&', @args);
1317     }
1318    
1319    
1320 dpavlin 33 =head2 shuttle_url
1321 dpavlin 32
1322 dpavlin 33 This is method which uses C<IO::Socket::INET> to communicate with Hyper Estraier node
1323     master.
1324 dpavlin 2
1325 dpavlin 52 my $rv = shuttle_url( $url, $content_type, $req_body, \$resbody );
1326 dpavlin 2
1327 dpavlin 33 C<$resheads> and C<$resbody> booleans controll if response headers and/or response
1328     body will be saved within object.
1329 dpavlin 2
1330     =cut
1331    
1332 dpavlin 59 use LWP::UserAgent;
1333    
1334 dpavlin 33 sub shuttle_url {
1335     my $self = shift;
1336 dpavlin 2
1337 dpavlin 33 my ($url, $content_type, $reqbody, $resbody) = @_;
1338 dpavlin 2
1339 dpavlin 40 $self->{status} = -1;
1340 dpavlin 33
1341 dpavlin 41 warn "## $url\n" if ($self->{debug});
1342 dpavlin 36
1343 dpavlin 33 $url = new URI($url);
1344 dpavlin 37 if (
1345     !$url || !$url->scheme || !$url->scheme eq 'http' ||
1346     !$url->host || !$url->port || $url->port < 1
1347     ) {
1348     carp "can't parse $url\n";
1349     return -1;
1350     }
1351 dpavlin 33
1352 dpavlin 59 my $ua = LWP::UserAgent->new;
1353     $ua->agent( "Search-Estraier/$Search::Estraier::VERSION" );
1354 dpavlin 33
1355 dpavlin 59 my $req;
1356 dpavlin 37 if ($reqbody) {
1357 dpavlin 59 $req = HTTP::Request->new(POST => $url);
1358 dpavlin 37 } else {
1359 dpavlin 59 $req = HTTP::Request->new(GET => $url);
1360 dpavlin 37 }
1361    
1362 dpavlin 59 $req->headers->header( 'Host' => $url->host . ":" . $url->port );
1363     $req->headers->header( 'Connection', 'close' );
1364     $req->headers->header( 'Authorization', 'Basic ' . $self->{auth} );
1365     $req->content_type( $content_type );
1366 dpavlin 37
1367 dpavlin 59 warn $req->headers->as_string,"\n" if ($self->{debug});
1368 dpavlin 2
1369 dpavlin 37 if ($reqbody) {
1370 dpavlin 41 warn "$reqbody\n" if ($self->{debug});
1371 dpavlin 59 $req->content( $reqbody );
1372 dpavlin 33 }
1373 dpavlin 2
1374 dpavlin 59 my $res = $ua->request($req) || croak "can't make request to $url: $!";
1375 dpavlin 2
1376 dpavlin 59 warn "## response status: ",$res->status_line,"\n" if ($self->{debug});
1377 dpavlin 2
1378 dpavlin 59 return -1 if (! $res->is_success);
1379 dpavlin 2
1380 dpavlin 59 ($self->{status}, $self->{status_message}) = split(/\s+/, $res->status_line, 2);
1381 dpavlin 33
1382 dpavlin 59 $$resbody .= $res->content;
1383    
1384 dpavlin 40 warn "## response body:\n$$resbody\n" if ($resbody && $self->{debug});
1385 dpavlin 39
1386 dpavlin 40 return $self->{status};
1387 dpavlin 2 }
1388    
1389 dpavlin 48
1390 dpavlin 55 =head2 set_snippet_width
1391 dpavlin 48
1392 dpavlin 55 Set width of snippets in results
1393    
1394     $node->set_snippet_width( $wwidth, $hwidth, $awidth );
1395    
1396     C<$wwidth> specifies whole width of snippet. It's C<480> by default. If it's C<0> snippet
1397     is not sent with results. If it is negative, whole document text is sent instead of snippet.
1398    
1399     C<$hwidth> specified width of strings from beginning of string. Default
1400     value is C<96>. Negative or zero value keep previous value.
1401    
1402     C<$awidth> specifies width of strings around each highlighted word. It's C<96> by default.
1403     If negative of zero value is provided previous value is kept unchanged.
1404    
1405     =cut
1406    
1407     sub set_snippet_width {
1408     my $self = shift;
1409    
1410     my ($wwidth, $hwidth, $awidth) = @_;
1411     $self->{wwidth} = $wwidth;
1412     $self->{hwidth} = $hwidth if ($hwidth >= 0);
1413     $self->{awidth} = $awidth if ($awidth >= 0);
1414     }
1415    
1416    
1417 dpavlin 56 =head2 set_user
1418 dpavlin 55
1419 dpavlin 56 Manage users of node
1420    
1421     $node->set_user( 'name', $mode );
1422    
1423     C<$mode> can be one of:
1424    
1425     =over 4
1426    
1427     =item 0
1428    
1429     delete account
1430    
1431     =item 1
1432    
1433     set administrative right for user
1434    
1435     =item 2
1436    
1437     set user account as guest
1438    
1439     =back
1440    
1441     Return true on success, otherwise false.
1442    
1443     =cut
1444    
1445     sub set_user {
1446     my $self = shift;
1447     my ($name, $mode) = @_;
1448    
1449     return unless ($self->{url});
1450     croak "mode must be number, not '$mode'" unless ($mode =~ m/^\d+$/);
1451    
1452     $self->shuttle_url( $self->{url} . '/_set_user',
1453     'text/plain',
1454     'name=' . uri_escape($name) . '&mode=' . $mode,
1455     undef
1456     ) == 200;
1457     }
1458    
1459    
1460 dpavlin 57 =head2 set_link
1461    
1462     Manage node links
1463    
1464     $node->set_link('http://localhost:1978/node/another', 'another node label', $credit);
1465    
1466     If C<$credit> is negative, link is removed.
1467    
1468     =cut
1469    
1470     sub set_link {
1471     my $self = shift;
1472     my ($url, $label, $credit) = @_;
1473    
1474     return unless ($self->{url});
1475     croak "mode credit be number, not '$credit'" unless ($credit =~ m/^\d+$/);
1476    
1477     my $reqbody = 'url=' . uri_escape($url) . '&label=' . uri_escape($label);
1478     $reqbody .= '&credit=' . $credit if ($credit > 0);
1479    
1480     $self->shuttle_url( $self->{url} . '/_set_link',
1481     'text/plain',
1482     $reqbody,
1483     undef
1484     ) == 200;
1485     }
1486    
1487    
1488 dpavlin 55 =head1 PRIVATE METHODS
1489    
1490     You could call those directly, but you don't have to. I hope.
1491    
1492     =head2 _set_info
1493    
1494 dpavlin 48 Set information for node
1495    
1496 dpavlin 55 $node->_set_info;
1497 dpavlin 48
1498     =cut
1499    
1500 dpavlin 55 sub _set_info {
1501 dpavlin 48 my $self = shift;
1502    
1503     $self->{status} = -1;
1504     return unless ($self->{url});
1505    
1506     my $resbody;
1507     my $rv = $self->shuttle_url( $self->{url} . '/inform',
1508     'text/plain',
1509     undef,
1510     \$resbody,
1511     );
1512    
1513     return if ($rv != 200 || !$resbody);
1514    
1515 dpavlin 58 # it seems that response can have multiple line endings
1516     $resbody =~ s/[\r\n]+$//;
1517 dpavlin 48
1518     ( $self->{name}, $self->{label}, $self->{dnum}, $self->{wnum}, $self->{size} ) =
1519     split(/\t/, $resbody, 5);
1520    
1521     }
1522    
1523 dpavlin 2 ###
1524    
1525     =head1 EXPORT
1526    
1527     Nothing.
1528    
1529     =head1 SEE ALSO
1530    
1531     L<http://hyperestraier.sourceforge.net/>
1532    
1533     Hyper Estraier Ruby interface on which this module is based.
1534    
1535     =head1 AUTHOR
1536    
1537     Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
1538    
1539    
1540     =head1 COPYRIGHT AND LICENSE
1541    
1542 dpavlin 15 Copyright (C) 2005-2006 by Dobrica Pavlinusic
1543 dpavlin 2
1544     This library is free software; you can redistribute it and/or modify
1545     it under the GPL v2 or later.
1546    
1547     =cut
1548    
1549     1;

  ViewVC Help
Powered by ViewVC 1.1.26