/[Search-Estraier]/trunk/lib/Search/Estraier.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/Search/Estraier.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 68 - (hide annotations)
Sun Jan 8 00:13:09 2006 UTC (18 years, 2 months ago) by dpavlin
Original Path: trunk/Estraier.pm
File size: 28774 byte(s)
two simple examples included under synopsis in documentation
1 dpavlin 2 package Search::Estraier;
2    
3     use 5.008;
4     use strict;
5     use warnings;
6    
7 dpavlin 65 our $VERSION = '0.02';
8 dpavlin 2
9     =head1 NAME
10    
11     Search::Estraier - pure perl module to use Hyper Estraier search engine
12    
13     =head1 SYNOPSIS
14    
15 dpavlin 68 =head2 Simple indexer
16 dpavlin 2
17 dpavlin 68 use Search::Estraier;
18    
19     # create and configure node
20     my $node = new Search::Estraier::Node;
21     $node->set_url("http://localhost:1978/node/test");
22     $node->set_auth("admin","admin");
23    
24     # create document
25     my $doc = new Search::Estraier::Document;
26    
27     # add attributes
28     $doc->add_attr('@uri', "http://estraier.gov/example.txt");
29     $doc->add_attr('@title', "Over the Rainbow");
30    
31     # add body text to document
32     $doc->add_text("Somewhere over the rainbow. Way up high.");
33     $doc->add_text("There's a land that I heard of once in a lullaby.");
34    
35     die "error: ", $node->status,"\n" unless ($node->put_doc($doc));
36    
37     =head2 Simple searcher
38    
39     use Search::Estraier;
40    
41     # create and configure node
42     my $node = new Search::Estraier::Node;
43     $node->set_url("http://localhost:1978/node/test");
44     $node->set_auth("admin","admin");
45    
46     # create condition
47     my $cond = new Search::Estraier::Condition;
48    
49     # set search phrase
50     $cond->set_phrase("rainbow AND lullaby");
51    
52     my $nres = $node->search($cond, 0);
53     if (defined($nres)) {
54     # for each document in results
55     for my $i ( 0 ... $nres->doc_num - 1 ) {
56     # get result document
57     my $rdoc = $nres->get_doc($i);
58     # display attribte
59     print "URI: ", $rdoc->attr('@uri'),"\n";
60     print "Title: ", $rdoc->attr('@title'),"\n";
61     print $rdoc->snippet,"\n";
62     }
63     } else {
64     die "error: ", $node->status,"\n";
65     }
66    
67 dpavlin 2 =head1 DESCRIPTION
68    
69     This module is implementation of node API of Hyper Estraier. Since it's
70     perl-only module with dependencies only on standard perl modules, it will
71     run on all platforms on which perl runs. It doesn't require compilation
72     or Hyper Estraier development files on target machine.
73    
74     It is implemented as multiple packages which closly resamble Ruby
75     implementation. It also includes methods to manage nodes.
76    
77 dpavlin 68 There are few examples in C<scripts> directory of this distribution.
78    
79 dpavlin 2 =cut
80    
81 dpavlin 42 =head1 Inheritable common methods
82    
83     This methods should really move somewhere else.
84    
85 dpavlin 15 =head2 _s
86    
87     Remove multiple whitespaces from string, as well as whitespaces at beginning or end
88    
89     my $text = $self->_s(" this is a text ");
90     $text = 'this is a text';
91    
92     =cut
93    
94     sub _s {
95     my $text = $_[1] || return;
96     $text =~ s/\s\s+/ /gs;
97     $text =~ s/^\s+//;
98     $text =~ s/\s+$//;
99     return $text;
100     }
101    
102 dpavlin 2 package Search::Estraier::Document;
103    
104 dpavlin 9 use Carp qw/croak confess/;
105 dpavlin 7
106 dpavlin 15 use Search::Estraier;
107     our @ISA = qw/Search::Estraier/;
108    
109 dpavlin 2 =head1 Search::Estraier::Document
110    
111 dpavlin 14 This class implements Document which is collection of attributes
112     (key=value), vectors (also key value) display text and hidden text.
113    
114 dpavlin 42
115 dpavlin 2 =head2 new
116    
117 dpavlin 14 Create new document, empty or from draft.
118    
119 dpavlin 2 my $doc = new Search::HyperEstraier::Document;
120 dpavlin 14 my $doc2 = new Search::HyperEstraier::Document( $draft );
121 dpavlin 2
122     =cut
123    
124     sub new {
125     my $class = shift;
126 dpavlin 14 my $self = {};
127 dpavlin 2 bless($self, $class);
128    
129 dpavlin 6 $self->{id} = -1;
130    
131 dpavlin 14 my $draft = shift;
132    
133     if ($draft) {
134     my $in_text = 0;
135     foreach my $line (split(/\n/, $draft)) {
136    
137     if ($in_text) {
138     if ($line =~ /^\t/) {
139     push @{ $self->{htexts} }, substr($line, 1);
140     } else {
141     push @{ $self->{dtexts} }, $line;
142     }
143     next;
144     }
145    
146     if ($line =~ m/^%VECTOR\t(.+)$/) {
147     my @fields = split(/\t/, $1);
148     for my $i ( 0 .. ($#fields - 1) ) {
149     $self->{kwords}->{ $fields[ $i ] } = $fields[ $i + 1 ];
150     $i++;
151     }
152     next;
153     } elsif ($line =~ m/^%/) {
154     # What is this? comment?
155     #warn "$line\n";
156     next;
157     } elsif ($line =~ m/^$/) {
158     $in_text = 1;
159     next;
160     } elsif ($line =~ m/^(.+)=(.+)$/) {
161     $self->{attrs}->{ $1 } = $2;
162     next;
163     }
164    
165     warn "draft ignored: $line\n";
166     }
167     }
168    
169 dpavlin 2 $self ? return $self : return undef;
170     }
171    
172 dpavlin 4
173 dpavlin 2 =head2 add_attr
174    
175 dpavlin 6 Add an attribute.
176    
177 dpavlin 2 $doc->add_attr( name => 'value' );
178    
179 dpavlin 9 Delete attribute using
180 dpavlin 5
181     $doc->add_attr( name => undef );
182    
183 dpavlin 2 =cut
184    
185     sub add_attr {
186     my $self = shift;
187     my $attrs = {@_};
188    
189     while (my ($name, $value) = each %{ $attrs }) {
190 dpavlin 9 if (! defined($value)) {
191 dpavlin 15 delete( $self->{attrs}->{ $self->_s($name) } );
192 dpavlin 9 } else {
193 dpavlin 15 $self->{attrs}->{ $self->_s($name) } = $self->_s($value);
194 dpavlin 9 }
195 dpavlin 2 }
196 dpavlin 8
197     return 1;
198 dpavlin 2 }
199    
200 dpavlin 5
201     =head2 add_text
202    
203 dpavlin 6 Add a sentence of text.
204    
205 dpavlin 5 $doc->add_text('this is example text to display');
206    
207     =cut
208    
209     sub add_text {
210     my $self = shift;
211     my $text = shift;
212     return unless defined($text);
213    
214 dpavlin 15 push @{ $self->{dtexts} }, $self->_s($text);
215 dpavlin 5 }
216    
217    
218     =head2 add_hidden_text
219    
220 dpavlin 6 Add a hidden sentence.
221    
222 dpavlin 5 $doc->add_hidden_text('this is example text just for search');
223    
224     =cut
225    
226     sub add_hidden_text {
227     my $self = shift;
228     my $text = shift;
229     return unless defined($text);
230    
231 dpavlin 15 push @{ $self->{htexts} }, $self->_s($text);
232 dpavlin 5 }
233    
234 dpavlin 42
235 dpavlin 6 =head2 id
236    
237     Get the ID number of document. If the object has never been registred, C<-1> is returned.
238    
239     print $doc->id;
240    
241     =cut
242    
243     sub id {
244     my $self = shift;
245     return $self->{id};
246     }
247    
248 dpavlin 42
249 dpavlin 7 =head2 attr_names
250    
251 dpavlin 9 Returns array with attribute names from document object.
252 dpavlin 7
253     my @attrs = $doc->attr_names;
254    
255     =cut
256    
257     sub attr_names {
258     my $self = shift;
259 dpavlin 60 return unless ($self->{attrs});
260     #croak "attr_names return array, not scalar" if (! wantarray);
261 dpavlin 7 return sort keys %{ $self->{attrs} };
262     }
263    
264 dpavlin 8
265     =head2 attr
266    
267 dpavlin 9 Returns value of an attribute.
268 dpavlin 8
269     my $value = $doc->attr( 'attribute' );
270    
271     =cut
272    
273     sub attr {
274     my $self = shift;
275     my $name = shift;
276 dpavlin 60 return unless (defined($name) && $self->{attrs});
277     return $self->{attrs}->{ $name };
278 dpavlin 8 }
279    
280 dpavlin 9
281     =head2 texts
282    
283     Returns array with text sentences.
284    
285     my @texts = $doc->texts;
286    
287     =cut
288    
289     sub texts {
290     my $self = shift;
291 dpavlin 60 #confess "texts return array, not scalar" if (! wantarray);
292     return @{ $self->{dtexts} } if ($self->{dtexts});
293 dpavlin 9 }
294    
295 dpavlin 42
296 dpavlin 12 =head2 cat_texts
297    
298     Return whole text as single scalar.
299    
300     my $text = $doc->cat_texts;
301    
302     =cut
303    
304     sub cat_texts {
305     my $self = shift;
306 dpavlin 60 return join(' ',@{ $self->{dtexts} }) if ($self->{dtexts});
307 dpavlin 12 }
308    
309 dpavlin 42
310 dpavlin 5 =head2 dump_draft
311    
312 dpavlin 13 Dump draft data from document object.
313    
314 dpavlin 5 print $doc->dump_draft;
315    
316     =cut
317    
318     sub dump_draft {
319 dpavlin 13 my $self = shift;
320     my $draft;
321    
322     foreach my $attr_name (sort keys %{ $self->{attrs} }) {
323     $draft .= $attr_name . '=' . $self->{attrs}->{$attr_name} . "\n";
324     }
325    
326     if ($self->{kwords}) {
327     $draft .= '%%VECTOR';
328     while (my ($key, $value) = each %{ $self->{kwords} }) {
329     $draft .= "\t$key\t$value";
330     }
331     $draft .= "\n";
332     }
333    
334     $draft .= "\n";
335    
336 dpavlin 40 $draft .= join("\n", @{ $self->{dtexts} }) . "\n" if ($self->{dtexts});
337     $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n" if ($self->{htexts});
338 dpavlin 13
339     return $draft;
340 dpavlin 5 }
341    
342 dpavlin 42
343 dpavlin 4 =head2 delete
344 dpavlin 2
345 dpavlin 4 Empty document object
346 dpavlin 2
347 dpavlin 4 $doc->delete;
348    
349 dpavlin 15 This function is addition to original Ruby API, and since it was included in C wrappers it's here as a
350     convinience. Document objects which go out of scope will be destroyed
351     automatically.
352    
353 dpavlin 4 =cut
354    
355     sub delete {
356     my $self = shift;
357    
358 dpavlin 14 foreach my $data (qw/attrs dtexts stexts kwords/) {
359 dpavlin 5 delete($self->{$data});
360     }
361 dpavlin 4
362 dpavlin 10 $self->{id} = -1;
363    
364 dpavlin 4 return 1;
365     }
366    
367    
368    
369 dpavlin 15 package Search::Estraier::Condition;
370 dpavlin 4
371 dpavlin 16 use Carp qw/confess croak/;
372    
373 dpavlin 15 use Search::Estraier;
374     our @ISA = qw/Search::Estraier/;
375 dpavlin 4
376 dpavlin 16 =head1 Search::Estraier::Condition
377    
378     =head2 new
379    
380     my $cond = new Search::HyperEstraier::Condition;
381    
382     =cut
383    
384     sub new {
385     my $class = shift;
386     my $self = {};
387     bless($self, $class);
388    
389 dpavlin 19 $self->{max} = -1;
390     $self->{options} = 0;
391    
392 dpavlin 16 $self ? return $self : return undef;
393     }
394    
395 dpavlin 42
396 dpavlin 16 =head2 set_phrase
397    
398     $cond->set_phrase('search phrase');
399    
400     =cut
401    
402     sub set_phrase {
403     my $self = shift;
404     $self->{phrase} = $self->_s( shift );
405     }
406    
407 dpavlin 42
408 dpavlin 16 =head2 add_attr
409    
410     $cond->add_attr('@URI STRINC /~dpavlin/');
411    
412     =cut
413    
414     sub add_attr {
415     my $self = shift;
416     my $attr = shift || return;
417     push @{ $self->{attrs} }, $self->_s( $attr );
418     }
419    
420 dpavlin 42
421 dpavlin 16 =head2 set_order
422    
423     $cond->set_order('@mdate NUMD');
424    
425     =cut
426    
427     sub set_order {
428     my $self = shift;
429     $self->{order} = shift;
430     }
431    
432 dpavlin 42
433 dpavlin 16 =head2 set_max
434    
435     $cond->set_max(42);
436    
437     =cut
438    
439     sub set_max {
440     my $self = shift;
441     my $max = shift;
442 dpavlin 43 croak "set_max needs number, not '$max'" unless ($max =~ m/^\d+$/);
443 dpavlin 16 $self->{max} = $max;
444     }
445    
446 dpavlin 42
447 dpavlin 16 =head2 set_options
448    
449     $cond->set_options( SURE => 1 );
450    
451     =cut
452    
453 dpavlin 15 my $options = {
454     # check N-gram keys skipping by three
455     SURE => 1 << 0,
456     # check N-gram keys skipping by two
457     USUAL => 1 << 1,
458     # without TF-IDF tuning
459     FAST => 1 << 2,
460     # with the simplified phrase
461     AGITO => 1 << 3,
462     # check every N-gram key
463     NOIDF => 1 << 4,
464     # check N-gram keys skipping by one
465     SIMPLE => 1 << 10,
466     };
467    
468 dpavlin 16 sub set_options {
469     my $self = shift;
470     my $option = shift;
471     confess "unknown option" unless ($options->{$option});
472     $self->{options} ||= $options->{$option};
473 dpavlin 4 }
474    
475 dpavlin 42
476 dpavlin 18 =head2 phrase
477    
478     Return search phrase.
479    
480     print $cond->phrase;
481    
482     =cut
483    
484     sub phrase {
485     my $self = shift;
486     return $self->{phrase};
487     }
488    
489 dpavlin 42
490 dpavlin 19 =head2 order
491 dpavlin 18
492 dpavlin 19 Return search result order.
493    
494     print $cond->order;
495    
496     =cut
497    
498     sub order {
499     my $self = shift;
500     return $self->{order};
501     }
502    
503 dpavlin 42
504 dpavlin 19 =head2 attrs
505    
506     Return search result attrs.
507    
508     my @cond_attrs = $cond->attrs;
509    
510     =cut
511    
512     sub attrs {
513     my $self = shift;
514     #croak "attrs return array, not scalar" if (! wantarray);
515 dpavlin 60 return @{ $self->{attrs} } if ($self->{attrs});
516 dpavlin 19 }
517    
518 dpavlin 42
519 dpavlin 19 =head2 max
520    
521     Return maximum number of results.
522    
523     print $cond->max;
524    
525     C<-1> is returned for unitialized value, C<0> is unlimited.
526    
527     =cut
528    
529     sub max {
530     my $self = shift;
531     return $self->{max};
532     }
533    
534 dpavlin 42
535 dpavlin 19 =head2 options
536    
537     Return options for this condition.
538    
539     print $cond->options;
540    
541     Options are returned in numerical form.
542    
543     =cut
544    
545     sub options {
546     my $self = shift;
547     return $self->{options};
548     }
549    
550    
551 dpavlin 20 package Search::Estraier::ResultDocument;
552    
553 dpavlin 24 use Carp qw/croak/;
554 dpavlin 20
555 dpavlin 24 #use Search::Estraier;
556     #our @ISA = qw/Search::Estraier/;
557 dpavlin 20
558     =head1 Search::Estraier::ResultDocument
559    
560     =head2 new
561    
562 dpavlin 23 my $rdoc = new Search::HyperEstraier::ResultDocument(
563 dpavlin 20 uri => 'http://localhost/document/uri/42',
564     attrs => {
565     foo => 1,
566     bar => 2,
567     },
568     snippet => 'this is a text of snippet'
569     keywords => 'this\tare\tkeywords'
570     );
571    
572     =cut
573    
574     sub new {
575     my $class = shift;
576     my $self = {@_};
577     bless($self, $class);
578    
579 dpavlin 62 croak "missing uri for ResultDocument" unless defined($self->{uri});
580 dpavlin 20
581     $self ? return $self : return undef;
582     }
583    
584 dpavlin 42
585 dpavlin 23 =head2 uri
586 dpavlin 20
587 dpavlin 23 Return URI of result document
588 dpavlin 20
589 dpavlin 23 print $rdoc->uri;
590    
591     =cut
592    
593     sub uri {
594     my $self = shift;
595     return $self->{uri};
596     }
597    
598    
599     =head2 attr_names
600    
601     Returns array with attribute names from result document object.
602    
603     my @attrs = $rdoc->attr_names;
604    
605     =cut
606    
607     sub attr_names {
608     my $self = shift;
609     croak "attr_names return array, not scalar" if (! wantarray);
610     return sort keys %{ $self->{attrs} };
611     }
612    
613 dpavlin 42
614 dpavlin 23 =head2 attr
615    
616     Returns value of an attribute.
617    
618     my $value = $rdoc->attr( 'attribute' );
619    
620     =cut
621    
622     sub attr {
623     my $self = shift;
624     my $name = shift || return;
625     return $self->{attrs}->{ $name };
626     }
627    
628 dpavlin 42
629 dpavlin 23 =head2 snippet
630    
631     Return snippet from result document
632    
633     print $rdoc->snippet;
634    
635     =cut
636    
637     sub snippet {
638     my $self = shift;
639     return $self->{snippet};
640     }
641    
642 dpavlin 42
643 dpavlin 23 =head2 keywords
644    
645     Return keywords from result document
646    
647     print $rdoc->keywords;
648    
649     =cut
650    
651     sub keywords {
652     my $self = shift;
653     return $self->{keywords};
654     }
655    
656    
657 dpavlin 25 package Search::Estraier::NodeResult;
658    
659     use Carp qw/croak/;
660    
661     #use Search::Estraier;
662     #our @ISA = qw/Search::Estraier/;
663    
664     =head1 Search::Estraier::NodeResult
665    
666     =head2 new
667    
668     my $res = new Search::HyperEstraier::NodeResult(
669     docs => @array_of_rdocs,
670     hits => %hash_with_hints,
671     );
672    
673     =cut
674    
675     sub new {
676     my $class = shift;
677     my $self = {@_};
678     bless($self, $class);
679    
680     foreach my $f (qw/docs hints/) {
681     croak "missing $f for ResultDocument" unless defined($self->{$f});
682     }
683    
684     $self ? return $self : return undef;
685     }
686    
687 dpavlin 42
688 dpavlin 25 =head2 doc_num
689    
690     Return number of documents
691    
692     print $res->doc_num;
693    
694     =cut
695    
696     sub doc_num {
697     my $self = shift;
698 dpavlin 53 return $#{$self->{docs}} + 1;
699 dpavlin 25 }
700    
701 dpavlin 42
702 dpavlin 25 =head2 get_doc
703    
704     Return single document
705    
706     my $doc = $res->get_doc( 42 );
707    
708     Returns undef if document doesn't exist.
709    
710     =cut
711    
712     sub get_doc {
713     my $self = shift;
714     my $num = shift;
715 dpavlin 43 croak "expect number as argument, not '$num'" unless ($num =~ m/^\d+$/);
716 dpavlin 25 return undef if ($num < 0 || $num > $self->{docs});
717     return $self->{docs}->[$num];
718     }
719    
720 dpavlin 42
721 dpavlin 25 =head2 hint
722    
723     Return specific hint from results.
724    
725     print $rec->hint( 'VERSION' );
726    
727     Possible hints are: C<VERSION>, C<NODE>, C<HIT>, C<HINT#n>, C<DOCNUM>, C<WORDNUM>,
728     C<TIME>, C<LINK#n>, C<VIEW>.
729    
730     =cut
731    
732     sub hint {
733     my $self = shift;
734     my $key = shift || return;
735     return $self->{hints}->{$key};
736     }
737    
738    
739 dpavlin 27 package Search::Estraier::Node;
740    
741 dpavlin 44 use Carp qw/carp croak confess/;
742 dpavlin 33 use URI;
743 dpavlin 36 use MIME::Base64;
744 dpavlin 33 use IO::Socket::INET;
745 dpavlin 49 use URI::Escape qw/uri_escape/;
746 dpavlin 29
747 dpavlin 27 =head1 Search::Estraier::Node
748    
749     =head2 new
750    
751     my $node = new Search::HyperEstraier::Node;
752    
753 dpavlin 65 or optionally with C<url> as parametar
754    
755     my $node = new Search::HyperEstraier::Node( 'http://localhost:1978/node/test' );
756    
757 dpavlin 27 =cut
758    
759     sub new {
760     my $class = shift;
761     my $self = {
762     pxport => -1,
763 dpavlin 33 timeout => 0, # this used to be -1
764 dpavlin 27 dnum => -1,
765     wnum => -1,
766     size => -1.0,
767     wwidth => 480,
768     hwidth => 96,
769     awidth => 96,
770     status => -1,
771     };
772     bless($self, $class);
773    
774 dpavlin 65 if ($#_ == 0) {
775     $self->{url} = shift;
776     } else {
777     my $args = {@_};
778 dpavlin 39
779 dpavlin 65 $self->{debug} = $args->{debug};
780     warn "## Node debug on\n" if ($self->{debug});
781     }
782 dpavlin 57
783 dpavlin 27 $self ? return $self : return undef;
784     }
785    
786 dpavlin 42
787 dpavlin 29 =head2 set_url
788    
789     Specify URL to node server
790    
791     $node->set_url('http://localhost:1978');
792    
793     =cut
794    
795     sub set_url {
796     my $self = shift;
797     $self->{url} = shift;
798     }
799    
800 dpavlin 42
801 dpavlin 29 =head2 set_proxy
802    
803     Specify proxy server to connect to node server
804    
805     $node->set_proxy('proxy.example.com', 8080);
806    
807     =cut
808    
809     sub set_proxy {
810     my $self = shift;
811     my ($host,$port) = @_;
812 dpavlin 43 croak "proxy port must be number, not '$port'" unless ($port =~ m/^\d+$/);
813 dpavlin 29 $self->{pxhost} = $host;
814     $self->{pxport} = $port;
815     }
816    
817 dpavlin 42
818 dpavlin 30 =head2 set_timeout
819    
820     Specify timeout of connection in seconds
821    
822     $node->set_timeout( 15 );
823    
824     =cut
825    
826     sub set_timeout {
827     my $self = shift;
828     my $sec = shift;
829 dpavlin 43 croak "timeout must be number, not '$sec'" unless ($sec =~ m/^\d+$/);
830 dpavlin 30 $self->{timeout} = $sec;
831     }
832    
833 dpavlin 42
834 dpavlin 31 =head2 set_auth
835    
836     Specify name and password for authentication to node server.
837    
838     $node->set_auth('clint','eastwood');
839    
840     =cut
841    
842     sub set_auth {
843     my $self = shift;
844     my ($login,$passwd) = @_;
845 dpavlin 40 my $basic_auth = encode_base64( "$login:$passwd" );
846     chomp($basic_auth);
847     $self->{auth} = $basic_auth;
848 dpavlin 31 }
849    
850 dpavlin 42
851 dpavlin 32 =head2 status
852    
853     Return status code of last request.
854    
855 dpavlin 40 print $node->status;
856 dpavlin 32
857     C<-1> means connection failure.
858    
859     =cut
860    
861     sub status {
862     my $self = shift;
863     return $self->{status};
864     }
865    
866 dpavlin 42
867 dpavlin 40 =head2 put_doc
868    
869 dpavlin 41 Add a document
870 dpavlin 40
871 dpavlin 41 $node->put_doc( $document_draft ) or die "can't add document";
872    
873     Return true on success or false on failture.
874    
875 dpavlin 40 =cut
876    
877     sub put_doc {
878     my $self = shift;
879     my $doc = shift || return;
880 dpavlin 47 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
881 dpavlin 41 $self->shuttle_url( $self->{url} . '/put_doc',
882     'text/x-estraier-draft',
883     $doc->dump_draft,
884     undef
885     ) == 200;
886 dpavlin 40 }
887    
888 dpavlin 41
889     =head2 out_doc
890    
891     Remove a document
892    
893     $node->out_doc( document_id ) or "can't remove document";
894    
895     Return true on success or false on failture.
896    
897     =cut
898    
899     sub out_doc {
900     my $self = shift;
901     my $id = shift || return;
902     return unless ($self->{url});
903 dpavlin 43 croak "id must be number, not '$id'" unless ($id =~ m/^\d+$/);
904 dpavlin 41 $self->shuttle_url( $self->{url} . '/out_doc',
905     'application/x-www-form-urlencoded',
906     "id=$id",
907     undef
908     ) == 200;
909     }
910    
911    
912     =head2 out_doc_by_uri
913    
914     Remove a registrated document using it's uri
915    
916 dpavlin 45 $node->out_doc_by_uri( 'file:///document/uri/42' ) or "can't remove document";
917 dpavlin 41
918     Return true on success or false on failture.
919    
920     =cut
921    
922     sub out_doc_by_uri {
923     my $self = shift;
924     my $uri = shift || return;
925     return unless ($self->{url});
926     $self->shuttle_url( $self->{url} . '/out_doc',
927     'application/x-www-form-urlencoded',
928 dpavlin 50 "uri=" . uri_escape($uri),
929 dpavlin 41 undef
930     ) == 200;
931     }
932    
933 dpavlin 42
934     =head2 edit_doc
935    
936     Edit attributes of a document
937    
938     $node->edit_doc( $document_draft ) or die "can't edit document";
939    
940     Return true on success or false on failture.
941    
942     =cut
943    
944     sub edit_doc {
945     my $self = shift;
946     my $doc = shift || return;
947 dpavlin 47 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
948 dpavlin 42 $self->shuttle_url( $self->{url} . '/edit_doc',
949     'text/x-estraier-draft',
950     $doc->dump_draft,
951     undef
952     ) == 200;
953     }
954    
955    
956 dpavlin 43 =head2 get_doc
957    
958     Retreive document
959    
960     my $doc = $node->get_doc( document_id ) or die "can't get document";
961    
962     Return true on success or false on failture.
963    
964     =cut
965    
966     sub get_doc {
967     my $self = shift;
968     my $id = shift || return;
969     return $self->_fetch_doc( id => $id );
970     }
971    
972 dpavlin 44
973 dpavlin 43 =head2 get_doc_by_uri
974    
975     Retreive document
976    
977 dpavlin 45 my $doc = $node->get_doc_by_uri( 'file:///document/uri/42' ) or die "can't get document";
978 dpavlin 43
979     Return true on success or false on failture.
980    
981     =cut
982    
983     sub get_doc_by_uri {
984     my $self = shift;
985     my $uri = shift || return;
986     return $self->_fetch_doc( uri => $uri );
987     }
988    
989 dpavlin 44
990 dpavlin 49 =head2 get_doc_attr
991    
992     Retrieve the value of an atribute from object
993    
994     my $val = $node->get_doc_attr( document_id, 'attribute_name' ) or
995     die "can't get document attribute";
996    
997     =cut
998    
999     sub get_doc_attr {
1000     my $self = shift;
1001     my ($id,$name) = @_;
1002     return unless ($id && $name);
1003     return $self->_fetch_doc( id => $id, attr => $name );
1004     }
1005    
1006    
1007     =head2 get_doc_attr_by_uri
1008    
1009     Retrieve the value of an atribute from object
1010    
1011     my $val = $node->get_doc_attr_by_uri( document_id, 'attribute_name' ) or
1012     die "can't get document attribute";
1013    
1014     =cut
1015    
1016     sub get_doc_attr_by_uri {
1017     my $self = shift;
1018     my ($uri,$name) = @_;
1019     return unless ($uri && $name);
1020     return $self->_fetch_doc( uri => $uri, attr => $name );
1021     }
1022    
1023    
1024 dpavlin 44 =head2 etch_doc
1025    
1026     Exctract document keywords
1027    
1028     my $keywords = $node->etch_doc( document_id ) or die "can't etch document";
1029    
1030     =cut
1031    
1032 dpavlin 49 sub etch_doc {
1033 dpavlin 44 my $self = shift;
1034     my $id = shift || return;
1035     return $self->_fetch_doc( id => $id, etch => 1 );
1036     }
1037    
1038     =head2 etch_doc_by_uri
1039    
1040     Retreive document
1041    
1042 dpavlin 45 my $keywords = $node->etch_doc_by_uri( 'file:///document/uri/42' ) or die "can't etch document";
1043 dpavlin 44
1044     Return true on success or false on failture.
1045    
1046     =cut
1047    
1048     sub etch_doc_by_uri {
1049     my $self = shift;
1050     my $uri = shift || return;
1051     return $self->_fetch_doc( uri => $uri, etch => 1 );
1052     }
1053    
1054    
1055 dpavlin 45 =head2 uri_to_id
1056    
1057     Get ID of document specified by URI
1058    
1059     my $id = $node->uri_to_id( 'file:///document/uri/42' );
1060    
1061     =cut
1062    
1063     sub uri_to_id {
1064     my $self = shift;
1065     my $uri = shift || return;
1066     return $self->_fetch_doc( uri => $uri, path => '/uri_to_id', chomp_resbody => 1 );
1067     }
1068    
1069    
1070 dpavlin 43 =head2 _fetch_doc
1071    
1072 dpavlin 44 Private function used for implementing of C<get_doc>, C<get_doc_by_uri>,
1073     C<etch_doc>, C<etch_doc_by_uri>.
1074 dpavlin 43
1075 dpavlin 45 # this will decode received draft into Search::Estraier::Document object
1076     my $doc = $node->_fetch_doc( id => 42 );
1077     my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42' );
1078 dpavlin 43
1079 dpavlin 45 # to extract keywords, add etch
1080     my $doc = $node->_fetch_doc( id => 42, etch => 1 );
1081     my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', etch => 1 );
1082    
1083 dpavlin 49 # to get document attrubute add attr
1084     my $doc = $node->_fetch_doc( id => 42, attr => '@mdate' );
1085     my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', attr => '@mdate' );
1086    
1087 dpavlin 45 # more general form which allows implementation of
1088     # uri_to_id
1089     my $id = $node->_fetch_doc(
1090     uri => 'file:///document/uri/42',
1091     path => '/uri_to_id',
1092     chomp_resbody => 1
1093     );
1094    
1095 dpavlin 43 =cut
1096    
1097     sub _fetch_doc {
1098     my $self = shift;
1099 dpavlin 44 my $a = {@_};
1100     return unless ( ($a->{id} || $a->{uri}) && $self->{url} );
1101    
1102     my ($arg, $resbody);
1103    
1104 dpavlin 45 my $path = $a->{path} || '/get_doc';
1105 dpavlin 44 $path = '/etch_doc' if ($a->{etch});
1106    
1107     if ($a->{id}) {
1108     croak "id must be numberm not '$a->{id}'" unless ($a->{id} =~ m/^\d+$/);
1109     $arg = 'id=' . $a->{id};
1110     } elsif ($a->{uri}) {
1111 dpavlin 50 $arg = 'uri=' . uri_escape($a->{uri});
1112 dpavlin 44 } else {
1113     confess "unhandled argument. Need id or uri.";
1114 dpavlin 43 }
1115 dpavlin 44
1116 dpavlin 49 if ($a->{attr}) {
1117     $path = '/get_doc_attr';
1118     $arg .= '&attr=' . uri_escape($a->{attr});
1119     $a->{chomp_resbody} = 1;
1120     }
1121    
1122 dpavlin 44 my $rv = $self->shuttle_url( $self->{url} . $path,
1123 dpavlin 43 'application/x-www-form-urlencoded',
1124 dpavlin 44 $arg,
1125 dpavlin 45 \$resbody,
1126 dpavlin 43 );
1127 dpavlin 44
1128 dpavlin 43 return if ($rv != 200);
1129 dpavlin 44
1130     if ($a->{etch}) {
1131     $self->{kwords} = {};
1132     return +{} unless ($resbody);
1133     foreach my $l (split(/\n/, $resbody)) {
1134     my ($k,$v) = split(/\t/, $l, 2);
1135     $self->{kwords}->{$k} = $v if ($v);
1136     }
1137     return $self->{kwords};
1138 dpavlin 45 } elsif ($a->{chomp_resbody}) {
1139     return unless (defined($resbody));
1140     chomp($resbody);
1141     return $resbody;
1142 dpavlin 44 } else {
1143     return new Search::Estraier::Document($resbody);
1144     }
1145 dpavlin 43 }
1146    
1147    
1148 dpavlin 48 =head2 name
1149 dpavlin 43
1150 dpavlin 48 my $node_name = $node->name;
1151 dpavlin 43
1152 dpavlin 48 =cut
1153    
1154     sub name {
1155     my $self = shift;
1156 dpavlin 55 $self->_set_info unless ($self->{name});
1157 dpavlin 48 return $self->{name};
1158     }
1159    
1160    
1161     =head2 label
1162    
1163     my $node_label = $node->label;
1164    
1165     =cut
1166    
1167     sub label {
1168     my $self = shift;
1169 dpavlin 55 $self->_set_info unless ($self->{label});
1170 dpavlin 48 return $self->{label};
1171     }
1172    
1173    
1174     =head2 doc_num
1175    
1176     my $documents_in_node = $node->doc_num;
1177    
1178     =cut
1179    
1180     sub doc_num {
1181     my $self = shift;
1182 dpavlin 55 $self->_set_info if ($self->{dnum} < 0);
1183 dpavlin 48 return $self->{dnum};
1184     }
1185    
1186    
1187     =head2 word_num
1188    
1189     my $words_in_node = $node->word_num;
1190    
1191     =cut
1192    
1193     sub word_num {
1194     my $self = shift;
1195 dpavlin 55 $self->_set_info if ($self->{wnum} < 0);
1196 dpavlin 48 return $self->{wnum};
1197     }
1198    
1199    
1200     =head2 size
1201    
1202     my $node_size = $node->size;
1203    
1204     =cut
1205    
1206     sub size {
1207     my $self = shift;
1208 dpavlin 55 $self->_set_info if ($self->{size} < 0);
1209 dpavlin 48 return $self->{size};
1210     }
1211    
1212    
1213 dpavlin 51 =head2 search
1214 dpavlin 48
1215 dpavlin 51 Search documents which match condition
1216    
1217     my $nres = $node->search( $cond, $depth );
1218    
1219     C<$cond> is C<Search::Estraier::Condition> object, while <$depth> specifies
1220     depth for meta search.
1221    
1222     Function results C<Search::Estraier::NodeResult> object.
1223    
1224     =cut
1225    
1226     sub search {
1227     my $self = shift;
1228     my ($cond, $depth) = @_;
1229     return unless ($cond && defined($depth) && $self->{url});
1230     croak "cond mush be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1231     croak "depth needs number, not '$depth'" unless ($depth =~ m/^\d+$/);
1232    
1233 dpavlin 52 my $resbody;
1234 dpavlin 51
1235 dpavlin 52 my $rv = $self->shuttle_url( $self->{url} . '/search',
1236 dpavlin 53 'application/x-www-form-urlencoded',
1237 dpavlin 61 $self->cond_to_query( $cond, $depth ),
1238 dpavlin 52 \$resbody,
1239     );
1240     return if ($rv != 200);
1241    
1242     my (@docs, $hints);
1243    
1244     my @lines = split(/\n/, $resbody);
1245     return unless (@lines);
1246    
1247     my $border = $lines[0];
1248     my $isend = 0;
1249     my $lnum = 1;
1250    
1251     while ( $lnum <= $#lines ) {
1252     my $line = $lines[$lnum];
1253     $lnum++;
1254    
1255     #warn "## $line\n";
1256     if ($line && $line =~ m/^\Q$border\E(:END)*$/) {
1257     $isend = $1;
1258     last;
1259     }
1260    
1261     if ($line =~ /\t/) {
1262     my ($k,$v) = split(/\t/, $line, 2);
1263     $hints->{$k} = $v;
1264     }
1265     }
1266    
1267     my $snum = $lnum;
1268    
1269     while( ! $isend && $lnum <= $#lines ) {
1270     my $line = $lines[$lnum];
1271 dpavlin 53 #warn "# $lnum: $line\n";
1272 dpavlin 52 $lnum++;
1273    
1274     if ($line && $line =~ m/^\Q$border\E/) {
1275     if ($lnum > $snum) {
1276     my $rdattrs;
1277     my $rdvector;
1278     my $rdsnippet;
1279    
1280     my $rlnum = $snum;
1281     while ($rlnum < $lnum - 1 ) {
1282     #my $rdline = $self->_s($lines[$rlnum]);
1283     my $rdline = $lines[$rlnum];
1284     $rlnum++;
1285     last unless ($rdline);
1286     if ($rdline =~ /^%/) {
1287     $rdvector = $1 if ($rdline =~ /^%VECTOR\t(.+)$/);
1288 dpavlin 53 } elsif($rdline =~ /=/) {
1289     $rdattrs->{$1} = $2 if ($rdline =~ /^(.+)=(.+)$/);
1290 dpavlin 52 } else {
1291 dpavlin 53 confess "invalid format of response";
1292 dpavlin 52 }
1293     }
1294     while($rlnum < $lnum - 1) {
1295     my $rdline = $lines[$rlnum];
1296     $rlnum++;
1297     $rdsnippet .= "$rdline\n";
1298     }
1299 dpavlin 53 #warn Dumper($rdvector, $rdattrs, $rdsnippet);
1300 dpavlin 52 if (my $rduri = $rdattrs->{'@uri'}) {
1301     push @docs, new Search::Estraier::ResultDocument(
1302     uri => $rduri,
1303     attrs => $rdattrs,
1304     snippet => $rdsnippet,
1305     keywords => $rdvector,
1306     );
1307     }
1308     }
1309     $snum = $lnum;
1310     #warn "### $line\n";
1311     $isend = 1 if ($line =~ /:END$/);
1312     }
1313    
1314     }
1315    
1316     if (! $isend) {
1317     warn "received result doesn't have :END\n$resbody";
1318     return;
1319     }
1320    
1321 dpavlin 53 #warn Dumper(\@docs, $hints);
1322    
1323 dpavlin 52 return new Search::Estraier::NodeResult( docs => \@docs, hints => $hints );
1324 dpavlin 51 }
1325    
1326    
1327     =head2 cond_to_query
1328    
1329 dpavlin 55 Return URI encoded string generated from Search::Estraier::Condition
1330    
1331 dpavlin 61 my $args = $node->cond_to_query( $cond, $depth );
1332 dpavlin 51
1333     =cut
1334    
1335     sub cond_to_query {
1336     my $self = shift;
1337    
1338     my $cond = shift || return;
1339     croak "condition must be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1340 dpavlin 61 my $depth = shift;
1341 dpavlin 51
1342     my @args;
1343    
1344     if (my $phrase = $cond->phrase) {
1345     push @args, 'phrase=' . uri_escape($phrase);
1346     }
1347    
1348     if (my @attrs = $cond->attrs) {
1349     for my $i ( 0 .. $#attrs ) {
1350 dpavlin 63 push @args,'attr' . ($i+1) . '=' . uri_escape( $attrs[$i] ) if ($attrs[$i]);
1351 dpavlin 51 }
1352     }
1353    
1354     if (my $order = $cond->order) {
1355     push @args, 'order=' . uri_escape($order);
1356     }
1357    
1358     if (my $max = $cond->max) {
1359     push @args, 'max=' . $max;
1360     } else {
1361     push @args, 'max=' . (1 << 30);
1362     }
1363    
1364     if (my $options = $cond->options) {
1365     push @args, 'options=' . $options;
1366     }
1367    
1368 dpavlin 61 push @args, 'depth=' . $depth if ($depth);
1369 dpavlin 51 push @args, 'wwidth=' . $self->{wwidth};
1370     push @args, 'hwidth=' . $self->{hwidth};
1371     push @args, 'awidth=' . $self->{awidth};
1372    
1373     return join('&', @args);
1374     }
1375    
1376    
1377 dpavlin 33 =head2 shuttle_url
1378 dpavlin 32
1379 dpavlin 68 This is method which uses C<LWP::UserAgent> to communicate with Hyper Estraier node
1380 dpavlin 33 master.
1381 dpavlin 2
1382 dpavlin 52 my $rv = shuttle_url( $url, $content_type, $req_body, \$resbody );
1383 dpavlin 2
1384 dpavlin 33 C<$resheads> and C<$resbody> booleans controll if response headers and/or response
1385     body will be saved within object.
1386 dpavlin 2
1387     =cut
1388    
1389 dpavlin 59 use LWP::UserAgent;
1390    
1391 dpavlin 33 sub shuttle_url {
1392     my $self = shift;
1393 dpavlin 2
1394 dpavlin 33 my ($url, $content_type, $reqbody, $resbody) = @_;
1395 dpavlin 2
1396 dpavlin 40 $self->{status} = -1;
1397 dpavlin 33
1398 dpavlin 41 warn "## $url\n" if ($self->{debug});
1399 dpavlin 36
1400 dpavlin 33 $url = new URI($url);
1401 dpavlin 37 if (
1402     !$url || !$url->scheme || !$url->scheme eq 'http' ||
1403     !$url->host || !$url->port || $url->port < 1
1404     ) {
1405     carp "can't parse $url\n";
1406     return -1;
1407     }
1408 dpavlin 33
1409 dpavlin 59 my $ua = LWP::UserAgent->new;
1410     $ua->agent( "Search-Estraier/$Search::Estraier::VERSION" );
1411 dpavlin 33
1412 dpavlin 59 my $req;
1413 dpavlin 37 if ($reqbody) {
1414 dpavlin 59 $req = HTTP::Request->new(POST => $url);
1415 dpavlin 37 } else {
1416 dpavlin 59 $req = HTTP::Request->new(GET => $url);
1417 dpavlin 37 }
1418    
1419 dpavlin 59 $req->headers->header( 'Host' => $url->host . ":" . $url->port );
1420     $req->headers->header( 'Connection', 'close' );
1421     $req->headers->header( 'Authorization', 'Basic ' . $self->{auth} );
1422     $req->content_type( $content_type );
1423 dpavlin 37
1424 dpavlin 59 warn $req->headers->as_string,"\n" if ($self->{debug});
1425 dpavlin 2
1426 dpavlin 37 if ($reqbody) {
1427 dpavlin 41 warn "$reqbody\n" if ($self->{debug});
1428 dpavlin 59 $req->content( $reqbody );
1429 dpavlin 33 }
1430 dpavlin 2
1431 dpavlin 59 my $res = $ua->request($req) || croak "can't make request to $url: $!";
1432 dpavlin 2
1433 dpavlin 59 warn "## response status: ",$res->status_line,"\n" if ($self->{debug});
1434 dpavlin 2
1435 dpavlin 59 return -1 if (! $res->is_success);
1436 dpavlin 2
1437 dpavlin 59 ($self->{status}, $self->{status_message}) = split(/\s+/, $res->status_line, 2);
1438 dpavlin 33
1439 dpavlin 59 $$resbody .= $res->content;
1440    
1441 dpavlin 40 warn "## response body:\n$$resbody\n" if ($resbody && $self->{debug});
1442 dpavlin 39
1443 dpavlin 40 return $self->{status};
1444 dpavlin 2 }
1445    
1446 dpavlin 48
1447 dpavlin 55 =head2 set_snippet_width
1448 dpavlin 48
1449 dpavlin 55 Set width of snippets in results
1450    
1451     $node->set_snippet_width( $wwidth, $hwidth, $awidth );
1452    
1453     C<$wwidth> specifies whole width of snippet. It's C<480> by default. If it's C<0> snippet
1454     is not sent with results. If it is negative, whole document text is sent instead of snippet.
1455    
1456     C<$hwidth> specified width of strings from beginning of string. Default
1457     value is C<96>. Negative or zero value keep previous value.
1458    
1459     C<$awidth> specifies width of strings around each highlighted word. It's C<96> by default.
1460     If negative of zero value is provided previous value is kept unchanged.
1461    
1462     =cut
1463    
1464     sub set_snippet_width {
1465     my $self = shift;
1466    
1467     my ($wwidth, $hwidth, $awidth) = @_;
1468     $self->{wwidth} = $wwidth;
1469     $self->{hwidth} = $hwidth if ($hwidth >= 0);
1470     $self->{awidth} = $awidth if ($awidth >= 0);
1471     }
1472    
1473    
1474 dpavlin 56 =head2 set_user
1475 dpavlin 55
1476 dpavlin 56 Manage users of node
1477    
1478     $node->set_user( 'name', $mode );
1479    
1480     C<$mode> can be one of:
1481    
1482     =over 4
1483    
1484     =item 0
1485    
1486     delete account
1487    
1488     =item 1
1489    
1490     set administrative right for user
1491    
1492     =item 2
1493    
1494     set user account as guest
1495    
1496     =back
1497    
1498     Return true on success, otherwise false.
1499    
1500     =cut
1501    
1502     sub set_user {
1503     my $self = shift;
1504     my ($name, $mode) = @_;
1505    
1506     return unless ($self->{url});
1507     croak "mode must be number, not '$mode'" unless ($mode =~ m/^\d+$/);
1508    
1509     $self->shuttle_url( $self->{url} . '/_set_user',
1510     'text/plain',
1511     'name=' . uri_escape($name) . '&mode=' . $mode,
1512     undef
1513     ) == 200;
1514     }
1515    
1516    
1517 dpavlin 57 =head2 set_link
1518    
1519     Manage node links
1520    
1521     $node->set_link('http://localhost:1978/node/another', 'another node label', $credit);
1522    
1523     If C<$credit> is negative, link is removed.
1524    
1525     =cut
1526    
1527     sub set_link {
1528     my $self = shift;
1529     my ($url, $label, $credit) = @_;
1530    
1531     return unless ($self->{url});
1532     croak "mode credit be number, not '$credit'" unless ($credit =~ m/^\d+$/);
1533    
1534     my $reqbody = 'url=' . uri_escape($url) . '&label=' . uri_escape($label);
1535     $reqbody .= '&credit=' . $credit if ($credit > 0);
1536    
1537     $self->shuttle_url( $self->{url} . '/_set_link',
1538     'text/plain',
1539     $reqbody,
1540     undef
1541     ) == 200;
1542     }
1543    
1544    
1545 dpavlin 55 =head1 PRIVATE METHODS
1546    
1547     You could call those directly, but you don't have to. I hope.
1548    
1549     =head2 _set_info
1550    
1551 dpavlin 48 Set information for node
1552    
1553 dpavlin 55 $node->_set_info;
1554 dpavlin 48
1555     =cut
1556    
1557 dpavlin 55 sub _set_info {
1558 dpavlin 48 my $self = shift;
1559    
1560     $self->{status} = -1;
1561     return unless ($self->{url});
1562    
1563     my $resbody;
1564     my $rv = $self->shuttle_url( $self->{url} . '/inform',
1565     'text/plain',
1566     undef,
1567     \$resbody,
1568     );
1569    
1570     return if ($rv != 200 || !$resbody);
1571    
1572 dpavlin 58 # it seems that response can have multiple line endings
1573     $resbody =~ s/[\r\n]+$//;
1574 dpavlin 48
1575     ( $self->{name}, $self->{label}, $self->{dnum}, $self->{wnum}, $self->{size} ) =
1576     split(/\t/, $resbody, 5);
1577    
1578     }
1579    
1580 dpavlin 2 ###
1581    
1582     =head1 EXPORT
1583    
1584     Nothing.
1585    
1586     =head1 SEE ALSO
1587    
1588     L<http://hyperestraier.sourceforge.net/>
1589    
1590     Hyper Estraier Ruby interface on which this module is based.
1591    
1592     =head1 AUTHOR
1593    
1594     Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
1595    
1596    
1597     =head1 COPYRIGHT AND LICENSE
1598    
1599 dpavlin 15 Copyright (C) 2005-2006 by Dobrica Pavlinusic
1600 dpavlin 2
1601     This library is free software; you can redistribute it and/or modify
1602     it under the GPL v2 or later.
1603    
1604     =cut
1605    
1606     1;

  ViewVC Help
Powered by ViewVC 1.1.26