/[Search-Estraier]/trunk/Estraier.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/Estraier.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 122 - (show annotations)
Tue May 2 10:19:47 2006 UTC (17 years, 11 months ago) by dpavlin
File size: 32258 byte(s)
fixed warning
1 package Search::Estraier;
2
3 use 5.008;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.05_1';
8
9 =head1 NAME
10
11 Search::Estraier - pure perl module to use Hyper Estraier search engine
12
13 =head1 SYNOPSIS
14
15 =head2 Simple indexer
16
17 use Search::Estraier;
18
19 # create and configure node
20 my $node = new Search::Estraier::Node(
21 url => 'http://localhost:1978/node/test',
22 user => 'admin',
23 passwd => 'admin'
24 );
25
26 # create document
27 my $doc = new Search::Estraier::Document;
28
29 # add attributes
30 $doc->add_attr('@uri', "http://estraier.gov/example.txt");
31 $doc->add_attr('@title', "Over the Rainbow");
32
33 # add body text to document
34 $doc->add_text("Somewhere over the rainbow. Way up high.");
35 $doc->add_text("There's a land that I heard of once in a lullaby.");
36
37 die "error: ", $node->status,"\n" unless (eval { $node->put_doc($doc) });
38
39 =head2 Simple searcher
40
41 use Search::Estraier;
42
43 # create and configure node
44 my $node = new Search::Estraier::Node(
45 url => 'http://localhost:1978/node/test',
46 user => 'admin',
47 passwd => 'admin',
48 croak_on_error => 1,
49 );
50
51 # create condition
52 my $cond = new Search::Estraier::Condition;
53
54 # set search phrase
55 $cond->set_phrase("rainbow AND lullaby");
56
57 my $nres = $node->search($cond, 0);
58
59 if (defined($nres)) {
60 print "Got ", $nres->hits, " results\n";
61
62 # for each document in results
63 for my $i ( 0 ... $nres->doc_num - 1 ) {
64 # get result document
65 my $rdoc = $nres->get_doc($i);
66 # display attribte
67 print "URI: ", $rdoc->attr('@uri'),"\n";
68 print "Title: ", $rdoc->attr('@title'),"\n";
69 print $rdoc->snippet,"\n";
70 }
71 } else {
72 die "error: ", $node->status,"\n";
73 }
74
75 =head1 DESCRIPTION
76
77 This module is implementation of node API of Hyper Estraier. Since it's
78 perl-only module with dependencies only on standard perl modules, it will
79 run on all platforms on which perl runs. It doesn't require compilation
80 or Hyper Estraier development files on target machine.
81
82 It is implemented as multiple packages which closly resamble Ruby
83 implementation. It also includes methods to manage nodes.
84
85 There are few examples in C<scripts> directory of this distribution.
86
87 =cut
88
89 =head1 Inheritable common methods
90
91 This methods should really move somewhere else.
92
93 =head2 _s
94
95 Remove multiple whitespaces from string, as well as whitespaces at beginning or end
96
97 my $text = $self->_s(" this is a text ");
98 $text = 'this is a text';
99
100 =cut
101
102 sub _s {
103 my $text = $_[1];
104 return unless defined($text);
105 $text =~ s/\s\s+/ /gs;
106 $text =~ s/^\s+//;
107 $text =~ s/\s+$//;
108 return $text;
109 }
110
111 package Search::Estraier::Document;
112
113 use Carp qw/croak confess/;
114
115 use Search::Estraier;
116 our @ISA = qw/Search::Estraier/;
117
118 =head1 Search::Estraier::Document
119
120 This class implements Document which is collection of attributes
121 (key=value), vectors (also key value) display text and hidden text.
122
123
124 =head2 new
125
126 Create new document, empty or from draft.
127
128 my $doc = new Search::HyperEstraier::Document;
129 my $doc2 = new Search::HyperEstraier::Document( $draft );
130
131 =cut
132
133 sub new {
134 my $class = shift;
135 my $self = {};
136 bless($self, $class);
137
138 $self->{id} = -1;
139
140 my $draft = shift;
141
142 if ($draft) {
143 my $in_text = 0;
144 foreach my $line (split(/\n/, $draft)) {
145
146 if ($in_text) {
147 if ($line =~ /^\t/) {
148 push @{ $self->{htexts} }, substr($line, 1);
149 } else {
150 push @{ $self->{dtexts} }, $line;
151 }
152 next;
153 }
154
155 if ($line =~ m/^%VECTOR\t(.+)$/) {
156 my @fields = split(/\t/, $1);
157 for my $i ( 0 .. ($#fields - 1) ) {
158 $self->{kwords}->{ $fields[ $i ] } = $fields[ $i + 1 ];
159 $i++;
160 }
161 next;
162 } elsif ($line =~ m/^%/) {
163 # What is this? comment?
164 #warn "$line\n";
165 next;
166 } elsif ($line =~ m/^$/) {
167 $in_text = 1;
168 next;
169 } elsif ($line =~ m/^(.+)=(.*)$/) {
170 $self->{attrs}->{ $1 } = $2;
171 next;
172 }
173
174 warn "draft ignored: '$line'\n";
175 }
176 }
177
178 $self ? return $self : return undef;
179 }
180
181
182 =head2 add_attr
183
184 Add an attribute.
185
186 $doc->add_attr( name => 'value' );
187
188 Delete attribute using
189
190 $doc->add_attr( name => undef );
191
192 =cut
193
194 sub add_attr {
195 my $self = shift;
196 my $attrs = {@_};
197
198 while (my ($name, $value) = each %{ $attrs }) {
199 if (! defined($value)) {
200 delete( $self->{attrs}->{ $self->_s($name) } );
201 } else {
202 $self->{attrs}->{ $self->_s($name) } = $self->_s($value);
203 }
204 }
205
206 return 1;
207 }
208
209
210 =head2 add_text
211
212 Add a sentence of text.
213
214 $doc->add_text('this is example text to display');
215
216 =cut
217
218 sub add_text {
219 my $self = shift;
220 my $text = shift;
221 return unless defined($text);
222
223 push @{ $self->{dtexts} }, $self->_s($text);
224 }
225
226
227 =head2 add_hidden_text
228
229 Add a hidden sentence.
230
231 $doc->add_hidden_text('this is example text just for search');
232
233 =cut
234
235 sub add_hidden_text {
236 my $self = shift;
237 my $text = shift;
238 return unless defined($text);
239
240 push @{ $self->{htexts} }, $self->_s($text);
241 }
242
243
244 =head2 id
245
246 Get the ID number of document. If the object has never been registred, C<-1> is returned.
247
248 print $doc->id;
249
250 =cut
251
252 sub id {
253 my $self = shift;
254 return $self->{id};
255 }
256
257
258 =head2 attr_names
259
260 Returns array with attribute names from document object.
261
262 my @attrs = $doc->attr_names;
263
264 =cut
265
266 sub attr_names {
267 my $self = shift;
268 return unless ($self->{attrs});
269 #croak "attr_names return array, not scalar" if (! wantarray);
270 return sort keys %{ $self->{attrs} };
271 }
272
273
274 =head2 attr
275
276 Returns value of an attribute.
277
278 my $value = $doc->attr( 'attribute' );
279
280 =cut
281
282 sub attr {
283 my $self = shift;
284 my $name = shift;
285 return unless (defined($name) && $self->{attrs});
286 return $self->{attrs}->{ $name };
287 }
288
289
290 =head2 texts
291
292 Returns array with text sentences.
293
294 my @texts = $doc->texts;
295
296 =cut
297
298 sub texts {
299 my $self = shift;
300 #confess "texts return array, not scalar" if (! wantarray);
301 return @{ $self->{dtexts} } if ($self->{dtexts});
302 }
303
304
305 =head2 cat_texts
306
307 Return whole text as single scalar.
308
309 my $text = $doc->cat_texts;
310
311 =cut
312
313 sub cat_texts {
314 my $self = shift;
315 return join(' ',@{ $self->{dtexts} }) if ($self->{dtexts});
316 }
317
318
319 =head2 dump_draft
320
321 Dump draft data from document object.
322
323 print $doc->dump_draft;
324
325 =cut
326
327 sub dump_draft {
328 my $self = shift;
329 my $draft;
330
331 foreach my $attr_name (sort keys %{ $self->{attrs} }) {
332 next unless defined(my $v = $self->{attrs}->{$attr_name});
333 $draft .= $attr_name . '=' . $v . "\n";
334 }
335
336 if ($self->{kwords}) {
337 $draft .= '%%VECTOR';
338 while (my ($key, $value) = each %{ $self->{kwords} }) {
339 $draft .= "\t$key\t$value";
340 }
341 $draft .= "\n";
342 }
343
344 $draft .= "\n";
345
346 $draft .= join("\n", @{ $self->{dtexts} }) . "\n" if ($self->{dtexts});
347 $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n" if ($self->{htexts});
348
349 return $draft;
350 }
351
352
353 =head2 delete
354
355 Empty document object
356
357 $doc->delete;
358
359 This function is addition to original Ruby API, and since it was included in C wrappers it's here as a
360 convinience. Document objects which go out of scope will be destroyed
361 automatically.
362
363 =cut
364
365 sub delete {
366 my $self = shift;
367
368 foreach my $data (qw/attrs dtexts stexts kwords/) {
369 delete($self->{$data});
370 }
371
372 $self->{id} = -1;
373
374 return 1;
375 }
376
377
378
379 package Search::Estraier::Condition;
380
381 use Carp qw/carp confess croak/;
382
383 use Search::Estraier;
384 our @ISA = qw/Search::Estraier/;
385
386 =head1 Search::Estraier::Condition
387
388 =head2 new
389
390 my $cond = new Search::HyperEstraier::Condition;
391
392 =cut
393
394 sub new {
395 my $class = shift;
396 my $self = {};
397 bless($self, $class);
398
399 $self->{max} = -1;
400 $self->{options} = 0;
401
402 $self ? return $self : return undef;
403 }
404
405
406 =head2 set_phrase
407
408 $cond->set_phrase('search phrase');
409
410 =cut
411
412 sub set_phrase {
413 my $self = shift;
414 $self->{phrase} = $self->_s( shift );
415 }
416
417
418 =head2 add_attr
419
420 $cond->add_attr('@URI STRINC /~dpavlin/');
421
422 =cut
423
424 sub add_attr {
425 my $self = shift;
426 my $attr = shift || return;
427 push @{ $self->{attrs} }, $self->_s( $attr );
428 }
429
430
431 =head2 set_order
432
433 $cond->set_order('@mdate NUMD');
434
435 =cut
436
437 sub set_order {
438 my $self = shift;
439 $self->{order} = shift;
440 }
441
442
443 =head2 set_max
444
445 $cond->set_max(42);
446
447 =cut
448
449 sub set_max {
450 my $self = shift;
451 my $max = shift;
452 croak "set_max needs number, not '$max'" unless ($max =~ m/^\d+$/);
453 $self->{max} = $max;
454 }
455
456
457 =head2 set_options
458
459 $cond->set_options( 'SURE' );
460
461 $cond->set_options( qw/AGITO NOIDF SIMPLE/ );
462
463 Possible options are:
464
465 =over 8
466
467 =item SURE
468
469 check every N-gram
470
471 =item USUAL
472
473 check every second N-gram
474
475 =item FAST
476
477 check every third N-gram
478
479 =item AGITO
480
481 check every fourth N-gram
482
483 =item NOIDF
484
485 don't perform TF-IDF tuning
486
487 =item SIMPLE
488
489 use simplified query phrase
490
491 =back
492
493 Skipping N-grams will speed up search, but reduce accuracy. Every call to C<set_options> will reset previous
494 options;
495
496 This option changed in version C<0.04> of this module. It's backwards compatibile.
497
498 =cut
499
500 my $options = {
501 SURE => 1 << 0,
502 USUAL => 1 << 1,
503 FAST => 1 << 2,
504 AGITO => 1 << 3,
505 NOIDF => 1 << 4,
506 SIMPLE => 1 << 10,
507 };
508
509 sub set_options {
510 my $self = shift;
511 my $opt = 0;
512 foreach my $option (@_) {
513 my $mask;
514 unless ($mask = $options->{$option}) {
515 if ($option eq '1') {
516 next;
517 } else {
518 croak "unknown option $option";
519 }
520 }
521 $opt += $mask;
522 }
523 $self->{options} = $opt;
524 }
525
526
527 =head2 phrase
528
529 Return search phrase.
530
531 print $cond->phrase;
532
533 =cut
534
535 sub phrase {
536 my $self = shift;
537 return $self->{phrase};
538 }
539
540
541 =head2 order
542
543 Return search result order.
544
545 print $cond->order;
546
547 =cut
548
549 sub order {
550 my $self = shift;
551 return $self->{order};
552 }
553
554
555 =head2 attrs
556
557 Return search result attrs.
558
559 my @cond_attrs = $cond->attrs;
560
561 =cut
562
563 sub attrs {
564 my $self = shift;
565 #croak "attrs return array, not scalar" if (! wantarray);
566 return @{ $self->{attrs} } if ($self->{attrs});
567 }
568
569
570 =head2 max
571
572 Return maximum number of results.
573
574 print $cond->max;
575
576 C<-1> is returned for unitialized value, C<0> is unlimited.
577
578 =cut
579
580 sub max {
581 my $self = shift;
582 return $self->{max};
583 }
584
585
586 =head2 options
587
588 Return options for this condition.
589
590 print $cond->options;
591
592 Options are returned in numerical form.
593
594 =cut
595
596 sub options {
597 my $self = shift;
598 return $self->{options};
599 }
600
601
602 =head2 set_skip
603
604 Set number of skipped documents from beginning of results
605
606 $cond->set_skip(42);
607
608 Similar to C<offset> in RDBMS.
609
610 =cut
611
612 sub set_skip {
613 my $self = shift;
614 $self->{skip} = shift;
615 }
616
617 =head2 skip
618
619 Return skip for this condition.
620
621 print $cond->skip;
622
623 =cut
624
625 sub skip {
626 my $self = shift;
627 return $self->{skip};
628 }
629
630
631 package Search::Estraier::ResultDocument;
632
633 use Carp qw/croak/;
634
635 #use Search::Estraier;
636 #our @ISA = qw/Search::Estraier/;
637
638 =head1 Search::Estraier::ResultDocument
639
640 =head2 new
641
642 my $rdoc = new Search::HyperEstraier::ResultDocument(
643 uri => 'http://localhost/document/uri/42',
644 attrs => {
645 foo => 1,
646 bar => 2,
647 },
648 snippet => 'this is a text of snippet'
649 keywords => 'this\tare\tkeywords'
650 );
651
652 =cut
653
654 sub new {
655 my $class = shift;
656 my $self = {@_};
657 bless($self, $class);
658
659 croak "missing uri for ResultDocument" unless defined($self->{uri});
660
661 $self ? return $self : return undef;
662 }
663
664
665 =head2 uri
666
667 Return URI of result document
668
669 print $rdoc->uri;
670
671 =cut
672
673 sub uri {
674 my $self = shift;
675 return $self->{uri};
676 }
677
678
679 =head2 attr_names
680
681 Returns array with attribute names from result document object.
682
683 my @attrs = $rdoc->attr_names;
684
685 =cut
686
687 sub attr_names {
688 my $self = shift;
689 croak "attr_names return array, not scalar" if (! wantarray);
690 return sort keys %{ $self->{attrs} };
691 }
692
693
694 =head2 attr
695
696 Returns value of an attribute.
697
698 my $value = $rdoc->attr( 'attribute' );
699
700 =cut
701
702 sub attr {
703 my $self = shift;
704 my $name = shift || return;
705 return $self->{attrs}->{ $name };
706 }
707
708
709 =head2 snippet
710
711 Return snippet from result document
712
713 print $rdoc->snippet;
714
715 =cut
716
717 sub snippet {
718 my $self = shift;
719 return $self->{snippet};
720 }
721
722
723 =head2 keywords
724
725 Return keywords from result document
726
727 print $rdoc->keywords;
728
729 =cut
730
731 sub keywords {
732 my $self = shift;
733 return $self->{keywords};
734 }
735
736
737 package Search::Estraier::NodeResult;
738
739 use Carp qw/croak/;
740
741 #use Search::Estraier;
742 #our @ISA = qw/Search::Estraier/;
743
744 =head1 Search::Estraier::NodeResult
745
746 =head2 new
747
748 my $res = new Search::HyperEstraier::NodeResult(
749 docs => @array_of_rdocs,
750 hits => %hash_with_hints,
751 );
752
753 =cut
754
755 sub new {
756 my $class = shift;
757 my $self = {@_};
758 bless($self, $class);
759
760 foreach my $f (qw/docs hints/) {
761 croak "missing $f for ResultDocument" unless defined($self->{$f});
762 }
763
764 $self ? return $self : return undef;
765 }
766
767
768 =head2 doc_num
769
770 Return number of documents
771
772 print $res->doc_num;
773
774 This will return real number of documents (limited by C<max>).
775 If you want to get total number of hits, see C<hits>.
776
777 =cut
778
779 sub doc_num {
780 my $self = shift;
781 return $#{$self->{docs}} + 1;
782 }
783
784
785 =head2 get_doc
786
787 Return single document
788
789 my $doc = $res->get_doc( 42 );
790
791 Returns undef if document doesn't exist.
792
793 =cut
794
795 sub get_doc {
796 my $self = shift;
797 my $num = shift;
798 croak "expect number as argument, not '$num'" unless ($num =~ m/^\d+$/);
799 return undef if ($num < 0 || $num > $self->{docs});
800 return $self->{docs}->[$num];
801 }
802
803
804 =head2 hint
805
806 Return specific hint from results.
807
808 print $res->hint( 'VERSION' );
809
810 Possible hints are: C<VERSION>, C<NODE>, C<HIT>, C<HINT#n>, C<DOCNUM>, C<WORDNUM>,
811 C<TIME>, C<LINK#n>, C<VIEW>.
812
813 =cut
814
815 sub hint {
816 my $self = shift;
817 my $key = shift || return;
818 return $self->{hints}->{$key};
819 }
820
821 =head2 hints
822
823 More perlish version of C<hint>. This one returns hash.
824
825 my %hints = $res->hints;
826
827 =cut
828
829 sub hints {
830 my $self = shift;
831 return $self->{hints};
832 }
833
834 =head2 hits
835
836 Syntaxtic sugar for total number of hits for this query
837
838 print $res->hits;
839
840 It's same as
841
842 print $res->hint('HIT');
843
844 but shorter.
845
846 =cut
847
848 sub hits {
849 my $self = shift;
850 return $self->{hints}->{'HIT'} || 0;
851 }
852
853 package Search::Estraier::Node;
854
855 use Carp qw/carp croak confess/;
856 use URI;
857 use MIME::Base64;
858 use IO::Socket::INET;
859 use URI::Escape qw/uri_escape/;
860
861 =head1 Search::Estraier::Node
862
863 =head2 new
864
865 my $node = new Search::HyperEstraier::Node;
866
867 or optionally with C<url> as parametar
868
869 my $node = new Search::HyperEstraier::Node( 'http://localhost:1978/node/test' );
870
871 or in more verbose form
872
873 my $node = new Search::HyperEstraier::Node(
874 url => 'http://localhost:1978/node/test',
875 debug => 1,
876 croak_on_error => 1
877 );
878
879 with following arguments:
880
881 =over 4
882
883 =item url
884
885 URL to node
886
887 =item debug
888
889 dumps a B<lot> of debugging output
890
891 =item croak_on_error
892
893 very helpful during development. It will croak on all errors instead of
894 silently returning C<-1> (which is convention of Hyper Estraier API in other
895 languages).
896
897 =back
898
899 =cut
900
901 sub new {
902 my $class = shift;
903 my $self = {
904 pxport => -1,
905 timeout => 0, # this used to be -1
906 wwidth => 480,
907 hwidth => 96,
908 awidth => 96,
909 status => -1,
910 };
911
912 bless($self, $class);
913
914 if ($#_ == 0) {
915 $self->{url} = shift;
916 } else {
917 my $args = {@_};
918
919 %$self = ( %$self, @_ );
920
921 warn "## Node debug on\n" if ($self->{debug});
922 }
923
924 $self->{inform} = {
925 dnum => -1,
926 wnum => -1,
927 size => -1.0,
928 };
929
930 $self ? return $self : return undef;
931 }
932
933
934 =head2 set_url
935
936 Specify URL to node server
937
938 $node->set_url('http://localhost:1978');
939
940 =cut
941
942 sub set_url {
943 my $self = shift;
944 $self->{url} = shift;
945 }
946
947
948 =head2 set_proxy
949
950 Specify proxy server to connect to node server
951
952 $node->set_proxy('proxy.example.com', 8080);
953
954 =cut
955
956 sub set_proxy {
957 my $self = shift;
958 my ($host,$port) = @_;
959 croak "proxy port must be number, not '$port'" unless ($port =~ m/^\d+$/);
960 $self->{pxhost} = $host;
961 $self->{pxport} = $port;
962 }
963
964
965 =head2 set_timeout
966
967 Specify timeout of connection in seconds
968
969 $node->set_timeout( 15 );
970
971 =cut
972
973 sub set_timeout {
974 my $self = shift;
975 my $sec = shift;
976 croak "timeout must be number, not '$sec'" unless ($sec =~ m/^\d+$/);
977 $self->{timeout} = $sec;
978 }
979
980
981 =head2 set_auth
982
983 Specify name and password for authentication to node server.
984
985 $node->set_auth('clint','eastwood');
986
987 =cut
988
989 sub set_auth {
990 my $self = shift;
991 my ($login,$passwd) = @_;
992 my $basic_auth = encode_base64( "$login:$passwd" );
993 chomp($basic_auth);
994 $self->{auth} = $basic_auth;
995 }
996
997
998 =head2 status
999
1000 Return status code of last request.
1001
1002 print $node->status;
1003
1004 C<-1> means connection failure.
1005
1006 =cut
1007
1008 sub status {
1009 my $self = shift;
1010 return $self->{status};
1011 }
1012
1013
1014 =head2 put_doc
1015
1016 Add a document
1017
1018 $node->put_doc( $document_draft ) or die "can't add document";
1019
1020 Return true on success or false on failture.
1021
1022 =cut
1023
1024 sub put_doc {
1025 my $self = shift;
1026 my $doc = shift || return;
1027 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
1028 $self->shuttle_url( $self->{url} . '/put_doc',
1029 'text/x-estraier-draft',
1030 $doc->dump_draft,
1031 undef
1032 ) == 200;
1033 }
1034
1035
1036 =head2 out_doc
1037
1038 Remove a document
1039
1040 $node->out_doc( document_id ) or "can't remove document";
1041
1042 Return true on success or false on failture.
1043
1044 =cut
1045
1046 sub out_doc {
1047 my $self = shift;
1048 my $id = shift || return;
1049 return unless ($self->{url});
1050 croak "id must be number, not '$id'" unless ($id =~ m/^\d+$/);
1051 $self->shuttle_url( $self->{url} . '/out_doc',
1052 'application/x-www-form-urlencoded',
1053 "id=$id",
1054 undef
1055 ) == 200;
1056 }
1057
1058
1059 =head2 out_doc_by_uri
1060
1061 Remove a registrated document using it's uri
1062
1063 $node->out_doc_by_uri( 'file:///document/uri/42' ) or "can't remove document";
1064
1065 Return true on success or false on failture.
1066
1067 =cut
1068
1069 sub out_doc_by_uri {
1070 my $self = shift;
1071 my $uri = shift || return;
1072 return unless ($self->{url});
1073 $self->shuttle_url( $self->{url} . '/out_doc',
1074 'application/x-www-form-urlencoded',
1075 "uri=" . uri_escape($uri),
1076 undef
1077 ) == 200;
1078 }
1079
1080
1081 =head2 edit_doc
1082
1083 Edit attributes of a document
1084
1085 $node->edit_doc( $document_draft ) or die "can't edit document";
1086
1087 Return true on success or false on failture.
1088
1089 =cut
1090
1091 sub edit_doc {
1092 my $self = shift;
1093 my $doc = shift || return;
1094 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
1095 $self->shuttle_url( $self->{url} . '/edit_doc',
1096 'text/x-estraier-draft',
1097 $doc->dump_draft,
1098 undef
1099 ) == 200;
1100 }
1101
1102
1103 =head2 get_doc
1104
1105 Retreive document
1106
1107 my $doc = $node->get_doc( document_id ) or die "can't get document";
1108
1109 Return true on success or false on failture.
1110
1111 =cut
1112
1113 sub get_doc {
1114 my $self = shift;
1115 my $id = shift || return;
1116 return $self->_fetch_doc( id => $id );
1117 }
1118
1119
1120 =head2 get_doc_by_uri
1121
1122 Retreive document
1123
1124 my $doc = $node->get_doc_by_uri( 'file:///document/uri/42' ) or die "can't get document";
1125
1126 Return true on success or false on failture.
1127
1128 =cut
1129
1130 sub get_doc_by_uri {
1131 my $self = shift;
1132 my $uri = shift || return;
1133 return $self->_fetch_doc( uri => $uri );
1134 }
1135
1136
1137 =head2 get_doc_attr
1138
1139 Retrieve the value of an atribute from object
1140
1141 my $val = $node->get_doc_attr( document_id, 'attribute_name' ) or
1142 die "can't get document attribute";
1143
1144 =cut
1145
1146 sub get_doc_attr {
1147 my $self = shift;
1148 my ($id,$name) = @_;
1149 return unless ($id && $name);
1150 return $self->_fetch_doc( id => $id, attr => $name );
1151 }
1152
1153
1154 =head2 get_doc_attr_by_uri
1155
1156 Retrieve the value of an atribute from object
1157
1158 my $val = $node->get_doc_attr_by_uri( document_id, 'attribute_name' ) or
1159 die "can't get document attribute";
1160
1161 =cut
1162
1163 sub get_doc_attr_by_uri {
1164 my $self = shift;
1165 my ($uri,$name) = @_;
1166 return unless ($uri && $name);
1167 return $self->_fetch_doc( uri => $uri, attr => $name );
1168 }
1169
1170
1171 =head2 etch_doc
1172
1173 Exctract document keywords
1174
1175 my $keywords = $node->etch_doc( document_id ) or die "can't etch document";
1176
1177 =cut
1178
1179 sub etch_doc {
1180 my $self = shift;
1181 my $id = shift || return;
1182 return $self->_fetch_doc( id => $id, etch => 1 );
1183 }
1184
1185 =head2 etch_doc_by_uri
1186
1187 Retreive document
1188
1189 my $keywords = $node->etch_doc_by_uri( 'file:///document/uri/42' ) or die "can't etch document";
1190
1191 Return true on success or false on failture.
1192
1193 =cut
1194
1195 sub etch_doc_by_uri {
1196 my $self = shift;
1197 my $uri = shift || return;
1198 return $self->_fetch_doc( uri => $uri, etch => 1 );
1199 }
1200
1201
1202 =head2 uri_to_id
1203
1204 Get ID of document specified by URI
1205
1206 my $id = $node->uri_to_id( 'file:///document/uri/42' );
1207
1208 This method won't croak, even if using C<croak_on_error>.
1209
1210 =cut
1211
1212 sub uri_to_id {
1213 my $self = shift;
1214 my $uri = shift || return;
1215 return $self->_fetch_doc( uri => $uri, path => '/uri_to_id', chomp_resbody => 1, croak_on_error => 0 );
1216 }
1217
1218
1219 =head2 _fetch_doc
1220
1221 Private function used for implementing of C<get_doc>, C<get_doc_by_uri>,
1222 C<etch_doc>, C<etch_doc_by_uri>.
1223
1224 # this will decode received draft into Search::Estraier::Document object
1225 my $doc = $node->_fetch_doc( id => 42 );
1226 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42' );
1227
1228 # to extract keywords, add etch
1229 my $doc = $node->_fetch_doc( id => 42, etch => 1 );
1230 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', etch => 1 );
1231
1232 # to get document attrubute add attr
1233 my $doc = $node->_fetch_doc( id => 42, attr => '@mdate' );
1234 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', attr => '@mdate' );
1235
1236 # more general form which allows implementation of
1237 # uri_to_id
1238 my $id = $node->_fetch_doc(
1239 uri => 'file:///document/uri/42',
1240 path => '/uri_to_id',
1241 chomp_resbody => 1
1242 );
1243
1244 =cut
1245
1246 sub _fetch_doc {
1247 my $self = shift;
1248 my $a = {@_};
1249 return unless ( ($a->{id} || $a->{uri}) && $self->{url} );
1250
1251 my ($arg, $resbody);
1252
1253 my $path = $a->{path} || '/get_doc';
1254 $path = '/etch_doc' if ($a->{etch});
1255
1256 if ($a->{id}) {
1257 croak "id must be numberm not '$a->{id}'" unless ($a->{id} =~ m/^\d+$/);
1258 $arg = 'id=' . $a->{id};
1259 } elsif ($a->{uri}) {
1260 $arg = 'uri=' . uri_escape($a->{uri});
1261 } else {
1262 confess "unhandled argument. Need id or uri.";
1263 }
1264
1265 if ($a->{attr}) {
1266 $path = '/get_doc_attr';
1267 $arg .= '&attr=' . uri_escape($a->{attr});
1268 $a->{chomp_resbody} = 1;
1269 }
1270
1271 my $rv = $self->shuttle_url( $self->{url} . $path,
1272 'application/x-www-form-urlencoded',
1273 $arg,
1274 \$resbody,
1275 $a->{croak_on_error},
1276 );
1277
1278 return if ($rv != 200);
1279
1280 if ($a->{etch}) {
1281 $self->{kwords} = {};
1282 return +{} unless ($resbody);
1283 foreach my $l (split(/\n/, $resbody)) {
1284 my ($k,$v) = split(/\t/, $l, 2);
1285 $self->{kwords}->{$k} = $v if ($v);
1286 }
1287 return $self->{kwords};
1288 } elsif ($a->{chomp_resbody}) {
1289 return unless (defined($resbody));
1290 chomp($resbody);
1291 return $resbody;
1292 } else {
1293 return new Search::Estraier::Document($resbody);
1294 }
1295 }
1296
1297
1298 =head2 name
1299
1300 my $node_name = $node->name;
1301
1302 =cut
1303
1304 sub name {
1305 my $self = shift;
1306 $self->_set_info unless ($self->{inform}->{name});
1307 return $self->{inform}->{name};
1308 }
1309
1310
1311 =head2 label
1312
1313 my $node_label = $node->label;
1314
1315 =cut
1316
1317 sub label {
1318 my $self = shift;
1319 $self->_set_info unless ($self->{inform}->{label});
1320 return $self->{inform}->{label};
1321 }
1322
1323
1324 =head2 doc_num
1325
1326 my $documents_in_node = $node->doc_num;
1327
1328 =cut
1329
1330 sub doc_num {
1331 my $self = shift;
1332 $self->_set_info if ($self->{inform}->{dnum} < 0);
1333 return $self->{inform}->{dnum};
1334 }
1335
1336
1337 =head2 word_num
1338
1339 my $words_in_node = $node->word_num;
1340
1341 =cut
1342
1343 sub word_num {
1344 my $self = shift;
1345 $self->_set_info if ($self->{inform}->{wnum} < 0);
1346 return $self->{inform}->{wnum};
1347 }
1348
1349
1350 =head2 size
1351
1352 my $node_size = $node->size;
1353
1354 =cut
1355
1356 sub size {
1357 my $self = shift;
1358 $self->_set_info if ($self->{inform}->{size} < 0);
1359 return $self->{inform}->{size};
1360 }
1361
1362
1363 =head2 search
1364
1365 Search documents which match condition
1366
1367 my $nres = $node->search( $cond, $depth );
1368
1369 C<$cond> is C<Search::Estraier::Condition> object, while <$depth> specifies
1370 depth for meta search.
1371
1372 Function results C<Search::Estraier::NodeResult> object.
1373
1374 =cut
1375
1376 sub search {
1377 my $self = shift;
1378 my ($cond, $depth) = @_;
1379 return unless ($cond && defined($depth) && $self->{url});
1380 croak "cond mush be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1381 croak "depth needs number, not '$depth'" unless ($depth =~ m/^\d+$/);
1382
1383 my $resbody;
1384
1385 my $rv = $self->shuttle_url( $self->{url} . '/search',
1386 'application/x-www-form-urlencoded',
1387 $self->cond_to_query( $cond, $depth ),
1388 \$resbody,
1389 );
1390 return if ($rv != 200);
1391
1392 my (@docs, $hints);
1393
1394 my @lines = split(/\n/, $resbody);
1395 return unless (@lines);
1396
1397 my $border = $lines[0];
1398 my $isend = 0;
1399 my $lnum = 1;
1400
1401 while ( $lnum <= $#lines ) {
1402 my $line = $lines[$lnum];
1403 $lnum++;
1404
1405 #warn "## $line\n";
1406 if ($line && $line =~ m/^\Q$border\E(:END)*$/) {
1407 $isend = $1;
1408 last;
1409 }
1410
1411 if ($line =~ /\t/) {
1412 my ($k,$v) = split(/\t/, $line, 2);
1413 $hints->{$k} = $v;
1414 }
1415 }
1416
1417 my $snum = $lnum;
1418
1419 while( ! $isend && $lnum <= $#lines ) {
1420 my $line = $lines[$lnum];
1421 #warn "# $lnum: $line\n";
1422 $lnum++;
1423
1424 if ($line && $line =~ m/^\Q$border\E/) {
1425 if ($lnum > $snum) {
1426 my $rdattrs;
1427 my $rdvector;
1428 my $rdsnippet;
1429
1430 my $rlnum = $snum;
1431 while ($rlnum < $lnum - 1 ) {
1432 #my $rdline = $self->_s($lines[$rlnum]);
1433 my $rdline = $lines[$rlnum];
1434 $rlnum++;
1435 last unless ($rdline);
1436 if ($rdline =~ /^%/) {
1437 $rdvector = $1 if ($rdline =~ /^%VECTOR\t(.+)$/);
1438 } elsif($rdline =~ /=/) {
1439 $rdattrs->{$1} = $2 if ($rdline =~ /^(.+)=(.+)$/);
1440 } else {
1441 confess "invalid format of response";
1442 }
1443 }
1444 while($rlnum < $lnum - 1) {
1445 my $rdline = $lines[$rlnum];
1446 $rlnum++;
1447 $rdsnippet .= "$rdline\n";
1448 }
1449 #warn Dumper($rdvector, $rdattrs, $rdsnippet);
1450 if (my $rduri = $rdattrs->{'@uri'}) {
1451 push @docs, new Search::Estraier::ResultDocument(
1452 uri => $rduri,
1453 attrs => $rdattrs,
1454 snippet => $rdsnippet,
1455 keywords => $rdvector,
1456 );
1457 }
1458 }
1459 $snum = $lnum;
1460 #warn "### $line\n";
1461 $isend = 1 if ($line =~ /:END$/);
1462 }
1463
1464 }
1465
1466 if (! $isend) {
1467 warn "received result doesn't have :END\n$resbody";
1468 return;
1469 }
1470
1471 #warn Dumper(\@docs, $hints);
1472
1473 return new Search::Estraier::NodeResult( docs => \@docs, hints => $hints );
1474 }
1475
1476
1477 =head2 cond_to_query
1478
1479 Return URI encoded string generated from Search::Estraier::Condition
1480
1481 my $args = $node->cond_to_query( $cond, $depth );
1482
1483 =cut
1484
1485 sub cond_to_query {
1486 my $self = shift;
1487
1488 my $cond = shift || return;
1489 croak "condition must be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1490 my $depth = shift;
1491
1492 my @args;
1493
1494 if (my $phrase = $cond->phrase) {
1495 push @args, 'phrase=' . uri_escape($phrase);
1496 }
1497
1498 if (my @attrs = $cond->attrs) {
1499 for my $i ( 0 .. $#attrs ) {
1500 push @args,'attr' . ($i+1) . '=' . uri_escape( $attrs[$i] ) if ($attrs[$i]);
1501 }
1502 }
1503
1504 if (my $order = $cond->order) {
1505 push @args, 'order=' . uri_escape($order);
1506 }
1507
1508 if (my $max = $cond->max) {
1509 push @args, 'max=' . $max;
1510 } else {
1511 push @args, 'max=' . (1 << 30);
1512 }
1513
1514 if (my $options = $cond->options) {
1515 push @args, 'options=' . $options;
1516 }
1517
1518 push @args, 'depth=' . $depth if ($depth);
1519 push @args, 'wwidth=' . $self->{wwidth};
1520 push @args, 'hwidth=' . $self->{hwidth};
1521 push @args, 'awidth=' . $self->{awidth};
1522 push @args, 'skip=' . $self->{skip} if ($self->{skip});
1523
1524 return join('&', @args);
1525 }
1526
1527
1528 =head2 shuttle_url
1529
1530 This is method which uses C<LWP::UserAgent> to communicate with Hyper Estraier node
1531 master.
1532
1533 my $rv = shuttle_url( $url, $content_type, $req_body, \$resbody );
1534
1535 C<$resheads> and C<$resbody> booleans controll if response headers and/or response
1536 body will be saved within object.
1537
1538 =cut
1539
1540 use LWP::UserAgent;
1541
1542 sub shuttle_url {
1543 my $self = shift;
1544
1545 my ($url, $content_type, $reqbody, $resbody, $croak_on_error) = @_;
1546
1547 $croak_on_error = $self->{croak_on_error} unless defined($croak_on_error);
1548
1549 $self->{status} = -1;
1550
1551 warn "## $url\n" if ($self->{debug});
1552
1553 $url = new URI($url);
1554 if (
1555 !$url || !$url->scheme || !$url->scheme eq 'http' ||
1556 !$url->host || !$url->port || $url->port < 1
1557 ) {
1558 carp "can't parse $url\n";
1559 return -1;
1560 }
1561
1562 my $ua = LWP::UserAgent->new;
1563 $ua->agent( "Search-Estraier/$Search::Estraier::VERSION" );
1564
1565 my $req;
1566 if ($reqbody) {
1567 $req = HTTP::Request->new(POST => $url);
1568 } else {
1569 $req = HTTP::Request->new(GET => $url);
1570 }
1571
1572 $req->headers->header( 'Host' => $url->host . ":" . $url->port );
1573 $req->headers->header( 'Connection', 'close' );
1574 $req->headers->header( 'Authorization', 'Basic ' . $self->{auth} ) if ($self->{auth});
1575 $req->content_type( $content_type );
1576
1577 warn $req->headers->as_string,"\n" if ($self->{debug});
1578
1579 if ($reqbody) {
1580 warn "$reqbody\n" if ($self->{debug});
1581 $req->content( $reqbody );
1582 }
1583
1584 my $res = $ua->request($req) || croak "can't make request to $url: $!";
1585
1586 warn "## response status: ",$res->status_line,"\n" if ($self->{debug});
1587
1588 ($self->{status}, $self->{status_message}) = split(/\s+/, $res->status_line, 2);
1589
1590 if (! $res->is_success) {
1591 if ($croak_on_error) {
1592 croak("can't get $url: ",$res->status_line);
1593 } else {
1594 return -1;
1595 }
1596 }
1597
1598 $$resbody .= $res->content;
1599
1600 warn "## response body:\n$$resbody\n" if ($resbody && $self->{debug});
1601
1602 return $self->{status};
1603 }
1604
1605
1606 =head2 set_snippet_width
1607
1608 Set width of snippets in results
1609
1610 $node->set_snippet_width( $wwidth, $hwidth, $awidth );
1611
1612 C<$wwidth> specifies whole width of snippet. It's C<480> by default. If it's C<0> snippet
1613 is not sent with results. If it is negative, whole document text is sent instead of snippet.
1614
1615 C<$hwidth> specified width of strings from beginning of string. Default
1616 value is C<96>. Negative or zero value keep previous value.
1617
1618 C<$awidth> specifies width of strings around each highlighted word. It's C<96> by default.
1619 If negative of zero value is provided previous value is kept unchanged.
1620
1621 =cut
1622
1623 sub set_snippet_width {
1624 my $self = shift;
1625
1626 my ($wwidth, $hwidth, $awidth) = @_;
1627 $self->{wwidth} = $wwidth;
1628 $self->{hwidth} = $hwidth if ($hwidth >= 0);
1629 $self->{awidth} = $awidth if ($awidth >= 0);
1630 }
1631
1632
1633 =head2 set_user
1634
1635 Manage users of node
1636
1637 $node->set_user( 'name', $mode );
1638
1639 C<$mode> can be one of:
1640
1641 =over 4
1642
1643 =item 0
1644
1645 delete account
1646
1647 =item 1
1648
1649 set administrative right for user
1650
1651 =item 2
1652
1653 set user account as guest
1654
1655 =back
1656
1657 Return true on success, otherwise false.
1658
1659 =cut
1660
1661 sub set_user {
1662 my $self = shift;
1663 my ($name, $mode) = @_;
1664
1665 return unless ($self->{url});
1666 croak "mode must be number, not '$mode'" unless ($mode =~ m/^\d+$/);
1667
1668 $self->shuttle_url( $self->{url} . '/_set_user',
1669 'text/plain',
1670 'name=' . uri_escape($name) . '&mode=' . $mode,
1671 undef
1672 ) == 200;
1673 }
1674
1675
1676 =head2 set_link
1677
1678 Manage node links
1679
1680 $node->set_link('http://localhost:1978/node/another', 'another node label', $credit);
1681
1682 If C<$credit> is negative, link is removed.
1683
1684 =cut
1685
1686 sub set_link {
1687 my $self = shift;
1688 my ($url, $label, $credit) = @_;
1689
1690 return unless ($self->{url});
1691 croak "mode credit be number, not '$credit'" unless ($credit =~ m/^\d+$/);
1692
1693 my $reqbody = 'url=' . uri_escape($url) . '&label=' . uri_escape($label);
1694 $reqbody .= '&credit=' . $credit if ($credit > 0);
1695
1696 if ($self->shuttle_url( $self->{url} . '/_set_link',
1697 'application/x-www-form-urlencoded',
1698 $reqbody,
1699 undef
1700 ) == 200) {
1701 # refresh node info after adding link
1702 $self->_set_info;
1703 return 1;
1704 }
1705 }
1706
1707 =head2 admins
1708
1709 my @admins = @{ $node->admins };
1710
1711 Return array of users with admin rights on node
1712
1713 =cut
1714
1715 sub admins {
1716 my $self = shift;
1717 $self->_set_info unless ($self->{inform}->{name});
1718 return $self->{inform}->{admins};
1719 }
1720
1721 =head2 guests
1722
1723 my @guests = @{ $node->guests };
1724
1725 Return array of users with guest rights on node
1726
1727 =cut
1728
1729 sub guests {
1730 my $self = shift;
1731 $self->_set_info unless ($self->{inform}->{name});
1732 return $self->{inform}->{guests};
1733 }
1734
1735 =head2 links
1736
1737 my $links = @{ $node->links };
1738
1739 Return array of links for this node
1740
1741 =cut
1742
1743 sub links {
1744 my $self = shift;
1745 $self->_set_info unless ($self->{inform}->{name});
1746 return $self->{inform}->{links};
1747 }
1748
1749
1750 =head1 PRIVATE METHODS
1751
1752 You could call those directly, but you don't have to. I hope.
1753
1754 =head2 _set_info
1755
1756 Set information for node
1757
1758 $node->_set_info;
1759
1760 =cut
1761
1762 sub _set_info {
1763 my $self = shift;
1764
1765 $self->{status} = -1;
1766 return unless ($self->{url});
1767
1768 my $resbody;
1769 my $rv = $self->shuttle_url( $self->{url} . '/inform',
1770 'text/plain',
1771 undef,
1772 \$resbody,
1773 );
1774
1775 return if ($rv != 200 || !$resbody);
1776
1777 my @lines = split(/[\r\n]/,$resbody);
1778
1779 $self->{inform} = {};
1780
1781 ( $self->{inform}->{name}, $self->{inform}->{label}, $self->{inform}->{dnum},
1782 $self->{inform}->{wnum}, $self->{inform}->{size} ) = split(/\t/, shift @lines, 5);
1783
1784 return $resbody unless (@lines);
1785
1786 shift @lines;
1787
1788 while(my $admin = shift @lines) {
1789 push @{$self->{inform}->{admins}}, $admin;
1790 }
1791
1792 while(my $guest = shift @lines) {
1793 push @{$self->{inform}->{guests}}, $guest;
1794 }
1795
1796 while(my $link = shift @lines) {
1797 push @{$self->{inform}->{links}}, $link;
1798 }
1799
1800 return $resbody;
1801
1802 }
1803
1804 ###
1805
1806 =head1 EXPORT
1807
1808 Nothing.
1809
1810 =head1 SEE ALSO
1811
1812 L<http://hyperestraier.sourceforge.net/>
1813
1814 Hyper Estraier Ruby interface on which this module is based.
1815
1816 =head1 AUTHOR
1817
1818 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
1819
1820
1821 =head1 COPYRIGHT AND LICENSE
1822
1823 Copyright (C) 2005-2006 by Dobrica Pavlinusic
1824
1825 This library is free software; you can redistribute it and/or modify
1826 it under the GPL v2 or later.
1827
1828 =cut
1829
1830 1;

  ViewVC Help
Powered by ViewVC 1.1.26