/[Search-Estraier]/trunk/Estraier.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/Estraier.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 134 - (show annotations)
Tue May 9 12:21:26 2006 UTC (17 years, 11 months ago) by dpavlin
File size: 33965 byte(s)
added Search::Estraier::Node->master to controll estmaster and beginning of tests for it
1 package Search::Estraier;
2
3 use 5.008;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.06_1';
8
9 =head1 NAME
10
11 Search::Estraier - pure perl module to use Hyper Estraier search engine
12
13 =head1 SYNOPSIS
14
15 =head2 Simple indexer
16
17 use Search::Estraier;
18
19 # create and configure node
20 my $node = new Search::Estraier::Node(
21 url => 'http://localhost:1978/node/test',
22 user => 'admin',
23 passwd => 'admin'
24 );
25
26 # create document
27 my $doc = new Search::Estraier::Document;
28
29 # add attributes
30 $doc->add_attr('@uri', "http://estraier.gov/example.txt");
31 $doc->add_attr('@title', "Over the Rainbow");
32
33 # add body text to document
34 $doc->add_text("Somewhere over the rainbow. Way up high.");
35 $doc->add_text("There's a land that I heard of once in a lullaby.");
36
37 die "error: ", $node->status,"\n" unless (eval { $node->put_doc($doc) });
38
39 =head2 Simple searcher
40
41 use Search::Estraier;
42
43 # create and configure node
44 my $node = new Search::Estraier::Node(
45 url => 'http://localhost:1978/node/test',
46 user => 'admin',
47 passwd => 'admin',
48 croak_on_error => 1,
49 );
50
51 # create condition
52 my $cond = new Search::Estraier::Condition;
53
54 # set search phrase
55 $cond->set_phrase("rainbow AND lullaby");
56
57 my $nres = $node->search($cond, 0);
58
59 if (defined($nres)) {
60 print "Got ", $nres->hits, " results\n";
61
62 # for each document in results
63 for my $i ( 0 ... $nres->doc_num - 1 ) {
64 # get result document
65 my $rdoc = $nres->get_doc($i);
66 # display attribte
67 print "URI: ", $rdoc->attr('@uri'),"\n";
68 print "Title: ", $rdoc->attr('@title'),"\n";
69 print $rdoc->snippet,"\n";
70 }
71 } else {
72 die "error: ", $node->status,"\n";
73 }
74
75 =head1 DESCRIPTION
76
77 This module is implementation of node API of Hyper Estraier. Since it's
78 perl-only module with dependencies only on standard perl modules, it will
79 run on all platforms on which perl runs. It doesn't require compilation
80 or Hyper Estraier development files on target machine.
81
82 It is implemented as multiple packages which closly resamble Ruby
83 implementation. It also includes methods to manage nodes.
84
85 There are few examples in C<scripts> directory of this distribution.
86
87 =cut
88
89 =head1 Inheritable common methods
90
91 This methods should really move somewhere else.
92
93 =head2 _s
94
95 Remove multiple whitespaces from string, as well as whitespaces at beginning or end
96
97 my $text = $self->_s(" this is a text ");
98 $text = 'this is a text';
99
100 =cut
101
102 sub _s {
103 my $text = $_[1];
104 return unless defined($text);
105 $text =~ s/\s\s+/ /gs;
106 $text =~ s/^\s+//;
107 $text =~ s/\s+$//;
108 return $text;
109 }
110
111 package Search::Estraier::Document;
112
113 use Carp qw/croak confess/;
114
115 use Search::Estraier;
116 our @ISA = qw/Search::Estraier/;
117
118 =head1 Search::Estraier::Document
119
120 This class implements Document which is collection of attributes
121 (key=value), vectors (also key value) display text and hidden text.
122
123
124 =head2 new
125
126 Create new document, empty or from draft.
127
128 my $doc = new Search::HyperEstraier::Document;
129 my $doc2 = new Search::HyperEstraier::Document( $draft );
130
131 =cut
132
133 sub new {
134 my $class = shift;
135 my $self = {};
136 bless($self, $class);
137
138 $self->{id} = -1;
139
140 my $draft = shift;
141
142 if ($draft) {
143 my $in_text = 0;
144 foreach my $line (split(/\n/, $draft)) {
145
146 if ($in_text) {
147 if ($line =~ /^\t/) {
148 push @{ $self->{htexts} }, substr($line, 1);
149 } else {
150 push @{ $self->{dtexts} }, $line;
151 }
152 next;
153 }
154
155 if ($line =~ m/^%VECTOR\t(.+)$/) {
156 my @fields = split(/\t/, $1);
157 for my $i ( 0 .. ($#fields - 1) ) {
158 $self->{kwords}->{ $fields[ $i ] } = $fields[ $i + 1 ];
159 $i++;
160 }
161 next;
162 } elsif ($line =~ m/^%/) {
163 # What is this? comment?
164 #warn "$line\n";
165 next;
166 } elsif ($line =~ m/^$/) {
167 $in_text = 1;
168 next;
169 } elsif ($line =~ m/^(.+)=(.*)$/) {
170 $self->{attrs}->{ $1 } = $2;
171 next;
172 }
173
174 warn "draft ignored: '$line'\n";
175 }
176 }
177
178 $self ? return $self : return undef;
179 }
180
181
182 =head2 add_attr
183
184 Add an attribute.
185
186 $doc->add_attr( name => 'value' );
187
188 Delete attribute using
189
190 $doc->add_attr( name => undef );
191
192 =cut
193
194 sub add_attr {
195 my $self = shift;
196 my $attrs = {@_};
197
198 while (my ($name, $value) = each %{ $attrs }) {
199 if (! defined($value)) {
200 delete( $self->{attrs}->{ $self->_s($name) } );
201 } else {
202 $self->{attrs}->{ $self->_s($name) } = $self->_s($value);
203 }
204 }
205
206 return 1;
207 }
208
209
210 =head2 add_text
211
212 Add a sentence of text.
213
214 $doc->add_text('this is example text to display');
215
216 =cut
217
218 sub add_text {
219 my $self = shift;
220 my $text = shift;
221 return unless defined($text);
222
223 push @{ $self->{dtexts} }, $self->_s($text);
224 }
225
226
227 =head2 add_hidden_text
228
229 Add a hidden sentence.
230
231 $doc->add_hidden_text('this is example text just for search');
232
233 =cut
234
235 sub add_hidden_text {
236 my $self = shift;
237 my $text = shift;
238 return unless defined($text);
239
240 push @{ $self->{htexts} }, $self->_s($text);
241 }
242
243
244 =head2 id
245
246 Get the ID number of document. If the object has never been registred, C<-1> is returned.
247
248 print $doc->id;
249
250 =cut
251
252 sub id {
253 my $self = shift;
254 return $self->{id};
255 }
256
257
258 =head2 attr_names
259
260 Returns array with attribute names from document object.
261
262 my @attrs = $doc->attr_names;
263
264 =cut
265
266 sub attr_names {
267 my $self = shift;
268 return unless ($self->{attrs});
269 #croak "attr_names return array, not scalar" if (! wantarray);
270 return sort keys %{ $self->{attrs} };
271 }
272
273
274 =head2 attr
275
276 Returns value of an attribute.
277
278 my $value = $doc->attr( 'attribute' );
279
280 =cut
281
282 sub attr {
283 my $self = shift;
284 my $name = shift;
285 return unless (defined($name) && $self->{attrs});
286 return $self->{attrs}->{ $name };
287 }
288
289
290 =head2 texts
291
292 Returns array with text sentences.
293
294 my @texts = $doc->texts;
295
296 =cut
297
298 sub texts {
299 my $self = shift;
300 #confess "texts return array, not scalar" if (! wantarray);
301 return @{ $self->{dtexts} } if ($self->{dtexts});
302 }
303
304
305 =head2 cat_texts
306
307 Return whole text as single scalar.
308
309 my $text = $doc->cat_texts;
310
311 =cut
312
313 sub cat_texts {
314 my $self = shift;
315 return join(' ',@{ $self->{dtexts} }) if ($self->{dtexts});
316 }
317
318
319 =head2 dump_draft
320
321 Dump draft data from document object.
322
323 print $doc->dump_draft;
324
325 =cut
326
327 sub dump_draft {
328 my $self = shift;
329 my $draft;
330
331 foreach my $attr_name (sort keys %{ $self->{attrs} }) {
332 next unless defined(my $v = $self->{attrs}->{$attr_name});
333 $draft .= $attr_name . '=' . $v . "\n";
334 }
335
336 if ($self->{kwords}) {
337 $draft .= '%%VECTOR';
338 while (my ($key, $value) = each %{ $self->{kwords} }) {
339 $draft .= "\t$key\t$value";
340 }
341 $draft .= "\n";
342 }
343
344 $draft .= "\n";
345
346 $draft .= join("\n", @{ $self->{dtexts} }) . "\n" if ($self->{dtexts});
347 $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n" if ($self->{htexts});
348
349 return $draft;
350 }
351
352
353 =head2 delete
354
355 Empty document object
356
357 $doc->delete;
358
359 This function is addition to original Ruby API, and since it was included in C wrappers it's here as a
360 convinience. Document objects which go out of scope will be destroyed
361 automatically.
362
363 =cut
364
365 sub delete {
366 my $self = shift;
367
368 foreach my $data (qw/attrs dtexts stexts kwords/) {
369 delete($self->{$data});
370 }
371
372 $self->{id} = -1;
373
374 return 1;
375 }
376
377
378
379 package Search::Estraier::Condition;
380
381 use Carp qw/carp confess croak/;
382
383 use Search::Estraier;
384 our @ISA = qw/Search::Estraier/;
385
386 =head1 Search::Estraier::Condition
387
388 =head2 new
389
390 my $cond = new Search::HyperEstraier::Condition;
391
392 =cut
393
394 sub new {
395 my $class = shift;
396 my $self = {};
397 bless($self, $class);
398
399 $self->{max} = -1;
400 $self->{options} = 0;
401
402 $self ? return $self : return undef;
403 }
404
405
406 =head2 set_phrase
407
408 $cond->set_phrase('search phrase');
409
410 =cut
411
412 sub set_phrase {
413 my $self = shift;
414 $self->{phrase} = $self->_s( shift );
415 }
416
417
418 =head2 add_attr
419
420 $cond->add_attr('@URI STRINC /~dpavlin/');
421
422 =cut
423
424 sub add_attr {
425 my $self = shift;
426 my $attr = shift || return;
427 push @{ $self->{attrs} }, $self->_s( $attr );
428 }
429
430
431 =head2 set_order
432
433 $cond->set_order('@mdate NUMD');
434
435 =cut
436
437 sub set_order {
438 my $self = shift;
439 $self->{order} = shift;
440 }
441
442
443 =head2 set_max
444
445 $cond->set_max(42);
446
447 =cut
448
449 sub set_max {
450 my $self = shift;
451 my $max = shift;
452 croak "set_max needs number, not '$max'" unless ($max =~ m/^\d+$/);
453 $self->{max} = $max;
454 }
455
456
457 =head2 set_options
458
459 $cond->set_options( 'SURE' );
460
461 $cond->set_options( qw/AGITO NOIDF SIMPLE/ );
462
463 Possible options are:
464
465 =over 8
466
467 =item SURE
468
469 check every N-gram
470
471 =item USUAL
472
473 check every second N-gram
474
475 =item FAST
476
477 check every third N-gram
478
479 =item AGITO
480
481 check every fourth N-gram
482
483 =item NOIDF
484
485 don't perform TF-IDF tuning
486
487 =item SIMPLE
488
489 use simplified query phrase
490
491 =back
492
493 Skipping N-grams will speed up search, but reduce accuracy. Every call to C<set_options> will reset previous
494 options;
495
496 This option changed in version C<0.04> of this module. It's backwards compatibile.
497
498 =cut
499
500 my $options = {
501 SURE => 1 << 0,
502 USUAL => 1 << 1,
503 FAST => 1 << 2,
504 AGITO => 1 << 3,
505 NOIDF => 1 << 4,
506 SIMPLE => 1 << 10,
507 };
508
509 sub set_options {
510 my $self = shift;
511 my $opt = 0;
512 foreach my $option (@_) {
513 my $mask;
514 unless ($mask = $options->{$option}) {
515 if ($option eq '1') {
516 next;
517 } else {
518 croak "unknown option $option";
519 }
520 }
521 $opt += $mask;
522 }
523 $self->{options} = $opt;
524 }
525
526
527 =head2 phrase
528
529 Return search phrase.
530
531 print $cond->phrase;
532
533 =cut
534
535 sub phrase {
536 my $self = shift;
537 return $self->{phrase};
538 }
539
540
541 =head2 order
542
543 Return search result order.
544
545 print $cond->order;
546
547 =cut
548
549 sub order {
550 my $self = shift;
551 return $self->{order};
552 }
553
554
555 =head2 attrs
556
557 Return search result attrs.
558
559 my @cond_attrs = $cond->attrs;
560
561 =cut
562
563 sub attrs {
564 my $self = shift;
565 #croak "attrs return array, not scalar" if (! wantarray);
566 return @{ $self->{attrs} } if ($self->{attrs});
567 }
568
569
570 =head2 max
571
572 Return maximum number of results.
573
574 print $cond->max;
575
576 C<-1> is returned for unitialized value, C<0> is unlimited.
577
578 =cut
579
580 sub max {
581 my $self = shift;
582 return $self->{max};
583 }
584
585
586 =head2 options
587
588 Return options for this condition.
589
590 print $cond->options;
591
592 Options are returned in numerical form.
593
594 =cut
595
596 sub options {
597 my $self = shift;
598 return $self->{options};
599 }
600
601
602 =head2 set_skip
603
604 Set number of skipped documents from beginning of results
605
606 $cond->set_skip(42);
607
608 Similar to C<offset> in RDBMS.
609
610 =cut
611
612 sub set_skip {
613 my $self = shift;
614 $self->{skip} = shift;
615 }
616
617 =head2 skip
618
619 Return skip for this condition.
620
621 print $cond->skip;
622
623 =cut
624
625 sub skip {
626 my $self = shift;
627 return $self->{skip};
628 }
629
630
631 package Search::Estraier::ResultDocument;
632
633 use Carp qw/croak/;
634
635 #use Search::Estraier;
636 #our @ISA = qw/Search::Estraier/;
637
638 =head1 Search::Estraier::ResultDocument
639
640 =head2 new
641
642 my $rdoc = new Search::HyperEstraier::ResultDocument(
643 uri => 'http://localhost/document/uri/42',
644 attrs => {
645 foo => 1,
646 bar => 2,
647 },
648 snippet => 'this is a text of snippet'
649 keywords => 'this\tare\tkeywords'
650 );
651
652 =cut
653
654 sub new {
655 my $class = shift;
656 my $self = {@_};
657 bless($self, $class);
658
659 croak "missing uri for ResultDocument" unless defined($self->{uri});
660
661 $self ? return $self : return undef;
662 }
663
664
665 =head2 uri
666
667 Return URI of result document
668
669 print $rdoc->uri;
670
671 =cut
672
673 sub uri {
674 my $self = shift;
675 return $self->{uri};
676 }
677
678
679 =head2 attr_names
680
681 Returns array with attribute names from result document object.
682
683 my @attrs = $rdoc->attr_names;
684
685 =cut
686
687 sub attr_names {
688 my $self = shift;
689 croak "attr_names return array, not scalar" if (! wantarray);
690 return sort keys %{ $self->{attrs} };
691 }
692
693
694 =head2 attr
695
696 Returns value of an attribute.
697
698 my $value = $rdoc->attr( 'attribute' );
699
700 =cut
701
702 sub attr {
703 my $self = shift;
704 my $name = shift || return;
705 return $self->{attrs}->{ $name };
706 }
707
708
709 =head2 snippet
710
711 Return snippet from result document
712
713 print $rdoc->snippet;
714
715 =cut
716
717 sub snippet {
718 my $self = shift;
719 return $self->{snippet};
720 }
721
722
723 =head2 keywords
724
725 Return keywords from result document
726
727 print $rdoc->keywords;
728
729 =cut
730
731 sub keywords {
732 my $self = shift;
733 return $self->{keywords};
734 }
735
736
737 package Search::Estraier::NodeResult;
738
739 use Carp qw/croak/;
740
741 #use Search::Estraier;
742 #our @ISA = qw/Search::Estraier/;
743
744 =head1 Search::Estraier::NodeResult
745
746 =head2 new
747
748 my $res = new Search::HyperEstraier::NodeResult(
749 docs => @array_of_rdocs,
750 hits => %hash_with_hints,
751 );
752
753 =cut
754
755 sub new {
756 my $class = shift;
757 my $self = {@_};
758 bless($self, $class);
759
760 foreach my $f (qw/docs hints/) {
761 croak "missing $f for ResultDocument" unless defined($self->{$f});
762 }
763
764 $self ? return $self : return undef;
765 }
766
767
768 =head2 doc_num
769
770 Return number of documents
771
772 print $res->doc_num;
773
774 This will return real number of documents (limited by C<max>).
775 If you want to get total number of hits, see C<hits>.
776
777 =cut
778
779 sub doc_num {
780 my $self = shift;
781 return $#{$self->{docs}} + 1;
782 }
783
784
785 =head2 get_doc
786
787 Return single document
788
789 my $doc = $res->get_doc( 42 );
790
791 Returns undef if document doesn't exist.
792
793 =cut
794
795 sub get_doc {
796 my $self = shift;
797 my $num = shift;
798 croak "expect number as argument, not '$num'" unless ($num =~ m/^\d+$/);
799 return undef if ($num < 0 || $num > $self->{docs});
800 return $self->{docs}->[$num];
801 }
802
803
804 =head2 hint
805
806 Return specific hint from results.
807
808 print $res->hint( 'VERSION' );
809
810 Possible hints are: C<VERSION>, C<NODE>, C<HIT>, C<HINT#n>, C<DOCNUM>, C<WORDNUM>,
811 C<TIME>, C<LINK#n>, C<VIEW>.
812
813 =cut
814
815 sub hint {
816 my $self = shift;
817 my $key = shift || return;
818 return $self->{hints}->{$key};
819 }
820
821 =head2 hints
822
823 More perlish version of C<hint>. This one returns hash.
824
825 my %hints = $res->hints;
826
827 =cut
828
829 sub hints {
830 my $self = shift;
831 return $self->{hints};
832 }
833
834 =head2 hits
835
836 Syntaxtic sugar for total number of hits for this query
837
838 print $res->hits;
839
840 It's same as
841
842 print $res->hint('HIT');
843
844 but shorter.
845
846 =cut
847
848 sub hits {
849 my $self = shift;
850 return $self->{hints}->{'HIT'} || 0;
851 }
852
853 package Search::Estraier::Node;
854
855 use Carp qw/carp croak confess/;
856 use URI;
857 use MIME::Base64;
858 use IO::Socket::INET;
859 use URI::Escape qw/uri_escape/;
860
861 =head1 Search::Estraier::Node
862
863 =head2 new
864
865 my $node = new Search::HyperEstraier::Node;
866
867 or optionally with C<url> as parametar
868
869 my $node = new Search::HyperEstraier::Node( 'http://localhost:1978/node/test' );
870
871 or in more verbose form
872
873 my $node = new Search::HyperEstraier::Node(
874 url => 'http://localhost:1978/node/test',
875 user => 'admin',
876 passwd => 'admin'
877 debug => 1,
878 croak_on_error => 1
879 );
880
881 with following arguments:
882
883 =over 4
884
885 =item url
886
887 URL to node
888
889 =item user
890
891 specify username for node server authentication
892
893 =item passwd
894
895 password for authentication
896
897 =item debug
898
899 dumps a B<lot> of debugging output
900
901 =item croak_on_error
902
903 very helpful during development. It will croak on all errors instead of
904 silently returning C<-1> (which is convention of Hyper Estraier API in other
905 languages).
906
907 =back
908
909 =cut
910
911 sub new {
912 my $class = shift;
913 my $self = {
914 pxport => -1,
915 timeout => 0, # this used to be -1
916 wwidth => 480,
917 hwidth => 96,
918 awidth => 96,
919 status => -1,
920 };
921
922 bless($self, $class);
923
924 if ($#_ == 0) {
925 $self->{url} = shift;
926 } else {
927 %$self = ( %$self, @_ );
928
929 $self->set_auth( $self->{user}, $self->{passwd} ) if ($self->{user});
930
931 warn "## Node debug on\n" if ($self->{debug});
932 }
933
934 $self->{inform} = {
935 dnum => -1,
936 wnum => -1,
937 size => -1.0,
938 };
939
940 $self ? return $self : return undef;
941 }
942
943
944 =head2 set_url
945
946 Specify URL to node server
947
948 $node->set_url('http://localhost:1978');
949
950 =cut
951
952 sub set_url {
953 my $self = shift;
954 $self->{url} = shift;
955 }
956
957
958 =head2 set_proxy
959
960 Specify proxy server to connect to node server
961
962 $node->set_proxy('proxy.example.com', 8080);
963
964 =cut
965
966 sub set_proxy {
967 my $self = shift;
968 my ($host,$port) = @_;
969 croak "proxy port must be number, not '$port'" unless ($port =~ m/^\d+$/);
970 $self->{pxhost} = $host;
971 $self->{pxport} = $port;
972 }
973
974
975 =head2 set_timeout
976
977 Specify timeout of connection in seconds
978
979 $node->set_timeout( 15 );
980
981 =cut
982
983 sub set_timeout {
984 my $self = shift;
985 my $sec = shift;
986 croak "timeout must be number, not '$sec'" unless ($sec =~ m/^\d+$/);
987 $self->{timeout} = $sec;
988 }
989
990
991 =head2 set_auth
992
993 Specify name and password for authentication to node server.
994
995 $node->set_auth('clint','eastwood');
996
997 =cut
998
999 sub set_auth {
1000 my $self = shift;
1001 my ($login,$passwd) = @_;
1002 my $basic_auth = encode_base64( "$login:$passwd" );
1003 chomp($basic_auth);
1004 $self->{auth} = $basic_auth;
1005 }
1006
1007
1008 =head2 status
1009
1010 Return status code of last request.
1011
1012 print $node->status;
1013
1014 C<-1> means connection failure.
1015
1016 =cut
1017
1018 sub status {
1019 my $self = shift;
1020 return $self->{status};
1021 }
1022
1023
1024 =head2 put_doc
1025
1026 Add a document
1027
1028 $node->put_doc( $document_draft ) or die "can't add document";
1029
1030 Return true on success or false on failture.
1031
1032 =cut
1033
1034 sub put_doc {
1035 my $self = shift;
1036 my $doc = shift || return;
1037 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
1038 $self->shuttle_url( $self->{url} . '/put_doc',
1039 'text/x-estraier-draft',
1040 $doc->dump_draft,
1041 undef
1042 ) == 200;
1043 }
1044
1045
1046 =head2 out_doc
1047
1048 Remove a document
1049
1050 $node->out_doc( document_id ) or "can't remove document";
1051
1052 Return true on success or false on failture.
1053
1054 =cut
1055
1056 sub out_doc {
1057 my $self = shift;
1058 my $id = shift || return;
1059 return unless ($self->{url});
1060 croak "id must be number, not '$id'" unless ($id =~ m/^\d+$/);
1061 $self->shuttle_url( $self->{url} . '/out_doc',
1062 'application/x-www-form-urlencoded',
1063 "id=$id",
1064 undef
1065 ) == 200;
1066 }
1067
1068
1069 =head2 out_doc_by_uri
1070
1071 Remove a registrated document using it's uri
1072
1073 $node->out_doc_by_uri( 'file:///document/uri/42' ) or "can't remove document";
1074
1075 Return true on success or false on failture.
1076
1077 =cut
1078
1079 sub out_doc_by_uri {
1080 my $self = shift;
1081 my $uri = shift || return;
1082 return unless ($self->{url});
1083 $self->shuttle_url( $self->{url} . '/out_doc',
1084 'application/x-www-form-urlencoded',
1085 "uri=" . uri_escape($uri),
1086 undef
1087 ) == 200;
1088 }
1089
1090
1091 =head2 edit_doc
1092
1093 Edit attributes of a document
1094
1095 $node->edit_doc( $document_draft ) or die "can't edit document";
1096
1097 Return true on success or false on failture.
1098
1099 =cut
1100
1101 sub edit_doc {
1102 my $self = shift;
1103 my $doc = shift || return;
1104 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
1105 $self->shuttle_url( $self->{url} . '/edit_doc',
1106 'text/x-estraier-draft',
1107 $doc->dump_draft,
1108 undef
1109 ) == 200;
1110 }
1111
1112
1113 =head2 get_doc
1114
1115 Retreive document
1116
1117 my $doc = $node->get_doc( document_id ) or die "can't get document";
1118
1119 Return true on success or false on failture.
1120
1121 =cut
1122
1123 sub get_doc {
1124 my $self = shift;
1125 my $id = shift || return;
1126 return $self->_fetch_doc( id => $id );
1127 }
1128
1129
1130 =head2 get_doc_by_uri
1131
1132 Retreive document
1133
1134 my $doc = $node->get_doc_by_uri( 'file:///document/uri/42' ) or die "can't get document";
1135
1136 Return true on success or false on failture.
1137
1138 =cut
1139
1140 sub get_doc_by_uri {
1141 my $self = shift;
1142 my $uri = shift || return;
1143 return $self->_fetch_doc( uri => $uri );
1144 }
1145
1146
1147 =head2 get_doc_attr
1148
1149 Retrieve the value of an atribute from object
1150
1151 my $val = $node->get_doc_attr( document_id, 'attribute_name' ) or
1152 die "can't get document attribute";
1153
1154 =cut
1155
1156 sub get_doc_attr {
1157 my $self = shift;
1158 my ($id,$name) = @_;
1159 return unless ($id && $name);
1160 return $self->_fetch_doc( id => $id, attr => $name );
1161 }
1162
1163
1164 =head2 get_doc_attr_by_uri
1165
1166 Retrieve the value of an atribute from object
1167
1168 my $val = $node->get_doc_attr_by_uri( document_id, 'attribute_name' ) or
1169 die "can't get document attribute";
1170
1171 =cut
1172
1173 sub get_doc_attr_by_uri {
1174 my $self = shift;
1175 my ($uri,$name) = @_;
1176 return unless ($uri && $name);
1177 return $self->_fetch_doc( uri => $uri, attr => $name );
1178 }
1179
1180
1181 =head2 etch_doc
1182
1183 Exctract document keywords
1184
1185 my $keywords = $node->etch_doc( document_id ) or die "can't etch document";
1186
1187 =cut
1188
1189 sub etch_doc {
1190 my $self = shift;
1191 my $id = shift || return;
1192 return $self->_fetch_doc( id => $id, etch => 1 );
1193 }
1194
1195 =head2 etch_doc_by_uri
1196
1197 Retreive document
1198
1199 my $keywords = $node->etch_doc_by_uri( 'file:///document/uri/42' ) or die "can't etch document";
1200
1201 Return true on success or false on failture.
1202
1203 =cut
1204
1205 sub etch_doc_by_uri {
1206 my $self = shift;
1207 my $uri = shift || return;
1208 return $self->_fetch_doc( uri => $uri, etch => 1 );
1209 }
1210
1211
1212 =head2 uri_to_id
1213
1214 Get ID of document specified by URI
1215
1216 my $id = $node->uri_to_id( 'file:///document/uri/42' );
1217
1218 This method won't croak, even if using C<croak_on_error>.
1219
1220 =cut
1221
1222 sub uri_to_id {
1223 my $self = shift;
1224 my $uri = shift || return;
1225 return $self->_fetch_doc( uri => $uri, path => '/uri_to_id', chomp_resbody => 1, croak_on_error => 0 );
1226 }
1227
1228
1229 =head2 _fetch_doc
1230
1231 Private function used for implementing of C<get_doc>, C<get_doc_by_uri>,
1232 C<etch_doc>, C<etch_doc_by_uri>.
1233
1234 # this will decode received draft into Search::Estraier::Document object
1235 my $doc = $node->_fetch_doc( id => 42 );
1236 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42' );
1237
1238 # to extract keywords, add etch
1239 my $doc = $node->_fetch_doc( id => 42, etch => 1 );
1240 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', etch => 1 );
1241
1242 # to get document attrubute add attr
1243 my $doc = $node->_fetch_doc( id => 42, attr => '@mdate' );
1244 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', attr => '@mdate' );
1245
1246 # more general form which allows implementation of
1247 # uri_to_id
1248 my $id = $node->_fetch_doc(
1249 uri => 'file:///document/uri/42',
1250 path => '/uri_to_id',
1251 chomp_resbody => 1
1252 );
1253
1254 =cut
1255
1256 sub _fetch_doc {
1257 my $self = shift;
1258 my $a = {@_};
1259 return unless ( ($a->{id} || $a->{uri}) && $self->{url} );
1260
1261 my ($arg, $resbody);
1262
1263 my $path = $a->{path} || '/get_doc';
1264 $path = '/etch_doc' if ($a->{etch});
1265
1266 if ($a->{id}) {
1267 croak "id must be numberm not '$a->{id}'" unless ($a->{id} =~ m/^\d+$/);
1268 $arg = 'id=' . $a->{id};
1269 } elsif ($a->{uri}) {
1270 $arg = 'uri=' . uri_escape($a->{uri});
1271 } else {
1272 confess "unhandled argument. Need id or uri.";
1273 }
1274
1275 if ($a->{attr}) {
1276 $path = '/get_doc_attr';
1277 $arg .= '&attr=' . uri_escape($a->{attr});
1278 $a->{chomp_resbody} = 1;
1279 }
1280
1281 my $rv = $self->shuttle_url( $self->{url} . $path,
1282 'application/x-www-form-urlencoded',
1283 $arg,
1284 \$resbody,
1285 $a->{croak_on_error},
1286 );
1287
1288 return if ($rv != 200);
1289
1290 if ($a->{etch}) {
1291 $self->{kwords} = {};
1292 return +{} unless ($resbody);
1293 foreach my $l (split(/\n/, $resbody)) {
1294 my ($k,$v) = split(/\t/, $l, 2);
1295 $self->{kwords}->{$k} = $v if ($v);
1296 }
1297 return $self->{kwords};
1298 } elsif ($a->{chomp_resbody}) {
1299 return unless (defined($resbody));
1300 chomp($resbody);
1301 return $resbody;
1302 } else {
1303 return new Search::Estraier::Document($resbody);
1304 }
1305 }
1306
1307
1308 =head2 name
1309
1310 my $node_name = $node->name;
1311
1312 =cut
1313
1314 sub name {
1315 my $self = shift;
1316 $self->_set_info unless ($self->{inform}->{name});
1317 return $self->{inform}->{name};
1318 }
1319
1320
1321 =head2 label
1322
1323 my $node_label = $node->label;
1324
1325 =cut
1326
1327 sub label {
1328 my $self = shift;
1329 $self->_set_info unless ($self->{inform}->{label});
1330 return $self->{inform}->{label};
1331 }
1332
1333
1334 =head2 doc_num
1335
1336 my $documents_in_node = $node->doc_num;
1337
1338 =cut
1339
1340 sub doc_num {
1341 my $self = shift;
1342 $self->_set_info if ($self->{inform}->{dnum} < 0);
1343 return $self->{inform}->{dnum};
1344 }
1345
1346
1347 =head2 word_num
1348
1349 my $words_in_node = $node->word_num;
1350
1351 =cut
1352
1353 sub word_num {
1354 my $self = shift;
1355 $self->_set_info if ($self->{inform}->{wnum} < 0);
1356 return $self->{inform}->{wnum};
1357 }
1358
1359
1360 =head2 size
1361
1362 my $node_size = $node->size;
1363
1364 =cut
1365
1366 sub size {
1367 my $self = shift;
1368 $self->_set_info if ($self->{inform}->{size} < 0);
1369 return $self->{inform}->{size};
1370 }
1371
1372
1373 =head2 search
1374
1375 Search documents which match condition
1376
1377 my $nres = $node->search( $cond, $depth );
1378
1379 C<$cond> is C<Search::Estraier::Condition> object, while <$depth> specifies
1380 depth for meta search.
1381
1382 Function results C<Search::Estraier::NodeResult> object.
1383
1384 =cut
1385
1386 sub search {
1387 my $self = shift;
1388 my ($cond, $depth) = @_;
1389 return unless ($cond && defined($depth) && $self->{url});
1390 croak "cond mush be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1391 croak "depth needs number, not '$depth'" unless ($depth =~ m/^\d+$/);
1392
1393 my $resbody;
1394
1395 my $rv = $self->shuttle_url( $self->{url} . '/search',
1396 'application/x-www-form-urlencoded',
1397 $self->cond_to_query( $cond, $depth ),
1398 \$resbody,
1399 );
1400 return if ($rv != 200);
1401
1402 my @records = split /--------\[.*?\]--------(?::END)?\r?\n/, $resbody;
1403 my $hintsText = splice @records, 0, 2; # starts with empty record
1404 my $hints = { $hintsText =~ m/^(.*?)\t(.*?)$/gsm };
1405
1406 # process records
1407 my $docs = [];
1408 foreach my $record (@records)
1409 {
1410 # split into keys and snippets
1411 my ($keys, $snippet) = $record =~ m/^(.*?)\n\n(.*?)$/s;
1412
1413 # create document hash
1414 my $doc = { $keys =~ m/^(.*?)=(.*?)$/gsm };
1415 $doc->{'@keywords'} = $doc->{keywords};
1416 ($doc->{keywords}) = $keys =~ m/^%VECTOR\t(.*?)$/gm;
1417 $doc->{snippet} = $snippet;
1418
1419 push @$docs, new Search::Estraier::ResultDocument(
1420 attrs => $doc,
1421 uri => $doc->{'@uri'},
1422 snippet => $snippet,
1423 keywords => $doc->{'keywords'},
1424 );
1425 }
1426
1427 return new Search::Estraier::NodeResult( docs => $docs, hints => $hints );
1428 }
1429
1430
1431 =head2 cond_to_query
1432
1433 Return URI encoded string generated from Search::Estraier::Condition
1434
1435 my $args = $node->cond_to_query( $cond, $depth );
1436
1437 =cut
1438
1439 sub cond_to_query {
1440 my $self = shift;
1441
1442 my $cond = shift || return;
1443 croak "condition must be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1444 my $depth = shift;
1445
1446 my @args;
1447
1448 if (my $phrase = $cond->phrase) {
1449 push @args, 'phrase=' . uri_escape($phrase);
1450 }
1451
1452 if (my @attrs = $cond->attrs) {
1453 for my $i ( 0 .. $#attrs ) {
1454 push @args,'attr' . ($i+1) . '=' . uri_escape( $attrs[$i] ) if ($attrs[$i]);
1455 }
1456 }
1457
1458 if (my $order = $cond->order) {
1459 push @args, 'order=' . uri_escape($order);
1460 }
1461
1462 if (my $max = $cond->max) {
1463 push @args, 'max=' . $max;
1464 } else {
1465 push @args, 'max=' . (1 << 30);
1466 }
1467
1468 if (my $options = $cond->options) {
1469 push @args, 'options=' . $options;
1470 }
1471
1472 push @args, 'depth=' . $depth if ($depth);
1473 push @args, 'wwidth=' . $self->{wwidth};
1474 push @args, 'hwidth=' . $self->{hwidth};
1475 push @args, 'awidth=' . $self->{awidth};
1476 push @args, 'skip=' . $self->{skip} if ($self->{skip});
1477
1478 return join('&', @args);
1479 }
1480
1481
1482 =head2 shuttle_url
1483
1484 This is method which uses C<LWP::UserAgent> to communicate with Hyper Estraier node
1485 master.
1486
1487 my $rv = shuttle_url( $url, $content_type, $req_body, \$resbody );
1488
1489 C<$resheads> and C<$resbody> booleans controll if response headers and/or response
1490 body will be saved within object.
1491
1492 =cut
1493
1494 use LWP::UserAgent;
1495
1496 sub shuttle_url {
1497 my $self = shift;
1498
1499 my ($url, $content_type, $reqbody, $resbody, $croak_on_error) = @_;
1500
1501 $croak_on_error = $self->{croak_on_error} unless defined($croak_on_error);
1502
1503 $self->{status} = -1;
1504
1505 warn "## $url\n" if ($self->{debug});
1506
1507 $url = new URI($url);
1508 if (
1509 !$url || !$url->scheme || !$url->scheme eq 'http' ||
1510 !$url->host || !$url->port || $url->port < 1
1511 ) {
1512 carp "can't parse $url\n";
1513 return -1;
1514 }
1515
1516 my $ua = LWP::UserAgent->new;
1517 $ua->agent( "Search-Estraier/$Search::Estraier::VERSION" );
1518
1519 my $req;
1520 if ($reqbody) {
1521 $req = HTTP::Request->new(POST => $url);
1522 } else {
1523 $req = HTTP::Request->new(GET => $url);
1524 }
1525
1526 $req->headers->header( 'Host' => $url->host . ":" . $url->port );
1527 $req->headers->header( 'Connection', 'close' );
1528 $req->headers->header( 'Authorization', 'Basic ' . $self->{auth} ) if ($self->{auth});
1529 $req->content_type( $content_type );
1530
1531 warn $req->headers->as_string,"\n" if ($self->{debug});
1532
1533 if ($reqbody) {
1534 warn "$reqbody\n" if ($self->{debug});
1535 $req->content( $reqbody );
1536 }
1537
1538 my $res = $ua->request($req) || croak "can't make request to $url: $!";
1539
1540 warn "## response status: ",$res->status_line,"\n" if ($self->{debug});
1541
1542 ($self->{status}, $self->{status_message}) = split(/\s+/, $res->status_line, 2);
1543
1544 if (! $res->is_success) {
1545 if ($croak_on_error) {
1546 croak("can't get $url: ",$res->status_line);
1547 } else {
1548 return -1;
1549 }
1550 }
1551
1552 $$resbody .= $res->content;
1553
1554 warn "## response body:\n$$resbody\n" if ($resbody && $self->{debug});
1555
1556 return $self->{status};
1557 }
1558
1559
1560 =head2 set_snippet_width
1561
1562 Set width of snippets in results
1563
1564 $node->set_snippet_width( $wwidth, $hwidth, $awidth );
1565
1566 C<$wwidth> specifies whole width of snippet. It's C<480> by default. If it's C<0> snippet
1567 is not sent with results. If it is negative, whole document text is sent instead of snippet.
1568
1569 C<$hwidth> specified width of strings from beginning of string. Default
1570 value is C<96>. Negative or zero value keep previous value.
1571
1572 C<$awidth> specifies width of strings around each highlighted word. It's C<96> by default.
1573 If negative of zero value is provided previous value is kept unchanged.
1574
1575 =cut
1576
1577 sub set_snippet_width {
1578 my $self = shift;
1579
1580 my ($wwidth, $hwidth, $awidth) = @_;
1581 $self->{wwidth} = $wwidth;
1582 $self->{hwidth} = $hwidth if ($hwidth >= 0);
1583 $self->{awidth} = $awidth if ($awidth >= 0);
1584 }
1585
1586
1587 =head2 set_user
1588
1589 Manage users of node
1590
1591 $node->set_user( 'name', $mode );
1592
1593 C<$mode> can be one of:
1594
1595 =over 4
1596
1597 =item 0
1598
1599 delete account
1600
1601 =item 1
1602
1603 set administrative right for user
1604
1605 =item 2
1606
1607 set user account as guest
1608
1609 =back
1610
1611 Return true on success, otherwise false.
1612
1613 =cut
1614
1615 sub set_user {
1616 my $self = shift;
1617 my ($name, $mode) = @_;
1618
1619 return unless ($self->{url});
1620 croak "mode must be number, not '$mode'" unless ($mode =~ m/^\d+$/);
1621
1622 $self->shuttle_url( $self->{url} . '/_set_user',
1623 'text/plain',
1624 'name=' . uri_escape($name) . '&mode=' . $mode,
1625 undef
1626 ) == 200;
1627 }
1628
1629
1630 =head2 set_link
1631
1632 Manage node links
1633
1634 $node->set_link('http://localhost:1978/node/another', 'another node label', $credit);
1635
1636 If C<$credit> is negative, link is removed.
1637
1638 =cut
1639
1640 sub set_link {
1641 my $self = shift;
1642 my ($url, $label, $credit) = @_;
1643
1644 return unless ($self->{url});
1645 croak "mode credit be number, not '$credit'" unless ($credit =~ m/^\d+$/);
1646
1647 my $reqbody = 'url=' . uri_escape($url) . '&label=' . uri_escape($label);
1648 $reqbody .= '&credit=' . $credit if ($credit > 0);
1649
1650 if ($self->shuttle_url( $self->{url} . '/_set_link',
1651 'application/x-www-form-urlencoded',
1652 $reqbody,
1653 undef
1654 ) == 200) {
1655 # refresh node info after adding link
1656 $self->_set_info;
1657 return 1;
1658 }
1659 }
1660
1661 =head2 admins
1662
1663 my @admins = @{ $node->admins };
1664
1665 Return array of users with admin rights on node
1666
1667 =cut
1668
1669 sub admins {
1670 my $self = shift;
1671 $self->_set_info unless ($self->{inform}->{name});
1672 return $self->{inform}->{admins};
1673 }
1674
1675 =head2 guests
1676
1677 my @guests = @{ $node->guests };
1678
1679 Return array of users with guest rights on node
1680
1681 =cut
1682
1683 sub guests {
1684 my $self = shift;
1685 $self->_set_info unless ($self->{inform}->{name});
1686 return $self->{inform}->{guests};
1687 }
1688
1689 =head2 links
1690
1691 my $links = @{ $node->links };
1692
1693 Return array of links for this node
1694
1695 =cut
1696
1697 sub links {
1698 my $self = shift;
1699 $self->_set_info unless ($self->{inform}->{name});
1700 return $self->{inform}->{links};
1701 }
1702
1703 =head2 master
1704
1705 Set actions on Hyper Estraier node master (C<estmaster> process)
1706
1707 $node->master(
1708 action => 'sync'
1709 );
1710
1711 All available actions are documented in
1712 L<http://hyperestraier.sourceforge.net/nguide-en.html#protocol>
1713
1714 =cut
1715
1716 my $estmaster_rest = {
1717 shutdown => {
1718 status => 202,
1719 },
1720 sync => {
1721 status => 202,
1722 },
1723 backup => {
1724 status => 202,
1725 },
1726 userlist => {
1727 status => 200,
1728 returns => qw/name passwd flags fname misc/,
1729 },
1730 useradd => {
1731 required => qw/name passwd flags/,
1732 optional => qw/fname misc/,
1733 status => 200,
1734 },
1735 userdel => {
1736 required => qw/name/,
1737 status => 200,
1738 },
1739 nodelist => {
1740 status => 200,
1741 returns => qw/name label doc_num word_num size/,
1742 },
1743 nodeadd => {
1744 required => qw/name/,
1745 optional => qw/label/,
1746 status => 200,
1747 },
1748 nodedel => {
1749 required => qw/name/,
1750 status => 200,
1751 },
1752 nodeclr => {
1753 required => qw/name/,
1754 status => 200,
1755 },
1756 nodertt => {
1757 status => 200,
1758 },
1759 };
1760
1761 sub master {
1762 my $self = shift;
1763
1764 my $args = {@_};
1765
1766 # have action?
1767 my $action = $args->{action} || croak "need action, available: ",
1768 join(", ",keys %{ $estmaster_rest });
1769
1770 # check if action is valid
1771 my $rest = $estmaster_rest->{$action};
1772 croak "action '$action' is not supported, available actions: ",
1773 join(", ",keys %{ $estmaster_rest }) unless ($rest);
1774
1775 croak "BUG: action '$action' needs return status" unless ($rest->{status});
1776
1777 my @args;
1778
1779 if ($rest->{required} || $rest->{optional}) {
1780
1781 map {
1782 croak "need parametar '$_' for action '$action'" unless ($args->{$_});
1783 push @args, $_ . '=' . uri_escape( $args->{$_} );
1784 } ( keys %{ $rest->{required} } );
1785
1786 map {
1787 push @args, $_ . '=' . uri_escape( $args->{$_} ) if ($args->{$_});
1788 } ( keys %{ $rest->{optional} } );
1789
1790 }
1791
1792 my $uri = new URI( $self->{url} );
1793
1794 my $resbody;
1795
1796 if ($self->shuttle_url(
1797 'http://' . $uri->host_port . '/master?action=' . $action ,
1798 'application/x-www-form-urlencoded',
1799 join('&', @args),
1800 \$resbody,
1801 1,
1802 ) == $rest->{status}) {
1803 return 0E0 unless ($rest->{returns});
1804
1805 if (wantarray) {
1806
1807 my @results;
1808
1809 foreach my $line ( split(/[\r\n]/,$resbody) ) {
1810 my @e = split(/\t/, $line);
1811 my $row;
1812 map { $row->{$_} = shift @e; } @{ $rest->{returns} };
1813 push @results, $row;
1814 }
1815
1816 return @results;
1817 } else {
1818
1819 carp "calling master action '$action', but not expecting array back, returning whole body";
1820 return $resbody;
1821 }
1822 }
1823 }
1824
1825 =head1 PRIVATE METHODS
1826
1827 You could call those directly, but you don't have to. I hope.
1828
1829 =head2 _set_info
1830
1831 Set information for node
1832
1833 $node->_set_info;
1834
1835 =cut
1836
1837 sub _set_info {
1838 my $self = shift;
1839
1840 $self->{status} = -1;
1841 return unless ($self->{url});
1842
1843 my $resbody;
1844 my $rv = $self->shuttle_url( $self->{url} . '/inform',
1845 'text/plain',
1846 undef,
1847 \$resbody,
1848 );
1849
1850 return if ($rv != 200 || !$resbody);
1851
1852 my @lines = split(/[\r\n]/,$resbody);
1853
1854 $self->{inform} = {};
1855
1856 ( $self->{inform}->{name}, $self->{inform}->{label}, $self->{inform}->{dnum},
1857 $self->{inform}->{wnum}, $self->{inform}->{size} ) = split(/\t/, shift @lines, 5);
1858
1859 return $resbody unless (@lines);
1860
1861 shift @lines;
1862
1863 while(my $admin = shift @lines) {
1864 push @{$self->{inform}->{admins}}, $admin;
1865 }
1866
1867 while(my $guest = shift @lines) {
1868 push @{$self->{inform}->{guests}}, $guest;
1869 }
1870
1871 while(my $link = shift @lines) {
1872 push @{$self->{inform}->{links}}, $link;
1873 }
1874
1875 return $resbody;
1876
1877 }
1878
1879 ###
1880
1881 =head1 EXPORT
1882
1883 Nothing.
1884
1885 =head1 SEE ALSO
1886
1887 L<http://hyperestraier.sourceforge.net/>
1888
1889 Hyper Estraier Ruby interface on which this module is based.
1890
1891 =head1 AUTHOR
1892
1893 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
1894
1895 Robert Klep E<lt>robert@klep.nameE<gt> contributed refactored search code
1896
1897 =head1 COPYRIGHT AND LICENSE
1898
1899 Copyright (C) 2005-2006 by Dobrica Pavlinusic
1900
1901 This library is free software; you can redistribute it and/or modify
1902 it under the GPL v2 or later.
1903
1904 =cut
1905
1906 1;

  ViewVC Help
Powered by ViewVC 1.1.26