/[Search-Estraier]/trunk/Estraier.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/Estraier.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 140 - (show annotations)
Wed May 10 14:08:34 2006 UTC (17 years, 10 months ago) by dpavlin
File size: 34740 byte(s)
fix interaction of create and croak_on_error, added tests for it
1 package Search::Estraier;
2
3 use 5.008;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.06_1';
8
9 =head1 NAME
10
11 Search::Estraier - pure perl module to use Hyper Estraier search engine
12
13 =head1 SYNOPSIS
14
15 =head2 Simple indexer
16
17 use Search::Estraier;
18
19 # create and configure node
20 my $node = new Search::Estraier::Node(
21 url => 'http://localhost:1978/node/test',
22 user => 'admin',
23 passwd => 'admin',
24 create => 1,
25 label => 'Label for node',
26 croak_on_error => 1,
27 );
28
29 # create document
30 my $doc = new Search::Estraier::Document;
31
32 # add attributes
33 $doc->add_attr('@uri', "http://estraier.gov/example.txt");
34 $doc->add_attr('@title', "Over the Rainbow");
35
36 # add body text to document
37 $doc->add_text("Somewhere over the rainbow. Way up high.");
38 $doc->add_text("There's a land that I heard of once in a lullaby.");
39
40 die "error: ", $node->status,"\n" unless (eval { $node->put_doc($doc) });
41
42 =head2 Simple searcher
43
44 use Search::Estraier;
45
46 # create and configure node
47 my $node = new Search::Estraier::Node(
48 url => 'http://localhost:1978/node/test',
49 user => 'admin',
50 passwd => 'admin',
51 croak_on_error => 1,
52 );
53
54 # create condition
55 my $cond = new Search::Estraier::Condition;
56
57 # set search phrase
58 $cond->set_phrase("rainbow AND lullaby");
59
60 my $nres = $node->search($cond, 0);
61
62 if (defined($nres)) {
63 print "Got ", $nres->hits, " results\n";
64
65 # for each document in results
66 for my $i ( 0 ... $nres->doc_num - 1 ) {
67 # get result document
68 my $rdoc = $nres->get_doc($i);
69 # display attribte
70 print "URI: ", $rdoc->attr('@uri'),"\n";
71 print "Title: ", $rdoc->attr('@title'),"\n";
72 print $rdoc->snippet,"\n";
73 }
74 } else {
75 die "error: ", $node->status,"\n";
76 }
77
78 =head1 DESCRIPTION
79
80 This module is implementation of node API of Hyper Estraier. Since it's
81 perl-only module with dependencies only on standard perl modules, it will
82 run on all platforms on which perl runs. It doesn't require compilation
83 or Hyper Estraier development files on target machine.
84
85 It is implemented as multiple packages which closly resamble Ruby
86 implementation. It also includes methods to manage nodes.
87
88 There are few examples in C<scripts> directory of this distribution.
89
90 =cut
91
92 =head1 Inheritable common methods
93
94 This methods should really move somewhere else.
95
96 =head2 _s
97
98 Remove multiple whitespaces from string, as well as whitespaces at beginning or end
99
100 my $text = $self->_s(" this is a text ");
101 $text = 'this is a text';
102
103 =cut
104
105 sub _s {
106 my $text = $_[1];
107 return unless defined($text);
108 $text =~ s/\s\s+/ /gs;
109 $text =~ s/^\s+//;
110 $text =~ s/\s+$//;
111 return $text;
112 }
113
114 package Search::Estraier::Document;
115
116 use Carp qw/croak confess/;
117
118 use Search::Estraier;
119 our @ISA = qw/Search::Estraier/;
120
121 =head1 Search::Estraier::Document
122
123 This class implements Document which is collection of attributes
124 (key=value), vectors (also key value) display text and hidden text.
125
126
127 =head2 new
128
129 Create new document, empty or from draft.
130
131 my $doc = new Search::HyperEstraier::Document;
132 my $doc2 = new Search::HyperEstraier::Document( $draft );
133
134 =cut
135
136 sub new {
137 my $class = shift;
138 my $self = {};
139 bless($self, $class);
140
141 $self->{id} = -1;
142
143 my $draft = shift;
144
145 if ($draft) {
146 my $in_text = 0;
147 foreach my $line (split(/\n/, $draft)) {
148
149 if ($in_text) {
150 if ($line =~ /^\t/) {
151 push @{ $self->{htexts} }, substr($line, 1);
152 } else {
153 push @{ $self->{dtexts} }, $line;
154 }
155 next;
156 }
157
158 if ($line =~ m/^%VECTOR\t(.+)$/) {
159 my @fields = split(/\t/, $1);
160 for my $i ( 0 .. ($#fields - 1) ) {
161 $self->{kwords}->{ $fields[ $i ] } = $fields[ $i + 1 ];
162 $i++;
163 }
164 next;
165 } elsif ($line =~ m/^%/) {
166 # What is this? comment?
167 #warn "$line\n";
168 next;
169 } elsif ($line =~ m/^$/) {
170 $in_text = 1;
171 next;
172 } elsif ($line =~ m/^(.+)=(.*)$/) {
173 $self->{attrs}->{ $1 } = $2;
174 next;
175 }
176
177 warn "draft ignored: '$line'\n";
178 }
179 }
180
181 $self ? return $self : return undef;
182 }
183
184
185 =head2 add_attr
186
187 Add an attribute.
188
189 $doc->add_attr( name => 'value' );
190
191 Delete attribute using
192
193 $doc->add_attr( name => undef );
194
195 =cut
196
197 sub add_attr {
198 my $self = shift;
199 my $attrs = {@_};
200
201 while (my ($name, $value) = each %{ $attrs }) {
202 if (! defined($value)) {
203 delete( $self->{attrs}->{ $self->_s($name) } );
204 } else {
205 $self->{attrs}->{ $self->_s($name) } = $self->_s($value);
206 }
207 }
208
209 return 1;
210 }
211
212
213 =head2 add_text
214
215 Add a sentence of text.
216
217 $doc->add_text('this is example text to display');
218
219 =cut
220
221 sub add_text {
222 my $self = shift;
223 my $text = shift;
224 return unless defined($text);
225
226 push @{ $self->{dtexts} }, $self->_s($text);
227 }
228
229
230 =head2 add_hidden_text
231
232 Add a hidden sentence.
233
234 $doc->add_hidden_text('this is example text just for search');
235
236 =cut
237
238 sub add_hidden_text {
239 my $self = shift;
240 my $text = shift;
241 return unless defined($text);
242
243 push @{ $self->{htexts} }, $self->_s($text);
244 }
245
246
247 =head2 id
248
249 Get the ID number of document. If the object has never been registred, C<-1> is returned.
250
251 print $doc->id;
252
253 =cut
254
255 sub id {
256 my $self = shift;
257 return $self->{id};
258 }
259
260
261 =head2 attr_names
262
263 Returns array with attribute names from document object.
264
265 my @attrs = $doc->attr_names;
266
267 =cut
268
269 sub attr_names {
270 my $self = shift;
271 return unless ($self->{attrs});
272 #croak "attr_names return array, not scalar" if (! wantarray);
273 return sort keys %{ $self->{attrs} };
274 }
275
276
277 =head2 attr
278
279 Returns value of an attribute.
280
281 my $value = $doc->attr( 'attribute' );
282
283 =cut
284
285 sub attr {
286 my $self = shift;
287 my $name = shift;
288 return unless (defined($name) && $self->{attrs});
289 return $self->{attrs}->{ $name };
290 }
291
292
293 =head2 texts
294
295 Returns array with text sentences.
296
297 my @texts = $doc->texts;
298
299 =cut
300
301 sub texts {
302 my $self = shift;
303 #confess "texts return array, not scalar" if (! wantarray);
304 return @{ $self->{dtexts} } if ($self->{dtexts});
305 }
306
307
308 =head2 cat_texts
309
310 Return whole text as single scalar.
311
312 my $text = $doc->cat_texts;
313
314 =cut
315
316 sub cat_texts {
317 my $self = shift;
318 return join(' ',@{ $self->{dtexts} }) if ($self->{dtexts});
319 }
320
321
322 =head2 dump_draft
323
324 Dump draft data from document object.
325
326 print $doc->dump_draft;
327
328 =cut
329
330 sub dump_draft {
331 my $self = shift;
332 my $draft;
333
334 foreach my $attr_name (sort keys %{ $self->{attrs} }) {
335 next unless defined(my $v = $self->{attrs}->{$attr_name});
336 $draft .= $attr_name . '=' . $v . "\n";
337 }
338
339 if ($self->{kwords}) {
340 $draft .= '%%VECTOR';
341 while (my ($key, $value) = each %{ $self->{kwords} }) {
342 $draft .= "\t$key\t$value";
343 }
344 $draft .= "\n";
345 }
346
347 $draft .= "\n";
348
349 $draft .= join("\n", @{ $self->{dtexts} }) . "\n" if ($self->{dtexts});
350 $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n" if ($self->{htexts});
351
352 return $draft;
353 }
354
355
356 =head2 delete
357
358 Empty document object
359
360 $doc->delete;
361
362 This function is addition to original Ruby API, and since it was included in C wrappers it's here as a
363 convinience. Document objects which go out of scope will be destroyed
364 automatically.
365
366 =cut
367
368 sub delete {
369 my $self = shift;
370
371 foreach my $data (qw/attrs dtexts stexts kwords/) {
372 delete($self->{$data});
373 }
374
375 $self->{id} = -1;
376
377 return 1;
378 }
379
380
381
382 package Search::Estraier::Condition;
383
384 use Carp qw/carp confess croak/;
385
386 use Search::Estraier;
387 our @ISA = qw/Search::Estraier/;
388
389 =head1 Search::Estraier::Condition
390
391 =head2 new
392
393 my $cond = new Search::HyperEstraier::Condition;
394
395 =cut
396
397 sub new {
398 my $class = shift;
399 my $self = {};
400 bless($self, $class);
401
402 $self->{max} = -1;
403 $self->{options} = 0;
404
405 $self ? return $self : return undef;
406 }
407
408
409 =head2 set_phrase
410
411 $cond->set_phrase('search phrase');
412
413 =cut
414
415 sub set_phrase {
416 my $self = shift;
417 $self->{phrase} = $self->_s( shift );
418 }
419
420
421 =head2 add_attr
422
423 $cond->add_attr('@URI STRINC /~dpavlin/');
424
425 =cut
426
427 sub add_attr {
428 my $self = shift;
429 my $attr = shift || return;
430 push @{ $self->{attrs} }, $self->_s( $attr );
431 }
432
433
434 =head2 set_order
435
436 $cond->set_order('@mdate NUMD');
437
438 =cut
439
440 sub set_order {
441 my $self = shift;
442 $self->{order} = shift;
443 }
444
445
446 =head2 set_max
447
448 $cond->set_max(42);
449
450 =cut
451
452 sub set_max {
453 my $self = shift;
454 my $max = shift;
455 croak "set_max needs number, not '$max'" unless ($max =~ m/^\d+$/);
456 $self->{max} = $max;
457 }
458
459
460 =head2 set_options
461
462 $cond->set_options( 'SURE' );
463
464 $cond->set_options( qw/AGITO NOIDF SIMPLE/ );
465
466 Possible options are:
467
468 =over 8
469
470 =item SURE
471
472 check every N-gram
473
474 =item USUAL
475
476 check every second N-gram
477
478 =item FAST
479
480 check every third N-gram
481
482 =item AGITO
483
484 check every fourth N-gram
485
486 =item NOIDF
487
488 don't perform TF-IDF tuning
489
490 =item SIMPLE
491
492 use simplified query phrase
493
494 =back
495
496 Skipping N-grams will speed up search, but reduce accuracy. Every call to C<set_options> will reset previous
497 options;
498
499 This option changed in version C<0.04> of this module. It's backwards compatibile.
500
501 =cut
502
503 my $options = {
504 SURE => 1 << 0,
505 USUAL => 1 << 1,
506 FAST => 1 << 2,
507 AGITO => 1 << 3,
508 NOIDF => 1 << 4,
509 SIMPLE => 1 << 10,
510 };
511
512 sub set_options {
513 my $self = shift;
514 my $opt = 0;
515 foreach my $option (@_) {
516 my $mask;
517 unless ($mask = $options->{$option}) {
518 if ($option eq '1') {
519 next;
520 } else {
521 croak "unknown option $option";
522 }
523 }
524 $opt += $mask;
525 }
526 $self->{options} = $opt;
527 }
528
529
530 =head2 phrase
531
532 Return search phrase.
533
534 print $cond->phrase;
535
536 =cut
537
538 sub phrase {
539 my $self = shift;
540 return $self->{phrase};
541 }
542
543
544 =head2 order
545
546 Return search result order.
547
548 print $cond->order;
549
550 =cut
551
552 sub order {
553 my $self = shift;
554 return $self->{order};
555 }
556
557
558 =head2 attrs
559
560 Return search result attrs.
561
562 my @cond_attrs = $cond->attrs;
563
564 =cut
565
566 sub attrs {
567 my $self = shift;
568 #croak "attrs return array, not scalar" if (! wantarray);
569 return @{ $self->{attrs} } if ($self->{attrs});
570 }
571
572
573 =head2 max
574
575 Return maximum number of results.
576
577 print $cond->max;
578
579 C<-1> is returned for unitialized value, C<0> is unlimited.
580
581 =cut
582
583 sub max {
584 my $self = shift;
585 return $self->{max};
586 }
587
588
589 =head2 options
590
591 Return options for this condition.
592
593 print $cond->options;
594
595 Options are returned in numerical form.
596
597 =cut
598
599 sub options {
600 my $self = shift;
601 return $self->{options};
602 }
603
604
605 =head2 set_skip
606
607 Set number of skipped documents from beginning of results
608
609 $cond->set_skip(42);
610
611 Similar to C<offset> in RDBMS.
612
613 =cut
614
615 sub set_skip {
616 my $self = shift;
617 $self->{skip} = shift;
618 }
619
620 =head2 skip
621
622 Return skip for this condition.
623
624 print $cond->skip;
625
626 =cut
627
628 sub skip {
629 my $self = shift;
630 return $self->{skip};
631 }
632
633
634 package Search::Estraier::ResultDocument;
635
636 use Carp qw/croak/;
637
638 #use Search::Estraier;
639 #our @ISA = qw/Search::Estraier/;
640
641 =head1 Search::Estraier::ResultDocument
642
643 =head2 new
644
645 my $rdoc = new Search::HyperEstraier::ResultDocument(
646 uri => 'http://localhost/document/uri/42',
647 attrs => {
648 foo => 1,
649 bar => 2,
650 },
651 snippet => 'this is a text of snippet'
652 keywords => 'this\tare\tkeywords'
653 );
654
655 =cut
656
657 sub new {
658 my $class = shift;
659 my $self = {@_};
660 bless($self, $class);
661
662 croak "missing uri for ResultDocument" unless defined($self->{uri});
663
664 $self ? return $self : return undef;
665 }
666
667
668 =head2 uri
669
670 Return URI of result document
671
672 print $rdoc->uri;
673
674 =cut
675
676 sub uri {
677 my $self = shift;
678 return $self->{uri};
679 }
680
681
682 =head2 attr_names
683
684 Returns array with attribute names from result document object.
685
686 my @attrs = $rdoc->attr_names;
687
688 =cut
689
690 sub attr_names {
691 my $self = shift;
692 croak "attr_names return array, not scalar" if (! wantarray);
693 return sort keys %{ $self->{attrs} };
694 }
695
696
697 =head2 attr
698
699 Returns value of an attribute.
700
701 my $value = $rdoc->attr( 'attribute' );
702
703 =cut
704
705 sub attr {
706 my $self = shift;
707 my $name = shift || return;
708 return $self->{attrs}->{ $name };
709 }
710
711
712 =head2 snippet
713
714 Return snippet from result document
715
716 print $rdoc->snippet;
717
718 =cut
719
720 sub snippet {
721 my $self = shift;
722 return $self->{snippet};
723 }
724
725
726 =head2 keywords
727
728 Return keywords from result document
729
730 print $rdoc->keywords;
731
732 =cut
733
734 sub keywords {
735 my $self = shift;
736 return $self->{keywords};
737 }
738
739
740 package Search::Estraier::NodeResult;
741
742 use Carp qw/croak/;
743
744 #use Search::Estraier;
745 #our @ISA = qw/Search::Estraier/;
746
747 =head1 Search::Estraier::NodeResult
748
749 =head2 new
750
751 my $res = new Search::HyperEstraier::NodeResult(
752 docs => @array_of_rdocs,
753 hits => %hash_with_hints,
754 );
755
756 =cut
757
758 sub new {
759 my $class = shift;
760 my $self = {@_};
761 bless($self, $class);
762
763 foreach my $f (qw/docs hints/) {
764 croak "missing $f for ResultDocument" unless defined($self->{$f});
765 }
766
767 $self ? return $self : return undef;
768 }
769
770
771 =head2 doc_num
772
773 Return number of documents
774
775 print $res->doc_num;
776
777 This will return real number of documents (limited by C<max>).
778 If you want to get total number of hits, see C<hits>.
779
780 =cut
781
782 sub doc_num {
783 my $self = shift;
784 return $#{$self->{docs}} + 1;
785 }
786
787
788 =head2 get_doc
789
790 Return single document
791
792 my $doc = $res->get_doc( 42 );
793
794 Returns undef if document doesn't exist.
795
796 =cut
797
798 sub get_doc {
799 my $self = shift;
800 my $num = shift;
801 croak "expect number as argument, not '$num'" unless ($num =~ m/^\d+$/);
802 return undef if ($num < 0 || $num > $self->{docs});
803 return $self->{docs}->[$num];
804 }
805
806
807 =head2 hint
808
809 Return specific hint from results.
810
811 print $res->hint( 'VERSION' );
812
813 Possible hints are: C<VERSION>, C<NODE>, C<HIT>, C<HINT#n>, C<DOCNUM>, C<WORDNUM>,
814 C<TIME>, C<LINK#n>, C<VIEW>.
815
816 =cut
817
818 sub hint {
819 my $self = shift;
820 my $key = shift || return;
821 return $self->{hints}->{$key};
822 }
823
824 =head2 hints
825
826 More perlish version of C<hint>. This one returns hash.
827
828 my %hints = $res->hints;
829
830 =cut
831
832 sub hints {
833 my $self = shift;
834 return $self->{hints};
835 }
836
837 =head2 hits
838
839 Syntaxtic sugar for total number of hits for this query
840
841 print $res->hits;
842
843 It's same as
844
845 print $res->hint('HIT');
846
847 but shorter.
848
849 =cut
850
851 sub hits {
852 my $self = shift;
853 return $self->{hints}->{'HIT'} || 0;
854 }
855
856 package Search::Estraier::Node;
857
858 use Carp qw/carp croak confess/;
859 use URI;
860 use MIME::Base64;
861 use IO::Socket::INET;
862 use URI::Escape qw/uri_escape/;
863
864 =head1 Search::Estraier::Node
865
866 =head2 new
867
868 my $node = new Search::HyperEstraier::Node;
869
870 or optionally with C<url> as parametar
871
872 my $node = new Search::HyperEstraier::Node( 'http://localhost:1978/node/test' );
873
874 or in more verbose form
875
876 my $node = new Search::HyperEstraier::Node(
877 url => 'http://localhost:1978/node/test',
878 user => 'admin',
879 passwd => 'admin'
880 create => 1,
881 label => 'optional node label',
882 debug => 1,
883 croak_on_error => 1
884 );
885
886 with following arguments:
887
888 =over 4
889
890 =item url
891
892 URL to node
893
894 =item user
895
896 specify username for node server authentication
897
898 =item passwd
899
900 password for authentication
901
902 =item create
903
904 create node if it doesn't exists
905
906 =item label
907
908 optional label for new node if C<create> is used
909
910 =item debug
911
912 dumps a B<lot> of debugging output
913
914 =item croak_on_error
915
916 very helpful during development. It will croak on all errors instead of
917 silently returning C<-1> (which is convention of Hyper Estraier API in other
918 languages).
919
920 =back
921
922 =cut
923
924 sub new {
925 my $class = shift;
926 my $self = {
927 pxport => -1,
928 timeout => 0, # this used to be -1
929 wwidth => 480,
930 hwidth => 96,
931 awidth => 96,
932 status => -1,
933 };
934
935 bless($self, $class);
936
937 if ($#_ == 0) {
938 $self->{url} = shift;
939 } else {
940 %$self = ( %$self, @_ );
941
942 $self->set_auth( $self->{user}, $self->{passwd} ) if ($self->{user});
943
944 warn "## Node debug on\n" if ($self->{debug});
945 }
946
947 $self->{inform} = {
948 dnum => -1,
949 wnum => -1,
950 size => -1.0,
951 };
952
953 if ($self->{create}) {
954 if (! eval { $self->name } || $@) {
955 my $name = $1 if ($self->{url} =~ m#/node/([^/]+)/*#);
956 croak "can't find node name in '$self->{url}'" unless ($name);
957 my $label = $self->{label} || $name;
958 $self->master(
959 action => 'nodeadd',
960 name => $name,
961 label => $label,
962 ) || croak "can't create node $name ($label)";
963 }
964 }
965
966 $self ? return $self : return undef;
967 }
968
969
970 =head2 set_url
971
972 Specify URL to node server
973
974 $node->set_url('http://localhost:1978');
975
976 =cut
977
978 sub set_url {
979 my $self = shift;
980 $self->{url} = shift;
981 }
982
983
984 =head2 set_proxy
985
986 Specify proxy server to connect to node server
987
988 $node->set_proxy('proxy.example.com', 8080);
989
990 =cut
991
992 sub set_proxy {
993 my $self = shift;
994 my ($host,$port) = @_;
995 croak "proxy port must be number, not '$port'" unless ($port =~ m/^\d+$/);
996 $self->{pxhost} = $host;
997 $self->{pxport} = $port;
998 }
999
1000
1001 =head2 set_timeout
1002
1003 Specify timeout of connection in seconds
1004
1005 $node->set_timeout( 15 );
1006
1007 =cut
1008
1009 sub set_timeout {
1010 my $self = shift;
1011 my $sec = shift;
1012 croak "timeout must be number, not '$sec'" unless ($sec =~ m/^\d+$/);
1013 $self->{timeout} = $sec;
1014 }
1015
1016
1017 =head2 set_auth
1018
1019 Specify name and password for authentication to node server.
1020
1021 $node->set_auth('clint','eastwood');
1022
1023 =cut
1024
1025 sub set_auth {
1026 my $self = shift;
1027 my ($login,$passwd) = @_;
1028 my $basic_auth = encode_base64( "$login:$passwd" );
1029 chomp($basic_auth);
1030 $self->{auth} = $basic_auth;
1031 }
1032
1033
1034 =head2 status
1035
1036 Return status code of last request.
1037
1038 print $node->status;
1039
1040 C<-1> means connection failure.
1041
1042 =cut
1043
1044 sub status {
1045 my $self = shift;
1046 return $self->{status};
1047 }
1048
1049
1050 =head2 put_doc
1051
1052 Add a document
1053
1054 $node->put_doc( $document_draft ) or die "can't add document";
1055
1056 Return true on success or false on failture.
1057
1058 =cut
1059
1060 sub put_doc {
1061 my $self = shift;
1062 my $doc = shift || return;
1063 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
1064 $self->shuttle_url( $self->{url} . '/put_doc',
1065 'text/x-estraier-draft',
1066 $doc->dump_draft,
1067 undef
1068 ) == 200;
1069 }
1070
1071
1072 =head2 out_doc
1073
1074 Remove a document
1075
1076 $node->out_doc( document_id ) or "can't remove document";
1077
1078 Return true on success or false on failture.
1079
1080 =cut
1081
1082 sub out_doc {
1083 my $self = shift;
1084 my $id = shift || return;
1085 return unless ($self->{url});
1086 croak "id must be number, not '$id'" unless ($id =~ m/^\d+$/);
1087 $self->shuttle_url( $self->{url} . '/out_doc',
1088 'application/x-www-form-urlencoded',
1089 "id=$id",
1090 undef
1091 ) == 200;
1092 }
1093
1094
1095 =head2 out_doc_by_uri
1096
1097 Remove a registrated document using it's uri
1098
1099 $node->out_doc_by_uri( 'file:///document/uri/42' ) or "can't remove document";
1100
1101 Return true on success or false on failture.
1102
1103 =cut
1104
1105 sub out_doc_by_uri {
1106 my $self = shift;
1107 my $uri = shift || return;
1108 return unless ($self->{url});
1109 $self->shuttle_url( $self->{url} . '/out_doc',
1110 'application/x-www-form-urlencoded',
1111 "uri=" . uri_escape($uri),
1112 undef
1113 ) == 200;
1114 }
1115
1116
1117 =head2 edit_doc
1118
1119 Edit attributes of a document
1120
1121 $node->edit_doc( $document_draft ) or die "can't edit document";
1122
1123 Return true on success or false on failture.
1124
1125 =cut
1126
1127 sub edit_doc {
1128 my $self = shift;
1129 my $doc = shift || return;
1130 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
1131 $self->shuttle_url( $self->{url} . '/edit_doc',
1132 'text/x-estraier-draft',
1133 $doc->dump_draft,
1134 undef
1135 ) == 200;
1136 }
1137
1138
1139 =head2 get_doc
1140
1141 Retreive document
1142
1143 my $doc = $node->get_doc( document_id ) or die "can't get document";
1144
1145 Return true on success or false on failture.
1146
1147 =cut
1148
1149 sub get_doc {
1150 my $self = shift;
1151 my $id = shift || return;
1152 return $self->_fetch_doc( id => $id );
1153 }
1154
1155
1156 =head2 get_doc_by_uri
1157
1158 Retreive document
1159
1160 my $doc = $node->get_doc_by_uri( 'file:///document/uri/42' ) or die "can't get document";
1161
1162 Return true on success or false on failture.
1163
1164 =cut
1165
1166 sub get_doc_by_uri {
1167 my $self = shift;
1168 my $uri = shift || return;
1169 return $self->_fetch_doc( uri => $uri );
1170 }
1171
1172
1173 =head2 get_doc_attr
1174
1175 Retrieve the value of an atribute from object
1176
1177 my $val = $node->get_doc_attr( document_id, 'attribute_name' ) or
1178 die "can't get document attribute";
1179
1180 =cut
1181
1182 sub get_doc_attr {
1183 my $self = shift;
1184 my ($id,$name) = @_;
1185 return unless ($id && $name);
1186 return $self->_fetch_doc( id => $id, attr => $name );
1187 }
1188
1189
1190 =head2 get_doc_attr_by_uri
1191
1192 Retrieve the value of an atribute from object
1193
1194 my $val = $node->get_doc_attr_by_uri( document_id, 'attribute_name' ) or
1195 die "can't get document attribute";
1196
1197 =cut
1198
1199 sub get_doc_attr_by_uri {
1200 my $self = shift;
1201 my ($uri,$name) = @_;
1202 return unless ($uri && $name);
1203 return $self->_fetch_doc( uri => $uri, attr => $name );
1204 }
1205
1206
1207 =head2 etch_doc
1208
1209 Exctract document keywords
1210
1211 my $keywords = $node->etch_doc( document_id ) or die "can't etch document";
1212
1213 =cut
1214
1215 sub etch_doc {
1216 my $self = shift;
1217 my $id = shift || return;
1218 return $self->_fetch_doc( id => $id, etch => 1 );
1219 }
1220
1221 =head2 etch_doc_by_uri
1222
1223 Retreive document
1224
1225 my $keywords = $node->etch_doc_by_uri( 'file:///document/uri/42' ) or die "can't etch document";
1226
1227 Return true on success or false on failture.
1228
1229 =cut
1230
1231 sub etch_doc_by_uri {
1232 my $self = shift;
1233 my $uri = shift || return;
1234 return $self->_fetch_doc( uri => $uri, etch => 1 );
1235 }
1236
1237
1238 =head2 uri_to_id
1239
1240 Get ID of document specified by URI
1241
1242 my $id = $node->uri_to_id( 'file:///document/uri/42' );
1243
1244 This method won't croak, even if using C<croak_on_error>.
1245
1246 =cut
1247
1248 sub uri_to_id {
1249 my $self = shift;
1250 my $uri = shift || return;
1251 return $self->_fetch_doc( uri => $uri, path => '/uri_to_id', chomp_resbody => 1, croak_on_error => 0 );
1252 }
1253
1254
1255 =head2 _fetch_doc
1256
1257 Private function used for implementing of C<get_doc>, C<get_doc_by_uri>,
1258 C<etch_doc>, C<etch_doc_by_uri>.
1259
1260 # this will decode received draft into Search::Estraier::Document object
1261 my $doc = $node->_fetch_doc( id => 42 );
1262 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42' );
1263
1264 # to extract keywords, add etch
1265 my $doc = $node->_fetch_doc( id => 42, etch => 1 );
1266 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', etch => 1 );
1267
1268 # to get document attrubute add attr
1269 my $doc = $node->_fetch_doc( id => 42, attr => '@mdate' );
1270 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', attr => '@mdate' );
1271
1272 # more general form which allows implementation of
1273 # uri_to_id
1274 my $id = $node->_fetch_doc(
1275 uri => 'file:///document/uri/42',
1276 path => '/uri_to_id',
1277 chomp_resbody => 1
1278 );
1279
1280 =cut
1281
1282 sub _fetch_doc {
1283 my $self = shift;
1284 my $a = {@_};
1285 return unless ( ($a->{id} || $a->{uri}) && $self->{url} );
1286
1287 my ($arg, $resbody);
1288
1289 my $path = $a->{path} || '/get_doc';
1290 $path = '/etch_doc' if ($a->{etch});
1291
1292 if ($a->{id}) {
1293 croak "id must be numberm not '$a->{id}'" unless ($a->{id} =~ m/^\d+$/);
1294 $arg = 'id=' . $a->{id};
1295 } elsif ($a->{uri}) {
1296 $arg = 'uri=' . uri_escape($a->{uri});
1297 } else {
1298 confess "unhandled argument. Need id or uri.";
1299 }
1300
1301 if ($a->{attr}) {
1302 $path = '/get_doc_attr';
1303 $arg .= '&attr=' . uri_escape($a->{attr});
1304 $a->{chomp_resbody} = 1;
1305 }
1306
1307 my $rv = $self->shuttle_url( $self->{url} . $path,
1308 'application/x-www-form-urlencoded',
1309 $arg,
1310 \$resbody,
1311 $a->{croak_on_error},
1312 );
1313
1314 return if ($rv != 200);
1315
1316 if ($a->{etch}) {
1317 $self->{kwords} = {};
1318 return +{} unless ($resbody);
1319 foreach my $l (split(/\n/, $resbody)) {
1320 my ($k,$v) = split(/\t/, $l, 2);
1321 $self->{kwords}->{$k} = $v if ($v);
1322 }
1323 return $self->{kwords};
1324 } elsif ($a->{chomp_resbody}) {
1325 return unless (defined($resbody));
1326 chomp($resbody);
1327 return $resbody;
1328 } else {
1329 return new Search::Estraier::Document($resbody);
1330 }
1331 }
1332
1333
1334 =head2 name
1335
1336 my $node_name = $node->name;
1337
1338 =cut
1339
1340 sub name {
1341 my $self = shift;
1342 $self->_set_info unless ($self->{inform}->{name});
1343 return $self->{inform}->{name};
1344 }
1345
1346
1347 =head2 label
1348
1349 my $node_label = $node->label;
1350
1351 =cut
1352
1353 sub label {
1354 my $self = shift;
1355 $self->_set_info unless ($self->{inform}->{label});
1356 return $self->{inform}->{label};
1357 }
1358
1359
1360 =head2 doc_num
1361
1362 my $documents_in_node = $node->doc_num;
1363
1364 =cut
1365
1366 sub doc_num {
1367 my $self = shift;
1368 $self->_set_info if ($self->{inform}->{dnum} < 0);
1369 return $self->{inform}->{dnum};
1370 }
1371
1372
1373 =head2 word_num
1374
1375 my $words_in_node = $node->word_num;
1376
1377 =cut
1378
1379 sub word_num {
1380 my $self = shift;
1381 $self->_set_info if ($self->{inform}->{wnum} < 0);
1382 return $self->{inform}->{wnum};
1383 }
1384
1385
1386 =head2 size
1387
1388 my $node_size = $node->size;
1389
1390 =cut
1391
1392 sub size {
1393 my $self = shift;
1394 $self->_set_info if ($self->{inform}->{size} < 0);
1395 return $self->{inform}->{size};
1396 }
1397
1398
1399 =head2 search
1400
1401 Search documents which match condition
1402
1403 my $nres = $node->search( $cond, $depth );
1404
1405 C<$cond> is C<Search::Estraier::Condition> object, while <$depth> specifies
1406 depth for meta search.
1407
1408 Function results C<Search::Estraier::NodeResult> object.
1409
1410 =cut
1411
1412 sub search {
1413 my $self = shift;
1414 my ($cond, $depth) = @_;
1415 return unless ($cond && defined($depth) && $self->{url});
1416 croak "cond mush be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1417 croak "depth needs number, not '$depth'" unless ($depth =~ m/^\d+$/);
1418
1419 my $resbody;
1420
1421 my $rv = $self->shuttle_url( $self->{url} . '/search',
1422 'application/x-www-form-urlencoded',
1423 $self->cond_to_query( $cond, $depth ),
1424 \$resbody,
1425 );
1426 return if ($rv != 200);
1427
1428 my @records = split /--------\[.*?\]--------(?::END)?\r?\n/, $resbody;
1429 my $hintsText = splice @records, 0, 2; # starts with empty record
1430 my $hints = { $hintsText =~ m/^(.*?)\t(.*?)$/gsm };
1431
1432 # process records
1433 my $docs = [];
1434 foreach my $record (@records)
1435 {
1436 # split into keys and snippets
1437 my ($keys, $snippet) = $record =~ m/^(.*?)\n\n(.*?)$/s;
1438
1439 # create document hash
1440 my $doc = { $keys =~ m/^(.*?)=(.*?)$/gsm };
1441 $doc->{'@keywords'} = $doc->{keywords};
1442 ($doc->{keywords}) = $keys =~ m/^%VECTOR\t(.*?)$/gm;
1443 $doc->{snippet} = $snippet;
1444
1445 push @$docs, new Search::Estraier::ResultDocument(
1446 attrs => $doc,
1447 uri => $doc->{'@uri'},
1448 snippet => $snippet,
1449 keywords => $doc->{'keywords'},
1450 );
1451 }
1452
1453 return new Search::Estraier::NodeResult( docs => $docs, hints => $hints );
1454 }
1455
1456
1457 =head2 cond_to_query
1458
1459 Return URI encoded string generated from Search::Estraier::Condition
1460
1461 my $args = $node->cond_to_query( $cond, $depth );
1462
1463 =cut
1464
1465 sub cond_to_query {
1466 my $self = shift;
1467
1468 my $cond = shift || return;
1469 croak "condition must be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1470 my $depth = shift;
1471
1472 my @args;
1473
1474 if (my $phrase = $cond->phrase) {
1475 push @args, 'phrase=' . uri_escape($phrase);
1476 }
1477
1478 if (my @attrs = $cond->attrs) {
1479 for my $i ( 0 .. $#attrs ) {
1480 push @args,'attr' . ($i+1) . '=' . uri_escape( $attrs[$i] ) if ($attrs[$i]);
1481 }
1482 }
1483
1484 if (my $order = $cond->order) {
1485 push @args, 'order=' . uri_escape($order);
1486 }
1487
1488 if (my $max = $cond->max) {
1489 push @args, 'max=' . $max;
1490 } else {
1491 push @args, 'max=' . (1 << 30);
1492 }
1493
1494 if (my $options = $cond->options) {
1495 push @args, 'options=' . $options;
1496 }
1497
1498 push @args, 'depth=' . $depth if ($depth);
1499 push @args, 'wwidth=' . $self->{wwidth};
1500 push @args, 'hwidth=' . $self->{hwidth};
1501 push @args, 'awidth=' . $self->{awidth};
1502 push @args, 'skip=' . $self->{skip} if ($self->{skip});
1503
1504 return join('&', @args);
1505 }
1506
1507
1508 =head2 shuttle_url
1509
1510 This is method which uses C<LWP::UserAgent> to communicate with Hyper Estraier node
1511 master.
1512
1513 my $rv = shuttle_url( $url, $content_type, $req_body, \$resbody );
1514
1515 C<$resheads> and C<$resbody> booleans controll if response headers and/or response
1516 body will be saved within object.
1517
1518 =cut
1519
1520 use LWP::UserAgent;
1521
1522 sub shuttle_url {
1523 my $self = shift;
1524
1525 my ($url, $content_type, $reqbody, $resbody, $croak_on_error) = @_;
1526
1527 $croak_on_error = $self->{croak_on_error} unless defined($croak_on_error);
1528
1529 $self->{status} = -1;
1530
1531 warn "## $url\n" if ($self->{debug});
1532
1533 $url = new URI($url);
1534 if (
1535 !$url || !$url->scheme || !$url->scheme eq 'http' ||
1536 !$url->host || !$url->port || $url->port < 1
1537 ) {
1538 carp "can't parse $url\n";
1539 return -1;
1540 }
1541
1542 my $ua = LWP::UserAgent->new;
1543 $ua->agent( "Search-Estraier/$Search::Estraier::VERSION" );
1544
1545 my $req;
1546 if ($reqbody) {
1547 $req = HTTP::Request->new(POST => $url);
1548 } else {
1549 $req = HTTP::Request->new(GET => $url);
1550 }
1551
1552 $req->headers->header( 'Host' => $url->host . ":" . $url->port );
1553 $req->headers->header( 'Connection', 'close' );
1554 $req->headers->header( 'Authorization', 'Basic ' . $self->{auth} ) if ($self->{auth});
1555 $req->content_type( $content_type );
1556
1557 warn $req->headers->as_string,"\n" if ($self->{debug});
1558
1559 if ($reqbody) {
1560 warn "$reqbody\n" if ($self->{debug});
1561 $req->content( $reqbody );
1562 }
1563
1564 my $res = $ua->request($req) || croak "can't make request to $url: $!";
1565
1566 warn "## response status: ",$res->status_line,"\n" if ($self->{debug});
1567
1568 ($self->{status}, $self->{status_message}) = split(/\s+/, $res->status_line, 2);
1569
1570 if (! $res->is_success) {
1571 if ($croak_on_error) {
1572 croak("can't get $url: ",$res->status_line);
1573 } else {
1574 return -1;
1575 }
1576 }
1577
1578 $$resbody .= $res->content;
1579
1580 warn "## response body:\n$$resbody\n" if ($resbody && $self->{debug});
1581
1582 return $self->{status};
1583 }
1584
1585
1586 =head2 set_snippet_width
1587
1588 Set width of snippets in results
1589
1590 $node->set_snippet_width( $wwidth, $hwidth, $awidth );
1591
1592 C<$wwidth> specifies whole width of snippet. It's C<480> by default. If it's C<0> snippet
1593 is not sent with results. If it is negative, whole document text is sent instead of snippet.
1594
1595 C<$hwidth> specified width of strings from beginning of string. Default
1596 value is C<96>. Negative or zero value keep previous value.
1597
1598 C<$awidth> specifies width of strings around each highlighted word. It's C<96> by default.
1599 If negative of zero value is provided previous value is kept unchanged.
1600
1601 =cut
1602
1603 sub set_snippet_width {
1604 my $self = shift;
1605
1606 my ($wwidth, $hwidth, $awidth) = @_;
1607 $self->{wwidth} = $wwidth;
1608 $self->{hwidth} = $hwidth if ($hwidth >= 0);
1609 $self->{awidth} = $awidth if ($awidth >= 0);
1610 }
1611
1612
1613 =head2 set_user
1614
1615 Manage users of node
1616
1617 $node->set_user( 'name', $mode );
1618
1619 C<$mode> can be one of:
1620
1621 =over 4
1622
1623 =item 0
1624
1625 delete account
1626
1627 =item 1
1628
1629 set administrative right for user
1630
1631 =item 2
1632
1633 set user account as guest
1634
1635 =back
1636
1637 Return true on success, otherwise false.
1638
1639 =cut
1640
1641 sub set_user {
1642 my $self = shift;
1643 my ($name, $mode) = @_;
1644
1645 return unless ($self->{url});
1646 croak "mode must be number, not '$mode'" unless ($mode =~ m/^\d+$/);
1647
1648 $self->shuttle_url( $self->{url} . '/_set_user',
1649 'text/plain',
1650 'name=' . uri_escape($name) . '&mode=' . $mode,
1651 undef
1652 ) == 200;
1653 }
1654
1655
1656 =head2 set_link
1657
1658 Manage node links
1659
1660 $node->set_link('http://localhost:1978/node/another', 'another node label', $credit);
1661
1662 If C<$credit> is negative, link is removed.
1663
1664 =cut
1665
1666 sub set_link {
1667 my $self = shift;
1668 my ($url, $label, $credit) = @_;
1669
1670 return unless ($self->{url});
1671 croak "mode credit be number, not '$credit'" unless ($credit =~ m/^\d+$/);
1672
1673 my $reqbody = 'url=' . uri_escape($url) . '&label=' . uri_escape($label);
1674 $reqbody .= '&credit=' . $credit if ($credit > 0);
1675
1676 if ($self->shuttle_url( $self->{url} . '/_set_link',
1677 'application/x-www-form-urlencoded',
1678 $reqbody,
1679 undef
1680 ) == 200) {
1681 # refresh node info after adding link
1682 $self->_set_info;
1683 return 1;
1684 }
1685 }
1686
1687 =head2 admins
1688
1689 my @admins = @{ $node->admins };
1690
1691 Return array of users with admin rights on node
1692
1693 =cut
1694
1695 sub admins {
1696 my $self = shift;
1697 $self->_set_info unless ($self->{inform}->{name});
1698 return $self->{inform}->{admins};
1699 }
1700
1701 =head2 guests
1702
1703 my @guests = @{ $node->guests };
1704
1705 Return array of users with guest rights on node
1706
1707 =cut
1708
1709 sub guests {
1710 my $self = shift;
1711 $self->_set_info unless ($self->{inform}->{name});
1712 return $self->{inform}->{guests};
1713 }
1714
1715 =head2 links
1716
1717 my $links = @{ $node->links };
1718
1719 Return array of links for this node
1720
1721 =cut
1722
1723 sub links {
1724 my $self = shift;
1725 $self->_set_info unless ($self->{inform}->{name});
1726 return $self->{inform}->{links};
1727 }
1728
1729 =head2 master
1730
1731 Set actions on Hyper Estraier node master (C<estmaster> process)
1732
1733 $node->master(
1734 action => 'sync'
1735 );
1736
1737 All available actions are documented in
1738 L<http://hyperestraier.sourceforge.net/nguide-en.html#protocol>
1739
1740 =cut
1741
1742 my $estmaster_rest = {
1743 shutdown => {
1744 status => 202,
1745 },
1746 sync => {
1747 status => 202,
1748 },
1749 backup => {
1750 status => 202,
1751 },
1752 userlist => {
1753 status => 200,
1754 returns => [ qw/name passwd flags fname misc/ ],
1755 },
1756 useradd => {
1757 required => [ qw/name passwd flags/ ],
1758 optional => [ qw/fname misc/ ],
1759 status => 200,
1760 },
1761 userdel => {
1762 required => [ qw/name/ ],
1763 status => 200,
1764 },
1765 nodelist => {
1766 status => 200,
1767 returns => [ qw/name label doc_num word_num size/ ],
1768 },
1769 nodeadd => {
1770 required => [ qw/name/ ],
1771 optional => [ qw/label/ ],
1772 status => 200,
1773 },
1774 nodedel => {
1775 required => [ qw/name/ ],
1776 status => 200,
1777 },
1778 nodeclr => {
1779 required => [ qw/name/ ],
1780 status => 200,
1781 },
1782 nodertt => {
1783 status => 200,
1784 },
1785 };
1786
1787 sub master {
1788 my $self = shift;
1789
1790 my $args = {@_};
1791
1792 # have action?
1793 my $action = $args->{action} || croak "need action, available: ",
1794 join(", ",keys %{ $estmaster_rest });
1795
1796 # check if action is valid
1797 my $rest = $estmaster_rest->{$action};
1798 croak "action '$action' is not supported, available actions: ",
1799 join(", ",keys %{ $estmaster_rest }) unless ($rest);
1800
1801 croak "BUG: action '$action' needs return status" unless ($rest->{status});
1802
1803 my @args;
1804
1805 if ($rest->{required} || $rest->{optional}) {
1806
1807 map {
1808 croak "need parametar '$_' for action '$action'" unless ($args->{$_});
1809 push @args, $_ . '=' . uri_escape( $args->{$_} );
1810 } ( @{ $rest->{required} } );
1811
1812 map {
1813 push @args, $_ . '=' . uri_escape( $args->{$_} ) if ($args->{$_});
1814 } ( @{ $rest->{optional} } );
1815
1816 }
1817
1818 my $uri = new URI( $self->{url} );
1819
1820 my $resbody;
1821
1822 my $status = $self->shuttle_url(
1823 'http://' . $uri->host_port . '/master?action=' . $action ,
1824 'application/x-www-form-urlencoded',
1825 join('&', @args),
1826 \$resbody,
1827 1,
1828 ) or confess "shuttle_url failed";
1829
1830 if ($status == $rest->{status}) {
1831 if ($rest->{returns} && wantarray) {
1832
1833 my @results;
1834 my $fields = $#{$rest->{returns}};
1835
1836 foreach my $line ( split(/[\r\n]/,$resbody) ) {
1837 my @e = split(/\t/, $line, $fields + 1);
1838 my $row;
1839 foreach my $i ( 0 .. $fields) {
1840 $row->{ $rest->{returns}->[$i] } = $e[ $i ];
1841 }
1842 push @results, $row;
1843 }
1844
1845 return @results;
1846
1847 } elsif ($resbody) {
1848 chomp $resbody;
1849 return $resbody;
1850 } else {
1851 return 0E0;
1852 }
1853 }
1854
1855 carp "expected status $rest->{status}, but got $status";
1856 return undef;
1857 }
1858
1859 =head1 PRIVATE METHODS
1860
1861 You could call those directly, but you don't have to. I hope.
1862
1863 =head2 _set_info
1864
1865 Set information for node
1866
1867 $node->_set_info;
1868
1869 =cut
1870
1871 sub _set_info {
1872 my $self = shift;
1873
1874 $self->{status} = -1;
1875 return unless ($self->{url});
1876
1877 my $resbody;
1878 my $rv = $self->shuttle_url( $self->{url} . '/inform',
1879 'text/plain',
1880 undef,
1881 \$resbody,
1882 );
1883
1884 return if ($rv != 200 || !$resbody);
1885
1886 my @lines = split(/[\r\n]/,$resbody);
1887
1888 $self->{inform} = {};
1889
1890 ( $self->{inform}->{name}, $self->{inform}->{label}, $self->{inform}->{dnum},
1891 $self->{inform}->{wnum}, $self->{inform}->{size} ) = split(/\t/, shift @lines, 5);
1892
1893 return $resbody unless (@lines);
1894
1895 shift @lines;
1896
1897 while(my $admin = shift @lines) {
1898 push @{$self->{inform}->{admins}}, $admin;
1899 }
1900
1901 while(my $guest = shift @lines) {
1902 push @{$self->{inform}->{guests}}, $guest;
1903 }
1904
1905 while(my $link = shift @lines) {
1906 push @{$self->{inform}->{links}}, $link;
1907 }
1908
1909 return $resbody;
1910
1911 }
1912
1913 ###
1914
1915 =head1 EXPORT
1916
1917 Nothing.
1918
1919 =head1 SEE ALSO
1920
1921 L<http://hyperestraier.sourceforge.net/>
1922
1923 Hyper Estraier Ruby interface on which this module is based.
1924
1925 =head1 AUTHOR
1926
1927 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
1928
1929 Robert Klep E<lt>robert@klep.nameE<gt> contributed refactored search code
1930
1931 =head1 COPYRIGHT AND LICENSE
1932
1933 Copyright (C) 2005-2006 by Dobrica Pavlinusic
1934
1935 This library is free software; you can redistribute it and/or modify
1936 it under the GPL v2 or later.
1937
1938 =cut
1939
1940 1;

  ViewVC Help
Powered by ViewVC 1.1.26