/[Search-Estraier]/trunk/Estraier.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/Estraier.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 150 - (show annotations)
Mon May 15 22:26:08 2006 UTC (17 years, 11 months ago) by dpavlin
File size: 34950 byte(s)
call _set_info to refresh data about node after calling out_doc*
1 package Search::Estraier;
2
3 use 5.008;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.07_1';
8
9 =head1 NAME
10
11 Search::Estraier - pure perl module to use Hyper Estraier search engine
12
13 =head1 SYNOPSIS
14
15 =head2 Simple indexer
16
17 use Search::Estraier;
18
19 # create and configure node
20 my $node = new Search::Estraier::Node(
21 url => 'http://localhost:1978/node/test',
22 user => 'admin',
23 passwd => 'admin',
24 create => 1,
25 label => 'Label for node',
26 croak_on_error => 1,
27 );
28
29 # create document
30 my $doc = new Search::Estraier::Document;
31
32 # add attributes
33 $doc->add_attr('@uri', "http://estraier.gov/example.txt");
34 $doc->add_attr('@title', "Over the Rainbow");
35
36 # add body text to document
37 $doc->add_text("Somewhere over the rainbow. Way up high.");
38 $doc->add_text("There's a land that I heard of once in a lullaby.");
39
40 die "error: ", $node->status,"\n" unless (eval { $node->put_doc($doc) });
41
42 =head2 Simple searcher
43
44 use Search::Estraier;
45
46 # create and configure node
47 my $node = new Search::Estraier::Node(
48 url => 'http://localhost:1978/node/test',
49 user => 'admin',
50 passwd => 'admin',
51 croak_on_error => 1,
52 );
53
54 # create condition
55 my $cond = new Search::Estraier::Condition;
56
57 # set search phrase
58 $cond->set_phrase("rainbow AND lullaby");
59
60 my $nres = $node->search($cond, 0);
61
62 if (defined($nres)) {
63 print "Got ", $nres->hits, " results\n";
64
65 # for each document in results
66 for my $i ( 0 ... $nres->doc_num - 1 ) {
67 # get result document
68 my $rdoc = $nres->get_doc($i);
69 # display attribte
70 print "URI: ", $rdoc->attr('@uri'),"\n";
71 print "Title: ", $rdoc->attr('@title'),"\n";
72 print $rdoc->snippet,"\n";
73 }
74 } else {
75 die "error: ", $node->status,"\n";
76 }
77
78 =head1 DESCRIPTION
79
80 This module is implementation of node API of Hyper Estraier. Since it's
81 perl-only module with dependencies only on standard perl modules, it will
82 run on all platforms on which perl runs. It doesn't require compilation
83 or Hyper Estraier development files on target machine.
84
85 It is implemented as multiple packages which closly resamble Ruby
86 implementation. It also includes methods to manage nodes.
87
88 There are few examples in C<scripts> directory of this distribution.
89
90 =cut
91
92 =head1 Inheritable common methods
93
94 This methods should really move somewhere else.
95
96 =head2 _s
97
98 Remove multiple whitespaces from string, as well as whitespaces at beginning or end
99
100 my $text = $self->_s(" this is a text ");
101 $text = 'this is a text';
102
103 =cut
104
105 sub _s {
106 my $text = $_[1];
107 return unless defined($text);
108 $text =~ s/\s\s+/ /gs;
109 $text =~ s/^\s+//;
110 $text =~ s/\s+$//;
111 return $text;
112 }
113
114 package Search::Estraier::Document;
115
116 use Carp qw/croak confess/;
117
118 use Search::Estraier;
119 our @ISA = qw/Search::Estraier/;
120
121 =head1 Search::Estraier::Document
122
123 This class implements Document which is collection of attributes
124 (key=value), vectors (also key value) display text and hidden text.
125
126
127 =head2 new
128
129 Create new document, empty or from draft.
130
131 my $doc = new Search::HyperEstraier::Document;
132 my $doc2 = new Search::HyperEstraier::Document( $draft );
133
134 =cut
135
136 sub new {
137 my $class = shift;
138 my $self = {};
139 bless($self, $class);
140
141 $self->{id} = -1;
142
143 my $draft = shift;
144
145 if ($draft) {
146 my $in_text = 0;
147 foreach my $line (split(/\n/, $draft)) {
148
149 if ($in_text) {
150 if ($line =~ /^\t/) {
151 push @{ $self->{htexts} }, substr($line, 1);
152 } else {
153 push @{ $self->{dtexts} }, $line;
154 }
155 next;
156 }
157
158 if ($line =~ m/^%VECTOR\t(.+)$/) {
159 my @fields = split(/\t/, $1);
160 for my $i ( 0 .. ($#fields - 1) ) {
161 $self->{kwords}->{ $fields[ $i ] } = $fields[ $i + 1 ];
162 $i++;
163 }
164 next;
165 } elsif ($line =~ m/^%/) {
166 # What is this? comment?
167 #warn "$line\n";
168 next;
169 } elsif ($line =~ m/^$/) {
170 $in_text = 1;
171 next;
172 } elsif ($line =~ m/^(.+)=(.*)$/) {
173 $self->{attrs}->{ $1 } = $2;
174 next;
175 }
176
177 warn "draft ignored: '$line'\n";
178 }
179 }
180
181 $self ? return $self : return undef;
182 }
183
184
185 =head2 add_attr
186
187 Add an attribute.
188
189 $doc->add_attr( name => 'value' );
190
191 Delete attribute using
192
193 $doc->add_attr( name => undef );
194
195 =cut
196
197 sub add_attr {
198 my $self = shift;
199 my $attrs = {@_};
200
201 while (my ($name, $value) = each %{ $attrs }) {
202 if (! defined($value)) {
203 delete( $self->{attrs}->{ $self->_s($name) } );
204 } else {
205 $self->{attrs}->{ $self->_s($name) } = $self->_s($value);
206 }
207 }
208
209 return 1;
210 }
211
212
213 =head2 add_text
214
215 Add a sentence of text.
216
217 $doc->add_text('this is example text to display');
218
219 =cut
220
221 sub add_text {
222 my $self = shift;
223 my $text = shift;
224 return unless defined($text);
225
226 push @{ $self->{dtexts} }, $self->_s($text);
227 }
228
229
230 =head2 add_hidden_text
231
232 Add a hidden sentence.
233
234 $doc->add_hidden_text('this is example text just for search');
235
236 =cut
237
238 sub add_hidden_text {
239 my $self = shift;
240 my $text = shift;
241 return unless defined($text);
242
243 push @{ $self->{htexts} }, $self->_s($text);
244 }
245
246
247 =head2 id
248
249 Get the ID number of document. If the object has never been registred, C<-1> is returned.
250
251 print $doc->id;
252
253 =cut
254
255 sub id {
256 my $self = shift;
257 return $self->{id};
258 }
259
260
261 =head2 attr_names
262
263 Returns array with attribute names from document object.
264
265 my @attrs = $doc->attr_names;
266
267 =cut
268
269 sub attr_names {
270 my $self = shift;
271 return unless ($self->{attrs});
272 #croak "attr_names return array, not scalar" if (! wantarray);
273 return sort keys %{ $self->{attrs} };
274 }
275
276
277 =head2 attr
278
279 Returns value of an attribute.
280
281 my $value = $doc->attr( 'attribute' );
282
283 =cut
284
285 sub attr {
286 my $self = shift;
287 my $name = shift;
288 return unless (defined($name) && $self->{attrs});
289 return $self->{attrs}->{ $name };
290 }
291
292
293 =head2 texts
294
295 Returns array with text sentences.
296
297 my @texts = $doc->texts;
298
299 =cut
300
301 sub texts {
302 my $self = shift;
303 #confess "texts return array, not scalar" if (! wantarray);
304 return @{ $self->{dtexts} } if ($self->{dtexts});
305 }
306
307
308 =head2 cat_texts
309
310 Return whole text as single scalar.
311
312 my $text = $doc->cat_texts;
313
314 =cut
315
316 sub cat_texts {
317 my $self = shift;
318 return join(' ',@{ $self->{dtexts} }) if ($self->{dtexts});
319 }
320
321
322 =head2 dump_draft
323
324 Dump draft data from document object.
325
326 print $doc->dump_draft;
327
328 =cut
329
330 sub dump_draft {
331 my $self = shift;
332 my $draft;
333
334 foreach my $attr_name (sort keys %{ $self->{attrs} }) {
335 next unless defined(my $v = $self->{attrs}->{$attr_name});
336 $draft .= $attr_name . '=' . $v . "\n";
337 }
338
339 if ($self->{kwords}) {
340 $draft .= '%%VECTOR';
341 while (my ($key, $value) = each %{ $self->{kwords} }) {
342 $draft .= "\t$key\t$value";
343 }
344 $draft .= "\n";
345 }
346
347 $draft .= "\n";
348
349 $draft .= join("\n", @{ $self->{dtexts} }) . "\n" if ($self->{dtexts});
350 $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n" if ($self->{htexts});
351
352 return $draft;
353 }
354
355
356 =head2 delete
357
358 Empty document object
359
360 $doc->delete;
361
362 This function is addition to original Ruby API, and since it was included in C wrappers it's here as a
363 convinience. Document objects which go out of scope will be destroyed
364 automatically.
365
366 =cut
367
368 sub delete {
369 my $self = shift;
370
371 foreach my $data (qw/attrs dtexts stexts kwords/) {
372 delete($self->{$data});
373 }
374
375 $self->{id} = -1;
376
377 return 1;
378 }
379
380
381
382 package Search::Estraier::Condition;
383
384 use Carp qw/carp confess croak/;
385
386 use Search::Estraier;
387 our @ISA = qw/Search::Estraier/;
388
389 =head1 Search::Estraier::Condition
390
391 =head2 new
392
393 my $cond = new Search::HyperEstraier::Condition;
394
395 =cut
396
397 sub new {
398 my $class = shift;
399 my $self = {};
400 bless($self, $class);
401
402 $self->{max} = -1;
403 $self->{options} = 0;
404
405 $self ? return $self : return undef;
406 }
407
408
409 =head2 set_phrase
410
411 $cond->set_phrase('search phrase');
412
413 =cut
414
415 sub set_phrase {
416 my $self = shift;
417 $self->{phrase} = $self->_s( shift );
418 }
419
420
421 =head2 add_attr
422
423 $cond->add_attr('@URI STRINC /~dpavlin/');
424
425 =cut
426
427 sub add_attr {
428 my $self = shift;
429 my $attr = shift || return;
430 push @{ $self->{attrs} }, $self->_s( $attr );
431 }
432
433
434 =head2 set_order
435
436 $cond->set_order('@mdate NUMD');
437
438 =cut
439
440 sub set_order {
441 my $self = shift;
442 $self->{order} = shift;
443 }
444
445
446 =head2 set_max
447
448 $cond->set_max(42);
449
450 =cut
451
452 sub set_max {
453 my $self = shift;
454 my $max = shift;
455 croak "set_max needs number, not '$max'" unless ($max =~ m/^\d+$/);
456 $self->{max} = $max;
457 }
458
459
460 =head2 set_options
461
462 $cond->set_options( 'SURE' );
463
464 $cond->set_options( qw/AGITO NOIDF SIMPLE/ );
465
466 Possible options are:
467
468 =over 8
469
470 =item SURE
471
472 check every N-gram
473
474 =item USUAL
475
476 check every second N-gram
477
478 =item FAST
479
480 check every third N-gram
481
482 =item AGITO
483
484 check every fourth N-gram
485
486 =item NOIDF
487
488 don't perform TF-IDF tuning
489
490 =item SIMPLE
491
492 use simplified query phrase
493
494 =back
495
496 Skipping N-grams will speed up search, but reduce accuracy. Every call to C<set_options> will reset previous
497 options;
498
499 This option changed in version C<0.04> of this module. It's backwards compatibile.
500
501 =cut
502
503 my $options = {
504 SURE => 1 << 0,
505 USUAL => 1 << 1,
506 FAST => 1 << 2,
507 AGITO => 1 << 3,
508 NOIDF => 1 << 4,
509 SIMPLE => 1 << 10,
510 };
511
512 sub set_options {
513 my $self = shift;
514 my $opt = 0;
515 foreach my $option (@_) {
516 my $mask;
517 unless ($mask = $options->{$option}) {
518 if ($option eq '1') {
519 next;
520 } else {
521 croak "unknown option $option";
522 }
523 }
524 $opt += $mask;
525 }
526 $self->{options} = $opt;
527 }
528
529
530 =head2 phrase
531
532 Return search phrase.
533
534 print $cond->phrase;
535
536 =cut
537
538 sub phrase {
539 my $self = shift;
540 return $self->{phrase};
541 }
542
543
544 =head2 order
545
546 Return search result order.
547
548 print $cond->order;
549
550 =cut
551
552 sub order {
553 my $self = shift;
554 return $self->{order};
555 }
556
557
558 =head2 attrs
559
560 Return search result attrs.
561
562 my @cond_attrs = $cond->attrs;
563
564 =cut
565
566 sub attrs {
567 my $self = shift;
568 #croak "attrs return array, not scalar" if (! wantarray);
569 return @{ $self->{attrs} } if ($self->{attrs});
570 }
571
572
573 =head2 max
574
575 Return maximum number of results.
576
577 print $cond->max;
578
579 C<-1> is returned for unitialized value, C<0> is unlimited.
580
581 =cut
582
583 sub max {
584 my $self = shift;
585 return $self->{max};
586 }
587
588
589 =head2 options
590
591 Return options for this condition.
592
593 print $cond->options;
594
595 Options are returned in numerical form.
596
597 =cut
598
599 sub options {
600 my $self = shift;
601 return $self->{options};
602 }
603
604
605 =head2 set_skip
606
607 Set number of skipped documents from beginning of results
608
609 $cond->set_skip(42);
610
611 Similar to C<offset> in RDBMS.
612
613 =cut
614
615 sub set_skip {
616 my $self = shift;
617 $self->{skip} = shift;
618 }
619
620 =head2 skip
621
622 Return skip for this condition.
623
624 print $cond->skip;
625
626 =cut
627
628 sub skip {
629 my $self = shift;
630 return $self->{skip};
631 }
632
633
634 package Search::Estraier::ResultDocument;
635
636 use Carp qw/croak/;
637
638 #use Search::Estraier;
639 #our @ISA = qw/Search::Estraier/;
640
641 =head1 Search::Estraier::ResultDocument
642
643 =head2 new
644
645 my $rdoc = new Search::HyperEstraier::ResultDocument(
646 uri => 'http://localhost/document/uri/42',
647 attrs => {
648 foo => 1,
649 bar => 2,
650 },
651 snippet => 'this is a text of snippet'
652 keywords => 'this\tare\tkeywords'
653 );
654
655 =cut
656
657 sub new {
658 my $class = shift;
659 my $self = {@_};
660 bless($self, $class);
661
662 croak "missing uri for ResultDocument" unless defined($self->{uri});
663
664 $self ? return $self : return undef;
665 }
666
667
668 =head2 uri
669
670 Return URI of result document
671
672 print $rdoc->uri;
673
674 =cut
675
676 sub uri {
677 my $self = shift;
678 return $self->{uri};
679 }
680
681
682 =head2 attr_names
683
684 Returns array with attribute names from result document object.
685
686 my @attrs = $rdoc->attr_names;
687
688 =cut
689
690 sub attr_names {
691 my $self = shift;
692 croak "attr_names return array, not scalar" if (! wantarray);
693 return sort keys %{ $self->{attrs} };
694 }
695
696
697 =head2 attr
698
699 Returns value of an attribute.
700
701 my $value = $rdoc->attr( 'attribute' );
702
703 =cut
704
705 sub attr {
706 my $self = shift;
707 my $name = shift || return;
708 return $self->{attrs}->{ $name };
709 }
710
711
712 =head2 snippet
713
714 Return snippet from result document
715
716 print $rdoc->snippet;
717
718 =cut
719
720 sub snippet {
721 my $self = shift;
722 return $self->{snippet};
723 }
724
725
726 =head2 keywords
727
728 Return keywords from result document
729
730 print $rdoc->keywords;
731
732 =cut
733
734 sub keywords {
735 my $self = shift;
736 return $self->{keywords};
737 }
738
739
740 package Search::Estraier::NodeResult;
741
742 use Carp qw/croak/;
743
744 #use Search::Estraier;
745 #our @ISA = qw/Search::Estraier/;
746
747 =head1 Search::Estraier::NodeResult
748
749 =head2 new
750
751 my $res = new Search::HyperEstraier::NodeResult(
752 docs => @array_of_rdocs,
753 hits => %hash_with_hints,
754 );
755
756 =cut
757
758 sub new {
759 my $class = shift;
760 my $self = {@_};
761 bless($self, $class);
762
763 foreach my $f (qw/docs hints/) {
764 croak "missing $f for ResultDocument" unless defined($self->{$f});
765 }
766
767 $self ? return $self : return undef;
768 }
769
770
771 =head2 doc_num
772
773 Return number of documents
774
775 print $res->doc_num;
776
777 This will return real number of documents (limited by C<max>).
778 If you want to get total number of hits, see C<hits>.
779
780 =cut
781
782 sub doc_num {
783 my $self = shift;
784 return $#{$self->{docs}} + 1;
785 }
786
787
788 =head2 get_doc
789
790 Return single document
791
792 my $doc = $res->get_doc( 42 );
793
794 Returns undef if document doesn't exist.
795
796 =cut
797
798 sub get_doc {
799 my $self = shift;
800 my $num = shift;
801 croak "expect number as argument, not '$num'" unless ($num =~ m/^\d+$/);
802 return undef if ($num < 0 || $num > $self->{docs});
803 return $self->{docs}->[$num];
804 }
805
806
807 =head2 hint
808
809 Return specific hint from results.
810
811 print $res->hint( 'VERSION' );
812
813 Possible hints are: C<VERSION>, C<NODE>, C<HIT>, C<HINT#n>, C<DOCNUM>, C<WORDNUM>,
814 C<TIME>, C<LINK#n>, C<VIEW>.
815
816 =cut
817
818 sub hint {
819 my $self = shift;
820 my $key = shift || return;
821 return $self->{hints}->{$key};
822 }
823
824 =head2 hints
825
826 More perlish version of C<hint>. This one returns hash.
827
828 my %hints = $res->hints;
829
830 =cut
831
832 sub hints {
833 my $self = shift;
834 return $self->{hints};
835 }
836
837 =head2 hits
838
839 Syntaxtic sugar for total number of hits for this query
840
841 print $res->hits;
842
843 It's same as
844
845 print $res->hint('HIT');
846
847 but shorter.
848
849 =cut
850
851 sub hits {
852 my $self = shift;
853 return $self->{hints}->{'HIT'} || 0;
854 }
855
856 package Search::Estraier::Node;
857
858 use Carp qw/carp croak confess/;
859 use URI;
860 use MIME::Base64;
861 use IO::Socket::INET;
862 use URI::Escape qw/uri_escape/;
863
864 =head1 Search::Estraier::Node
865
866 =head2 new
867
868 my $node = new Search::HyperEstraier::Node;
869
870 or optionally with C<url> as parametar
871
872 my $node = new Search::HyperEstraier::Node( 'http://localhost:1978/node/test' );
873
874 or in more verbose form
875
876 my $node = new Search::HyperEstraier::Node(
877 url => 'http://localhost:1978/node/test',
878 user => 'admin',
879 passwd => 'admin'
880 create => 1,
881 label => 'optional node label',
882 debug => 1,
883 croak_on_error => 1
884 );
885
886 with following arguments:
887
888 =over 4
889
890 =item url
891
892 URL to node
893
894 =item user
895
896 specify username for node server authentication
897
898 =item passwd
899
900 password for authentication
901
902 =item create
903
904 create node if it doesn't exists
905
906 =item label
907
908 optional label for new node if C<create> is used
909
910 =item debug
911
912 dumps a B<lot> of debugging output
913
914 =item croak_on_error
915
916 very helpful during development. It will croak on all errors instead of
917 silently returning C<-1> (which is convention of Hyper Estraier API in other
918 languages).
919
920 =back
921
922 =cut
923
924 sub new {
925 my $class = shift;
926 my $self = {
927 pxport => -1,
928 timeout => 0, # this used to be -1
929 wwidth => 480,
930 hwidth => 96,
931 awidth => 96,
932 status => -1,
933 };
934
935 bless($self, $class);
936
937 if ($#_ == 0) {
938 $self->{url} = shift;
939 } else {
940 %$self = ( %$self, @_ );
941
942 $self->set_auth( $self->{user}, $self->{passwd} ) if ($self->{user});
943
944 warn "## Node debug on\n" if ($self->{debug});
945 }
946
947 $self->{inform} = {
948 dnum => -1,
949 wnum => -1,
950 size => -1.0,
951 };
952
953 if ($self->{create}) {
954 if (! eval { $self->name } || $@) {
955 my $name = $1 if ($self->{url} =~ m#/node/([^/]+)/*#);
956 croak "can't find node name in '$self->{url}'" unless ($name);
957 my $label = $self->{label} || $name;
958 $self->master(
959 action => 'nodeadd',
960 name => $name,
961 label => $label,
962 ) || croak "can't create node $name ($label)";
963 }
964 }
965
966 $self ? return $self : return undef;
967 }
968
969
970 =head2 set_url
971
972 Specify URL to node server
973
974 $node->set_url('http://localhost:1978');
975
976 =cut
977
978 sub set_url {
979 my $self = shift;
980 $self->{url} = shift;
981 }
982
983
984 =head2 set_proxy
985
986 Specify proxy server to connect to node server
987
988 $node->set_proxy('proxy.example.com', 8080);
989
990 =cut
991
992 sub set_proxy {
993 my $self = shift;
994 my ($host,$port) = @_;
995 croak "proxy port must be number, not '$port'" unless ($port =~ m/^\d+$/);
996 $self->{pxhost} = $host;
997 $self->{pxport} = $port;
998 }
999
1000
1001 =head2 set_timeout
1002
1003 Specify timeout of connection in seconds
1004
1005 $node->set_timeout( 15 );
1006
1007 =cut
1008
1009 sub set_timeout {
1010 my $self = shift;
1011 my $sec = shift;
1012 croak "timeout must be number, not '$sec'" unless ($sec =~ m/^\d+$/);
1013 $self->{timeout} = $sec;
1014 }
1015
1016
1017 =head2 set_auth
1018
1019 Specify name and password for authentication to node server.
1020
1021 $node->set_auth('clint','eastwood');
1022
1023 =cut
1024
1025 sub set_auth {
1026 my $self = shift;
1027 my ($login,$passwd) = @_;
1028 my $basic_auth = encode_base64( "$login:$passwd" );
1029 chomp($basic_auth);
1030 $self->{auth} = $basic_auth;
1031 }
1032
1033
1034 =head2 status
1035
1036 Return status code of last request.
1037
1038 print $node->status;
1039
1040 C<-1> means connection failure.
1041
1042 =cut
1043
1044 sub status {
1045 my $self = shift;
1046 return $self->{status};
1047 }
1048
1049
1050 =head2 put_doc
1051
1052 Add a document
1053
1054 $node->put_doc( $document_draft ) or die "can't add document";
1055
1056 Return true on success or false on failture.
1057
1058 =cut
1059
1060 sub put_doc {
1061 my $self = shift;
1062 my $doc = shift || return;
1063 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
1064 $self->shuttle_url( $self->{url} . '/put_doc',
1065 'text/x-estraier-draft',
1066 $doc->dump_draft,
1067 undef
1068 ) == 200;
1069 }
1070
1071
1072 =head2 out_doc
1073
1074 Remove a document
1075
1076 $node->out_doc( document_id ) or "can't remove document";
1077
1078 Return true on success or false on failture.
1079
1080 =cut
1081
1082 sub out_doc {
1083 my $self = shift;
1084 my $id = shift || return;
1085 return unless ($self->{url});
1086 croak "id must be number, not '$id'" unless ($id =~ m/^\d+$/);
1087 if ($self->shuttle_url( $self->{url} . '/out_doc',
1088 'application/x-www-form-urlencoded',
1089 "id=$id",
1090 undef
1091 ) == 200) {
1092 $self->_set_info;
1093 return $id;
1094 }
1095 return undef;
1096 }
1097
1098
1099 =head2 out_doc_by_uri
1100
1101 Remove a registrated document using it's uri
1102
1103 $node->out_doc_by_uri( 'file:///document/uri/42' ) or "can't remove document";
1104
1105 Return true on success or false on failture.
1106
1107 =cut
1108
1109 sub out_doc_by_uri {
1110 my $self = shift;
1111 my $uri = shift || return;
1112 return unless ($self->{url});
1113 if ($self->shuttle_url( $self->{url} . '/out_doc',
1114 'application/x-www-form-urlencoded',
1115 "uri=" . uri_escape($uri),
1116 undef
1117 ) == 200) {
1118 $self->_set_info;
1119 return $uri;
1120 }
1121 return undef;
1122 }
1123
1124
1125 =head2 edit_doc
1126
1127 Edit attributes of a document
1128
1129 $node->edit_doc( $document_draft ) or die "can't edit document";
1130
1131 Return true on success or false on failture.
1132
1133 =cut
1134
1135 sub edit_doc {
1136 my $self = shift;
1137 my $doc = shift || return;
1138 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
1139 $self->shuttle_url( $self->{url} . '/edit_doc',
1140 'text/x-estraier-draft',
1141 $doc->dump_draft,
1142 undef
1143 ) == 200;
1144 }
1145
1146
1147 =head2 get_doc
1148
1149 Retreive document
1150
1151 my $doc = $node->get_doc( document_id ) or die "can't get document";
1152
1153 Return true on success or false on failture.
1154
1155 =cut
1156
1157 sub get_doc {
1158 my $self = shift;
1159 my $id = shift || return;
1160 return $self->_fetch_doc( id => $id );
1161 }
1162
1163
1164 =head2 get_doc_by_uri
1165
1166 Retreive document
1167
1168 my $doc = $node->get_doc_by_uri( 'file:///document/uri/42' ) or die "can't get document";
1169
1170 Return true on success or false on failture.
1171
1172 =cut
1173
1174 sub get_doc_by_uri {
1175 my $self = shift;
1176 my $uri = shift || return;
1177 return $self->_fetch_doc( uri => $uri );
1178 }
1179
1180
1181 =head2 get_doc_attr
1182
1183 Retrieve the value of an atribute from object
1184
1185 my $val = $node->get_doc_attr( document_id, 'attribute_name' ) or
1186 die "can't get document attribute";
1187
1188 =cut
1189
1190 sub get_doc_attr {
1191 my $self = shift;
1192 my ($id,$name) = @_;
1193 return unless ($id && $name);
1194 return $self->_fetch_doc( id => $id, attr => $name );
1195 }
1196
1197
1198 =head2 get_doc_attr_by_uri
1199
1200 Retrieve the value of an atribute from object
1201
1202 my $val = $node->get_doc_attr_by_uri( document_id, 'attribute_name' ) or
1203 die "can't get document attribute";
1204
1205 =cut
1206
1207 sub get_doc_attr_by_uri {
1208 my $self = shift;
1209 my ($uri,$name) = @_;
1210 return unless ($uri && $name);
1211 return $self->_fetch_doc( uri => $uri, attr => $name );
1212 }
1213
1214
1215 =head2 etch_doc
1216
1217 Exctract document keywords
1218
1219 my $keywords = $node->etch_doc( document_id ) or die "can't etch document";
1220
1221 =cut
1222
1223 sub etch_doc {
1224 my $self = shift;
1225 my $id = shift || return;
1226 return $self->_fetch_doc( id => $id, etch => 1 );
1227 }
1228
1229 =head2 etch_doc_by_uri
1230
1231 Retreive document
1232
1233 my $keywords = $node->etch_doc_by_uri( 'file:///document/uri/42' ) or die "can't etch document";
1234
1235 Return true on success or false on failture.
1236
1237 =cut
1238
1239 sub etch_doc_by_uri {
1240 my $self = shift;
1241 my $uri = shift || return;
1242 return $self->_fetch_doc( uri => $uri, etch => 1 );
1243 }
1244
1245
1246 =head2 uri_to_id
1247
1248 Get ID of document specified by URI
1249
1250 my $id = $node->uri_to_id( 'file:///document/uri/42' );
1251
1252 This method won't croak, even if using C<croak_on_error>.
1253
1254 =cut
1255
1256 sub uri_to_id {
1257 my $self = shift;
1258 my $uri = shift || return;
1259 return $self->_fetch_doc( uri => $uri, path => '/uri_to_id', chomp_resbody => 1, croak_on_error => 0 );
1260 }
1261
1262
1263 =head2 _fetch_doc
1264
1265 Private function used for implementing of C<get_doc>, C<get_doc_by_uri>,
1266 C<etch_doc>, C<etch_doc_by_uri>.
1267
1268 # this will decode received draft into Search::Estraier::Document object
1269 my $doc = $node->_fetch_doc( id => 42 );
1270 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42' );
1271
1272 # to extract keywords, add etch
1273 my $doc = $node->_fetch_doc( id => 42, etch => 1 );
1274 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', etch => 1 );
1275
1276 # to get document attrubute add attr
1277 my $doc = $node->_fetch_doc( id => 42, attr => '@mdate' );
1278 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', attr => '@mdate' );
1279
1280 # more general form which allows implementation of
1281 # uri_to_id
1282 my $id = $node->_fetch_doc(
1283 uri => 'file:///document/uri/42',
1284 path => '/uri_to_id',
1285 chomp_resbody => 1
1286 );
1287
1288 =cut
1289
1290 sub _fetch_doc {
1291 my $self = shift;
1292 my $a = {@_};
1293 return unless ( ($a->{id} || $a->{uri}) && $self->{url} );
1294
1295 my ($arg, $resbody);
1296
1297 my $path = $a->{path} || '/get_doc';
1298 $path = '/etch_doc' if ($a->{etch});
1299
1300 if ($a->{id}) {
1301 croak "id must be numberm not '$a->{id}'" unless ($a->{id} =~ m/^\d+$/);
1302 $arg = 'id=' . $a->{id};
1303 } elsif ($a->{uri}) {
1304 $arg = 'uri=' . uri_escape($a->{uri});
1305 } else {
1306 confess "unhandled argument. Need id or uri.";
1307 }
1308
1309 if ($a->{attr}) {
1310 $path = '/get_doc_attr';
1311 $arg .= '&attr=' . uri_escape($a->{attr});
1312 $a->{chomp_resbody} = 1;
1313 }
1314
1315 my $rv = $self->shuttle_url( $self->{url} . $path,
1316 'application/x-www-form-urlencoded',
1317 $arg,
1318 \$resbody,
1319 $a->{croak_on_error},
1320 );
1321
1322 return if ($rv != 200);
1323
1324 if ($a->{etch}) {
1325 $self->{kwords} = {};
1326 return +{} unless ($resbody);
1327 foreach my $l (split(/\n/, $resbody)) {
1328 my ($k,$v) = split(/\t/, $l, 2);
1329 $self->{kwords}->{$k} = $v if ($v);
1330 }
1331 return $self->{kwords};
1332 } elsif ($a->{chomp_resbody}) {
1333 return unless (defined($resbody));
1334 chomp($resbody);
1335 return $resbody;
1336 } else {
1337 return new Search::Estraier::Document($resbody);
1338 }
1339 }
1340
1341
1342 =head2 name
1343
1344 my $node_name = $node->name;
1345
1346 =cut
1347
1348 sub name {
1349 my $self = shift;
1350 $self->_set_info unless ($self->{inform}->{name});
1351 return $self->{inform}->{name};
1352 }
1353
1354
1355 =head2 label
1356
1357 my $node_label = $node->label;
1358
1359 =cut
1360
1361 sub label {
1362 my $self = shift;
1363 $self->_set_info unless ($self->{inform}->{label});
1364 return $self->{inform}->{label};
1365 }
1366
1367
1368 =head2 doc_num
1369
1370 my $documents_in_node = $node->doc_num;
1371
1372 =cut
1373
1374 sub doc_num {
1375 my $self = shift;
1376 $self->_set_info if ($self->{inform}->{dnum} < 0);
1377 return $self->{inform}->{dnum};
1378 }
1379
1380
1381 =head2 word_num
1382
1383 my $words_in_node = $node->word_num;
1384
1385 =cut
1386
1387 sub word_num {
1388 my $self = shift;
1389 $self->_set_info if ($self->{inform}->{wnum} < 0);
1390 return $self->{inform}->{wnum};
1391 }
1392
1393
1394 =head2 size
1395
1396 my $node_size = $node->size;
1397
1398 =cut
1399
1400 sub size {
1401 my $self = shift;
1402 $self->_set_info if ($self->{inform}->{size} < 0);
1403 return $self->{inform}->{size};
1404 }
1405
1406
1407 =head2 search
1408
1409 Search documents which match condition
1410
1411 my $nres = $node->search( $cond, $depth );
1412
1413 C<$cond> is C<Search::Estraier::Condition> object, while <$depth> specifies
1414 depth for meta search.
1415
1416 Function results C<Search::Estraier::NodeResult> object.
1417
1418 =cut
1419
1420 sub search {
1421 my $self = shift;
1422 my ($cond, $depth) = @_;
1423 return unless ($cond && defined($depth) && $self->{url});
1424 croak "cond mush be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1425 croak "depth needs number, not '$depth'" unless ($depth =~ m/^\d+$/);
1426
1427 my $resbody;
1428
1429 my $rv = $self->shuttle_url( $self->{url} . '/search',
1430 'application/x-www-form-urlencoded',
1431 $self->cond_to_query( $cond, $depth ),
1432 \$resbody,
1433 );
1434 return if ($rv != 200);
1435
1436 my @records = split /--------\[.*?\]--------(?::END)?\r?\n/, $resbody;
1437 my $hintsText = splice @records, 0, 2; # starts with empty record
1438 my $hints = { $hintsText =~ m/^(.*?)\t(.*?)$/gsm };
1439
1440 # process records
1441 my $docs = [];
1442 foreach my $record (@records)
1443 {
1444 # split into keys and snippets
1445 my ($keys, $snippet) = $record =~ m/^(.*?)\n\n(.*?)$/s;
1446
1447 # create document hash
1448 my $doc = { $keys =~ m/^(.*?)=(.*?)$/gsm };
1449 $doc->{'@keywords'} = $doc->{keywords};
1450 ($doc->{keywords}) = $keys =~ m/^%VECTOR\t(.*?)$/gm;
1451 $doc->{snippet} = $snippet;
1452
1453 push @$docs, new Search::Estraier::ResultDocument(
1454 attrs => $doc,
1455 uri => $doc->{'@uri'},
1456 snippet => $snippet,
1457 keywords => $doc->{'keywords'},
1458 );
1459 }
1460
1461 return new Search::Estraier::NodeResult( docs => $docs, hints => $hints );
1462 }
1463
1464
1465 =head2 cond_to_query
1466
1467 Return URI encoded string generated from Search::Estraier::Condition
1468
1469 my $args = $node->cond_to_query( $cond, $depth );
1470
1471 =cut
1472
1473 sub cond_to_query {
1474 my $self = shift;
1475
1476 my $cond = shift || return;
1477 croak "condition must be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1478 my $depth = shift;
1479
1480 my @args;
1481
1482 if (my $phrase = $cond->phrase) {
1483 push @args, 'phrase=' . uri_escape($phrase);
1484 }
1485
1486 if (my @attrs = $cond->attrs) {
1487 for my $i ( 0 .. $#attrs ) {
1488 push @args,'attr' . ($i+1) . '=' . uri_escape( $attrs[$i] ) if ($attrs[$i]);
1489 }
1490 }
1491
1492 if (my $order = $cond->order) {
1493 push @args, 'order=' . uri_escape($order);
1494 }
1495
1496 if (my $max = $cond->max) {
1497 push @args, 'max=' . $max;
1498 } else {
1499 push @args, 'max=' . (1 << 30);
1500 }
1501
1502 if (my $options = $cond->options) {
1503 push @args, 'options=' . $options;
1504 }
1505
1506 push @args, 'depth=' . $depth if ($depth);
1507 push @args, 'wwidth=' . $self->{wwidth};
1508 push @args, 'hwidth=' . $self->{hwidth};
1509 push @args, 'awidth=' . $self->{awidth};
1510 push @args, 'skip=' . $self->{skip} if ($self->{skip});
1511
1512 return join('&', @args);
1513 }
1514
1515
1516 =head2 shuttle_url
1517
1518 This is method which uses C<LWP::UserAgent> to communicate with Hyper Estraier node
1519 master.
1520
1521 my $rv = shuttle_url( $url, $content_type, $req_body, \$resbody );
1522
1523 C<$resheads> and C<$resbody> booleans controll if response headers and/or response
1524 body will be saved within object.
1525
1526 =cut
1527
1528 use LWP::UserAgent;
1529
1530 sub shuttle_url {
1531 my $self = shift;
1532
1533 my ($url, $content_type, $reqbody, $resbody, $croak_on_error) = @_;
1534
1535 $croak_on_error = $self->{croak_on_error} unless defined($croak_on_error);
1536
1537 $self->{status} = -1;
1538
1539 warn "## $url\n" if ($self->{debug});
1540
1541 $url = new URI($url);
1542 if (
1543 !$url || !$url->scheme || !$url->scheme eq 'http' ||
1544 !$url->host || !$url->port || $url->port < 1
1545 ) {
1546 carp "can't parse $url\n";
1547 return -1;
1548 }
1549
1550 my $ua = LWP::UserAgent->new;
1551 $ua->agent( "Search-Estraier/$Search::Estraier::VERSION" );
1552
1553 my $req;
1554 if ($reqbody) {
1555 $req = HTTP::Request->new(POST => $url);
1556 } else {
1557 $req = HTTP::Request->new(GET => $url);
1558 }
1559
1560 $req->headers->header( 'Host' => $url->host . ":" . $url->port );
1561 $req->headers->header( 'Connection', 'close' );
1562 $req->headers->header( 'Authorization', 'Basic ' . $self->{auth} ) if ($self->{auth});
1563 $req->content_type( $content_type );
1564
1565 warn $req->headers->as_string,"\n" if ($self->{debug});
1566
1567 if ($reqbody) {
1568 warn "$reqbody\n" if ($self->{debug});
1569 $req->content( $reqbody );
1570 }
1571
1572 my $res = $ua->request($req) || croak "can't make request to $url: $!";
1573
1574 warn "## response status: ",$res->status_line,"\n" if ($self->{debug});
1575
1576 ($self->{status}, $self->{status_message}) = split(/\s+/, $res->status_line, 2);
1577
1578 if (! $res->is_success) {
1579 if ($croak_on_error) {
1580 croak("can't get $url: ",$res->status_line);
1581 } else {
1582 return -1;
1583 }
1584 }
1585
1586 $$resbody .= $res->content;
1587
1588 warn "## response body:\n$$resbody\n" if ($resbody && $self->{debug});
1589
1590 return $self->{status};
1591 }
1592
1593
1594 =head2 set_snippet_width
1595
1596 Set width of snippets in results
1597
1598 $node->set_snippet_width( $wwidth, $hwidth, $awidth );
1599
1600 C<$wwidth> specifies whole width of snippet. It's C<480> by default. If it's C<0> snippet
1601 is not sent with results. If it is negative, whole document text is sent instead of snippet.
1602
1603 C<$hwidth> specified width of strings from beginning of string. Default
1604 value is C<96>. Negative or zero value keep previous value.
1605
1606 C<$awidth> specifies width of strings around each highlighted word. It's C<96> by default.
1607 If negative of zero value is provided previous value is kept unchanged.
1608
1609 =cut
1610
1611 sub set_snippet_width {
1612 my $self = shift;
1613
1614 my ($wwidth, $hwidth, $awidth) = @_;
1615 $self->{wwidth} = $wwidth;
1616 $self->{hwidth} = $hwidth if ($hwidth >= 0);
1617 $self->{awidth} = $awidth if ($awidth >= 0);
1618 }
1619
1620
1621 =head2 set_user
1622
1623 Manage users of node
1624
1625 $node->set_user( 'name', $mode );
1626
1627 C<$mode> can be one of:
1628
1629 =over 4
1630
1631 =item 0
1632
1633 delete account
1634
1635 =item 1
1636
1637 set administrative right for user
1638
1639 =item 2
1640
1641 set user account as guest
1642
1643 =back
1644
1645 Return true on success, otherwise false.
1646
1647 =cut
1648
1649 sub set_user {
1650 my $self = shift;
1651 my ($name, $mode) = @_;
1652
1653 return unless ($self->{url});
1654 croak "mode must be number, not '$mode'" unless ($mode =~ m/^\d+$/);
1655
1656 $self->shuttle_url( $self->{url} . '/_set_user',
1657 'text/plain',
1658 'name=' . uri_escape($name) . '&mode=' . $mode,
1659 undef
1660 ) == 200;
1661 }
1662
1663
1664 =head2 set_link
1665
1666 Manage node links
1667
1668 $node->set_link('http://localhost:1978/node/another', 'another node label', $credit);
1669
1670 If C<$credit> is negative, link is removed.
1671
1672 =cut
1673
1674 sub set_link {
1675 my $self = shift;
1676 my ($url, $label, $credit) = @_;
1677
1678 return unless ($self->{url});
1679 croak "mode credit be number, not '$credit'" unless ($credit =~ m/^\d+$/);
1680
1681 my $reqbody = 'url=' . uri_escape($url) . '&label=' . uri_escape($label);
1682 $reqbody .= '&credit=' . $credit if ($credit > 0);
1683
1684 if ($self->shuttle_url( $self->{url} . '/_set_link',
1685 'application/x-www-form-urlencoded',
1686 $reqbody,
1687 undef
1688 ) == 200) {
1689 # refresh node info after adding link
1690 $self->_set_info;
1691 return 1;
1692 }
1693 return undef;
1694 }
1695
1696 =head2 admins
1697
1698 my @admins = @{ $node->admins };
1699
1700 Return array of users with admin rights on node
1701
1702 =cut
1703
1704 sub admins {
1705 my $self = shift;
1706 $self->_set_info unless ($self->{inform}->{name});
1707 return $self->{inform}->{admins};
1708 }
1709
1710 =head2 guests
1711
1712 my @guests = @{ $node->guests };
1713
1714 Return array of users with guest rights on node
1715
1716 =cut
1717
1718 sub guests {
1719 my $self = shift;
1720 $self->_set_info unless ($self->{inform}->{name});
1721 return $self->{inform}->{guests};
1722 }
1723
1724 =head2 links
1725
1726 my $links = @{ $node->links };
1727
1728 Return array of links for this node
1729
1730 =cut
1731
1732 sub links {
1733 my $self = shift;
1734 $self->_set_info unless ($self->{inform}->{name});
1735 return $self->{inform}->{links};
1736 }
1737
1738 =head2 master
1739
1740 Set actions on Hyper Estraier node master (C<estmaster> process)
1741
1742 $node->master(
1743 action => 'sync'
1744 );
1745
1746 All available actions are documented in
1747 L<http://hyperestraier.sourceforge.net/nguide-en.html#protocol>
1748
1749 =cut
1750
1751 my $estmaster_rest = {
1752 shutdown => {
1753 status => 202,
1754 },
1755 sync => {
1756 status => 202,
1757 },
1758 backup => {
1759 status => 202,
1760 },
1761 userlist => {
1762 status => 200,
1763 returns => [ qw/name passwd flags fname misc/ ],
1764 },
1765 useradd => {
1766 required => [ qw/name passwd flags/ ],
1767 optional => [ qw/fname misc/ ],
1768 status => 200,
1769 },
1770 userdel => {
1771 required => [ qw/name/ ],
1772 status => 200,
1773 },
1774 nodelist => {
1775 status => 200,
1776 returns => [ qw/name label doc_num word_num size/ ],
1777 },
1778 nodeadd => {
1779 required => [ qw/name/ ],
1780 optional => [ qw/label/ ],
1781 status => 200,
1782 },
1783 nodedel => {
1784 required => [ qw/name/ ],
1785 status => 200,
1786 },
1787 nodeclr => {
1788 required => [ qw/name/ ],
1789 status => 200,
1790 },
1791 nodertt => {
1792 status => 200,
1793 },
1794 };
1795
1796 sub master {
1797 my $self = shift;
1798
1799 my $args = {@_};
1800
1801 # have action?
1802 my $action = $args->{action} || croak "need action, available: ",
1803 join(", ",keys %{ $estmaster_rest });
1804
1805 # check if action is valid
1806 my $rest = $estmaster_rest->{$action};
1807 croak "action '$action' is not supported, available actions: ",
1808 join(", ",keys %{ $estmaster_rest }) unless ($rest);
1809
1810 croak "BUG: action '$action' needs return status" unless ($rest->{status});
1811
1812 my @args;
1813
1814 if ($rest->{required} || $rest->{optional}) {
1815
1816 map {
1817 croak "need parametar '$_' for action '$action'" unless ($args->{$_});
1818 push @args, $_ . '=' . uri_escape( $args->{$_} );
1819 } ( @{ $rest->{required} } );
1820
1821 map {
1822 push @args, $_ . '=' . uri_escape( $args->{$_} ) if ($args->{$_});
1823 } ( @{ $rest->{optional} } );
1824
1825 }
1826
1827 my $uri = new URI( $self->{url} );
1828
1829 my $resbody;
1830
1831 my $status = $self->shuttle_url(
1832 'http://' . $uri->host_port . '/master?action=' . $action ,
1833 'application/x-www-form-urlencoded',
1834 join('&', @args),
1835 \$resbody,
1836 1,
1837 ) or confess "shuttle_url failed";
1838
1839 if ($status == $rest->{status}) {
1840
1841 # refresh node info after sync
1842 $self->_set_info if ($action eq 'sync');
1843
1844 if ($rest->{returns} && wantarray) {
1845
1846 my @results;
1847 my $fields = $#{$rest->{returns}};
1848
1849 foreach my $line ( split(/[\r\n]/,$resbody) ) {
1850 my @e = split(/\t/, $line, $fields + 1);
1851 my $row;
1852 foreach my $i ( 0 .. $fields) {
1853 $row->{ $rest->{returns}->[$i] } = $e[ $i ];
1854 }
1855 push @results, $row;
1856 }
1857
1858 return @results;
1859
1860 } elsif ($resbody) {
1861 chomp $resbody;
1862 return $resbody;
1863 } else {
1864 return 0E0;
1865 }
1866 }
1867
1868 carp "expected status $rest->{status}, but got $status";
1869 return undef;
1870 }
1871
1872 =head1 PRIVATE METHODS
1873
1874 You could call those directly, but you don't have to. I hope.
1875
1876 =head2 _set_info
1877
1878 Set information for node
1879
1880 $node->_set_info;
1881
1882 =cut
1883
1884 sub _set_info {
1885 my $self = shift;
1886
1887 $self->{status} = -1;
1888 return unless ($self->{url});
1889
1890 my $resbody;
1891 my $rv = $self->shuttle_url( $self->{url} . '/inform',
1892 'text/plain',
1893 undef,
1894 \$resbody,
1895 );
1896
1897 return if ($rv != 200 || !$resbody);
1898
1899 my @lines = split(/[\r\n]/,$resbody);
1900
1901 $self->{inform} = {};
1902
1903 ( $self->{inform}->{name}, $self->{inform}->{label}, $self->{inform}->{dnum},
1904 $self->{inform}->{wnum}, $self->{inform}->{size} ) = split(/\t/, shift @lines, 5);
1905
1906 return $resbody unless (@lines);
1907
1908 shift @lines;
1909
1910 while(my $admin = shift @lines) {
1911 push @{$self->{inform}->{admins}}, $admin;
1912 }
1913
1914 while(my $guest = shift @lines) {
1915 push @{$self->{inform}->{guests}}, $guest;
1916 }
1917
1918 while(my $link = shift @lines) {
1919 push @{$self->{inform}->{links}}, $link;
1920 }
1921
1922 return $resbody;
1923
1924 }
1925
1926 ###
1927
1928 =head1 EXPORT
1929
1930 Nothing.
1931
1932 =head1 SEE ALSO
1933
1934 L<http://hyperestraier.sourceforge.net/>
1935
1936 Hyper Estraier Ruby interface on which this module is based.
1937
1938 =head1 AUTHOR
1939
1940 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
1941
1942 Robert Klep E<lt>robert@klep.nameE<gt> contributed refactored search code
1943
1944 =head1 COPYRIGHT AND LICENSE
1945
1946 Copyright (C) 2005-2006 by Dobrica Pavlinusic
1947
1948 This library is free software; you can redistribute it and/or modify
1949 it under the GPL v2 or later.
1950
1951 =cut
1952
1953 1;

  ViewVC Help
Powered by ViewVC 1.1.26