/[Search-Estraier]/trunk/Estraier.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/Estraier.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 139 - (show annotations)
Wed May 10 13:45:08 2006 UTC (17 years, 9 months ago) by dpavlin
File size: 34744 byte(s)
added create and label to new Search::Estraier::Node, so that nodes will be automatically
created if needed.
1 package Search::Estraier;
2
3 use 5.008;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.06_1';
8
9 =head1 NAME
10
11 Search::Estraier - pure perl module to use Hyper Estraier search engine
12
13 =head1 SYNOPSIS
14
15 =head2 Simple indexer
16
17 use Search::Estraier;
18
19 # create and configure node
20 my $node = new Search::Estraier::Node(
21 url => 'http://localhost:1978/node/test',
22 user => 'admin',
23 passwd => 'admin',
24 create => 1,
25 label => 'Label for node',
26 croak_on_error => 1,
27 );
28
29 # create document
30 my $doc = new Search::Estraier::Document;
31
32 # add attributes
33 $doc->add_attr('@uri', "http://estraier.gov/example.txt");
34 $doc->add_attr('@title', "Over the Rainbow");
35
36 # add body text to document
37 $doc->add_text("Somewhere over the rainbow. Way up high.");
38 $doc->add_text("There's a land that I heard of once in a lullaby.");
39
40 die "error: ", $node->status,"\n" unless (eval { $node->put_doc($doc) });
41
42 =head2 Simple searcher
43
44 use Search::Estraier;
45
46 # create and configure node
47 my $node = new Search::Estraier::Node(
48 url => 'http://localhost:1978/node/test',
49 user => 'admin',
50 passwd => 'admin',
51 croak_on_error => 1,
52 );
53
54 # create condition
55 my $cond = new Search::Estraier::Condition;
56
57 # set search phrase
58 $cond->set_phrase("rainbow AND lullaby");
59
60 my $nres = $node->search($cond, 0);
61
62 if (defined($nres)) {
63 print "Got ", $nres->hits, " results\n";
64
65 # for each document in results
66 for my $i ( 0 ... $nres->doc_num - 1 ) {
67 # get result document
68 my $rdoc = $nres->get_doc($i);
69 # display attribte
70 print "URI: ", $rdoc->attr('@uri'),"\n";
71 print "Title: ", $rdoc->attr('@title'),"\n";
72 print $rdoc->snippet,"\n";
73 }
74 } else {
75 die "error: ", $node->status,"\n";
76 }
77
78 =head1 DESCRIPTION
79
80 This module is implementation of node API of Hyper Estraier. Since it's
81 perl-only module with dependencies only on standard perl modules, it will
82 run on all platforms on which perl runs. It doesn't require compilation
83 or Hyper Estraier development files on target machine.
84
85 It is implemented as multiple packages which closly resamble Ruby
86 implementation. It also includes methods to manage nodes.
87
88 There are few examples in C<scripts> directory of this distribution.
89
90 =cut
91
92 =head1 Inheritable common methods
93
94 This methods should really move somewhere else.
95
96 =head2 _s
97
98 Remove multiple whitespaces from string, as well as whitespaces at beginning or end
99
100 my $text = $self->_s(" this is a text ");
101 $text = 'this is a text';
102
103 =cut
104
105 sub _s {
106 my $text = $_[1];
107 return unless defined($text);
108 $text =~ s/\s\s+/ /gs;
109 $text =~ s/^\s+//;
110 $text =~ s/\s+$//;
111 return $text;
112 }
113
114 package Search::Estraier::Document;
115
116 use Carp qw/croak confess/;
117
118 use Search::Estraier;
119 our @ISA = qw/Search::Estraier/;
120
121 =head1 Search::Estraier::Document
122
123 This class implements Document which is collection of attributes
124 (key=value), vectors (also key value) display text and hidden text.
125
126
127 =head2 new
128
129 Create new document, empty or from draft.
130
131 my $doc = new Search::HyperEstraier::Document;
132 my $doc2 = new Search::HyperEstraier::Document( $draft );
133
134 =cut
135
136 sub new {
137 my $class = shift;
138 my $self = {};
139 bless($self, $class);
140
141 $self->{id} = -1;
142
143 my $draft = shift;
144
145 if ($draft) {
146 my $in_text = 0;
147 foreach my $line (split(/\n/, $draft)) {
148
149 if ($in_text) {
150 if ($line =~ /^\t/) {
151 push @{ $self->{htexts} }, substr($line, 1);
152 } else {
153 push @{ $self->{dtexts} }, $line;
154 }
155 next;
156 }
157
158 if ($line =~ m/^%VECTOR\t(.+)$/) {
159 my @fields = split(/\t/, $1);
160 for my $i ( 0 .. ($#fields - 1) ) {
161 $self->{kwords}->{ $fields[ $i ] } = $fields[ $i + 1 ];
162 $i++;
163 }
164 next;
165 } elsif ($line =~ m/^%/) {
166 # What is this? comment?
167 #warn "$line\n";
168 next;
169 } elsif ($line =~ m/^$/) {
170 $in_text = 1;
171 next;
172 } elsif ($line =~ m/^(.+)=(.*)$/) {
173 $self->{attrs}->{ $1 } = $2;
174 next;
175 }
176
177 warn "draft ignored: '$line'\n";
178 }
179 }
180
181 $self ? return $self : return undef;
182 }
183
184
185 =head2 add_attr
186
187 Add an attribute.
188
189 $doc->add_attr( name => 'value' );
190
191 Delete attribute using
192
193 $doc->add_attr( name => undef );
194
195 =cut
196
197 sub add_attr {
198 my $self = shift;
199 my $attrs = {@_};
200
201 while (my ($name, $value) = each %{ $attrs }) {
202 if (! defined($value)) {
203 delete( $self->{attrs}->{ $self->_s($name) } );
204 } else {
205 $self->{attrs}->{ $self->_s($name) } = $self->_s($value);
206 }
207 }
208
209 return 1;
210 }
211
212
213 =head2 add_text
214
215 Add a sentence of text.
216
217 $doc->add_text('this is example text to display');
218
219 =cut
220
221 sub add_text {
222 my $self = shift;
223 my $text = shift;
224 return unless defined($text);
225
226 push @{ $self->{dtexts} }, $self->_s($text);
227 }
228
229
230 =head2 add_hidden_text
231
232 Add a hidden sentence.
233
234 $doc->add_hidden_text('this is example text just for search');
235
236 =cut
237
238 sub add_hidden_text {
239 my $self = shift;
240 my $text = shift;
241 return unless defined($text);
242
243 push @{ $self->{htexts} }, $self->_s($text);
244 }
245
246
247 =head2 id
248
249 Get the ID number of document. If the object has never been registred, C<-1> is returned.
250
251 print $doc->id;
252
253 =cut
254
255 sub id {
256 my $self = shift;
257 return $self->{id};
258 }
259
260
261 =head2 attr_names
262
263 Returns array with attribute names from document object.
264
265 my @attrs = $doc->attr_names;
266
267 =cut
268
269 sub attr_names {
270 my $self = shift;
271 return unless ($self->{attrs});
272 #croak "attr_names return array, not scalar" if (! wantarray);
273 return sort keys %{ $self->{attrs} };
274 }
275
276
277 =head2 attr
278
279 Returns value of an attribute.
280
281 my $value = $doc->attr( 'attribute' );
282
283 =cut
284
285 sub attr {
286 my $self = shift;
287 my $name = shift;
288 return unless (defined($name) && $self->{attrs});
289 return $self->{attrs}->{ $name };
290 }
291
292
293 =head2 texts
294
295 Returns array with text sentences.
296
297 my @texts = $doc->texts;
298
299 =cut
300
301 sub texts {
302 my $self = shift;
303 #confess "texts return array, not scalar" if (! wantarray);
304 return @{ $self->{dtexts} } if ($self->{dtexts});
305 }
306
307
308 =head2 cat_texts
309
310 Return whole text as single scalar.
311
312 my $text = $doc->cat_texts;
313
314 =cut
315
316 sub cat_texts {
317 my $self = shift;
318 return join(' ',@{ $self->{dtexts} }) if ($self->{dtexts});
319 }
320
321
322 =head2 dump_draft
323
324 Dump draft data from document object.
325
326 print $doc->dump_draft;
327
328 =cut
329
330 sub dump_draft {
331 my $self = shift;
332 my $draft;
333
334 foreach my $attr_name (sort keys %{ $self->{attrs} }) {
335 next unless defined(my $v = $self->{attrs}->{$attr_name});
336 $draft .= $attr_name . '=' . $v . "\n";
337 }
338
339 if ($self->{kwords}) {
340 $draft .= '%%VECTOR';
341 while (my ($key, $value) = each %{ $self->{kwords} }) {
342 $draft .= "\t$key\t$value";
343 }
344 $draft .= "\n";
345 }
346
347 $draft .= "\n";
348
349 $draft .= join("\n", @{ $self->{dtexts} }) . "\n" if ($self->{dtexts});
350 $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n" if ($self->{htexts});
351
352 return $draft;
353 }
354
355
356 =head2 delete
357
358 Empty document object
359
360 $doc->delete;
361
362 This function is addition to original Ruby API, and since it was included in C wrappers it's here as a
363 convinience. Document objects which go out of scope will be destroyed
364 automatically.
365
366 =cut
367
368 sub delete {
369 my $self = shift;
370
371 foreach my $data (qw/attrs dtexts stexts kwords/) {
372 delete($self->{$data});
373 }
374
375 $self->{id} = -1;
376
377 return 1;
378 }
379
380
381
382 package Search::Estraier::Condition;
383
384 use Carp qw/carp confess croak/;
385
386 use Search::Estraier;
387 our @ISA = qw/Search::Estraier/;
388
389 =head1 Search::Estraier::Condition
390
391 =head2 new
392
393 my $cond = new Search::HyperEstraier::Condition;
394
395 =cut
396
397 sub new {
398 my $class = shift;
399 my $self = {};
400 bless($self, $class);
401
402 $self->{max} = -1;
403 $self->{options} = 0;
404
405 $self ? return $self : return undef;
406 }
407
408
409 =head2 set_phrase
410
411 $cond->set_phrase('search phrase');
412
413 =cut
414
415 sub set_phrase {
416 my $self = shift;
417 $self->{phrase} = $self->_s( shift );
418 }
419
420
421 =head2 add_attr
422
423 $cond->add_attr('@URI STRINC /~dpavlin/');
424
425 =cut
426
427 sub add_attr {
428 my $self = shift;
429 my $attr = shift || return;
430 push @{ $self->{attrs} }, $self->_s( $attr );
431 }
432
433
434 =head2 set_order
435
436 $cond->set_order('@mdate NUMD');
437
438 =cut
439
440 sub set_order {
441 my $self = shift;
442 $self->{order} = shift;
443 }
444
445
446 =head2 set_max
447
448 $cond->set_max(42);
449
450 =cut
451
452 sub set_max {
453 my $self = shift;
454 my $max = shift;
455 croak "set_max needs number, not '$max'" unless ($max =~ m/^\d+$/);
456 $self->{max} = $max;
457 }
458
459
460 =head2 set_options
461
462 $cond->set_options( 'SURE' );
463
464 $cond->set_options( qw/AGITO NOIDF SIMPLE/ );
465
466 Possible options are:
467
468 =over 8
469
470 =item SURE
471
472 check every N-gram
473
474 =item USUAL
475
476 check every second N-gram
477
478 =item FAST
479
480 check every third N-gram
481
482 =item AGITO
483
484 check every fourth N-gram
485
486 =item NOIDF
487
488 don't perform TF-IDF tuning
489
490 =item SIMPLE
491
492 use simplified query phrase
493
494 =back
495
496 Skipping N-grams will speed up search, but reduce accuracy. Every call to C<set_options> will reset previous
497 options;
498
499 This option changed in version C<0.04> of this module. It's backwards compatibile.
500
501 =cut
502
503 my $options = {
504 SURE => 1 << 0,
505 USUAL => 1 << 1,
506 FAST => 1 << 2,
507 AGITO => 1 << 3,
508 NOIDF => 1 << 4,
509 SIMPLE => 1 << 10,
510 };
511
512 sub set_options {
513 my $self = shift;
514 my $opt = 0;
515 foreach my $option (@_) {
516 my $mask;
517 unless ($mask = $options->{$option}) {
518 if ($option eq '1') {
519 next;
520 } else {
521 croak "unknown option $option";
522 }
523 }
524 $opt += $mask;
525 }
526 $self->{options} = $opt;
527 }
528
529
530 =head2 phrase
531
532 Return search phrase.
533
534 print $cond->phrase;
535
536 =cut
537
538 sub phrase {
539 my $self = shift;
540 return $self->{phrase};
541 }
542
543
544 =head2 order
545
546 Return search result order.
547
548 print $cond->order;
549
550 =cut
551
552 sub order {
553 my $self = shift;
554 return $self->{order};
555 }
556
557
558 =head2 attrs
559
560 Return search result attrs.
561
562 my @cond_attrs = $cond->attrs;
563
564 =cut
565
566 sub attrs {
567 my $self = shift;
568 #croak "attrs return array, not scalar" if (! wantarray);
569 return @{ $self->{attrs} } if ($self->{attrs});
570 }
571
572
573 =head2 max
574
575 Return maximum number of results.
576
577 print $cond->max;
578
579 C<-1> is returned for unitialized value, C<0> is unlimited.
580
581 =cut
582
583 sub max {
584 my $self = shift;
585 return $self->{max};
586 }
587
588
589 =head2 options
590
591 Return options for this condition.
592
593 print $cond->options;
594
595 Options are returned in numerical form.
596
597 =cut
598
599 sub options {
600 my $self = shift;
601 return $self->{options};
602 }
603
604
605 =head2 set_skip
606
607 Set number of skipped documents from beginning of results
608
609 $cond->set_skip(42);
610
611 Similar to C<offset> in RDBMS.
612
613 =cut
614
615 sub set_skip {
616 my $self = shift;
617 $self->{skip} = shift;
618 }
619
620 =head2 skip
621
622 Return skip for this condition.
623
624 print $cond->skip;
625
626 =cut
627
628 sub skip {
629 my $self = shift;
630 return $self->{skip};
631 }
632
633
634 package Search::Estraier::ResultDocument;
635
636 use Carp qw/croak/;
637
638 #use Search::Estraier;
639 #our @ISA = qw/Search::Estraier/;
640
641 =head1 Search::Estraier::ResultDocument
642
643 =head2 new
644
645 my $rdoc = new Search::HyperEstraier::ResultDocument(
646 uri => 'http://localhost/document/uri/42',
647 attrs => {
648 foo => 1,
649 bar => 2,
650 },
651 snippet => 'this is a text of snippet'
652 keywords => 'this\tare\tkeywords'
653 );
654
655 =cut
656
657 sub new {
658 my $class = shift;
659 my $self = {@_};
660 bless($self, $class);
661
662 croak "missing uri for ResultDocument" unless defined($self->{uri});
663
664 $self ? return $self : return undef;
665 }
666
667
668 =head2 uri
669
670 Return URI of result document
671
672 print $rdoc->uri;
673
674 =cut
675
676 sub uri {
677 my $self = shift;
678 return $self->{uri};
679 }
680
681
682 =head2 attr_names
683
684 Returns array with attribute names from result document object.
685
686 my @attrs = $rdoc->attr_names;
687
688 =cut
689
690 sub attr_names {
691 my $self = shift;
692 croak "attr_names return array, not scalar" if (! wantarray);
693 return sort keys %{ $self->{attrs} };
694 }
695
696
697 =head2 attr
698
699 Returns value of an attribute.
700
701 my $value = $rdoc->attr( 'attribute' );
702
703 =cut
704
705 sub attr {
706 my $self = shift;
707 my $name = shift || return;
708 return $self->{attrs}->{ $name };
709 }
710
711
712 =head2 snippet
713
714 Return snippet from result document
715
716 print $rdoc->snippet;
717
718 =cut
719
720 sub snippet {
721 my $self = shift;
722 return $self->{snippet};
723 }
724
725
726 =head2 keywords
727
728 Return keywords from result document
729
730 print $rdoc->keywords;
731
732 =cut
733
734 sub keywords {
735 my $self = shift;
736 return $self->{keywords};
737 }
738
739
740 package Search::Estraier::NodeResult;
741
742 use Carp qw/croak/;
743
744 #use Search::Estraier;
745 #our @ISA = qw/Search::Estraier/;
746
747 =head1 Search::Estraier::NodeResult
748
749 =head2 new
750
751 my $res = new Search::HyperEstraier::NodeResult(
752 docs => @array_of_rdocs,
753 hits => %hash_with_hints,
754 );
755
756 =cut
757
758 sub new {
759 my $class = shift;
760 my $self = {@_};
761 bless($self, $class);
762
763 foreach my $f (qw/docs hints/) {
764 croak "missing $f for ResultDocument" unless defined($self->{$f});
765 }
766
767 $self ? return $self : return undef;
768 }
769
770
771 =head2 doc_num
772
773 Return number of documents
774
775 print $res->doc_num;
776
777 This will return real number of documents (limited by C<max>).
778 If you want to get total number of hits, see C<hits>.
779
780 =cut
781
782 sub doc_num {
783 my $self = shift;
784 return $#{$self->{docs}} + 1;
785 }
786
787
788 =head2 get_doc
789
790 Return single document
791
792 my $doc = $res->get_doc( 42 );
793
794 Returns undef if document doesn't exist.
795
796 =cut
797
798 sub get_doc {
799 my $self = shift;
800 my $num = shift;
801 croak "expect number as argument, not '$num'" unless ($num =~ m/^\d+$/);
802 return undef if ($num < 0 || $num > $self->{docs});
803 return $self->{docs}->[$num];
804 }
805
806
807 =head2 hint
808
809 Return specific hint from results.
810
811 print $res->hint( 'VERSION' );
812
813 Possible hints are: C<VERSION>, C<NODE>, C<HIT>, C<HINT#n>, C<DOCNUM>, C<WORDNUM>,
814 C<TIME>, C<LINK#n>, C<VIEW>.
815
816 =cut
817
818 sub hint {
819 my $self = shift;
820 my $key = shift || return;
821 return $self->{hints}->{$key};
822 }
823
824 =head2 hints
825
826 More perlish version of C<hint>. This one returns hash.
827
828 my %hints = $res->hints;
829
830 =cut
831
832 sub hints {
833 my $self = shift;
834 return $self->{hints};
835 }
836
837 =head2 hits
838
839 Syntaxtic sugar for total number of hits for this query
840
841 print $res->hits;
842
843 It's same as
844
845 print $res->hint('HIT');
846
847 but shorter.
848
849 =cut
850
851 sub hits {
852 my $self = shift;
853 return $self->{hints}->{'HIT'} || 0;
854 }
855
856 package Search::Estraier::Node;
857
858 use Carp qw/carp croak confess/;
859 use URI;
860 use MIME::Base64;
861 use IO::Socket::INET;
862 use URI::Escape qw/uri_escape/;
863
864 =head1 Search::Estraier::Node
865
866 =head2 new
867
868 my $node = new Search::HyperEstraier::Node;
869
870 or optionally with C<url> as parametar
871
872 my $node = new Search::HyperEstraier::Node( 'http://localhost:1978/node/test' );
873
874 or in more verbose form
875
876 my $node = new Search::HyperEstraier::Node(
877 url => 'http://localhost:1978/node/test',
878 user => 'admin',
879 passwd => 'admin'
880 create => 1,
881 label => 'optional node label',
882 debug => 1,
883 croak_on_error => 1
884 );
885
886 with following arguments:
887
888 =over 4
889
890 =item url
891
892 URL to node
893
894 =item user
895
896 specify username for node server authentication
897
898 =item passwd
899
900 password for authentication
901
902 =item create
903
904 create node if it doesn't exists
905
906 =item label
907
908 optional label for new node if C<create> is used
909
910 =item debug
911
912 dumps a B<lot> of debugging output
913
914 =item croak_on_error
915
916 very helpful during development. It will croak on all errors instead of
917 silently returning C<-1> (which is convention of Hyper Estraier API in other
918 languages).
919
920 =back
921
922 =cut
923
924 sub new {
925 my $class = shift;
926 my $self = {
927 pxport => -1,
928 timeout => 0, # this used to be -1
929 wwidth => 480,
930 hwidth => 96,
931 awidth => 96,
932 status => -1,
933 };
934
935 bless($self, $class);
936
937 if ($#_ == 0) {
938 $self->{url} = shift;
939 } else {
940 %$self = ( %$self, @_ );
941
942 $self->set_auth( $self->{user}, $self->{passwd} ) if ($self->{user});
943
944 warn "## Node debug on\n" if ($self->{debug});
945 }
946
947 $self->{inform} = {
948 dnum => -1,
949 wnum => -1,
950 size => -1.0,
951 };
952
953 if ($self->{create}) {
954 eval {
955 $self->name;
956 };
957 if ($@) {
958 my $name = $1 if ($self->{url} =~ m#/node/([^/]+)/*#);
959 croak "can't find node name in '$self->{url}'" unless ($name);
960 my $label = $self->{label} || $name;
961 $self->master(
962 action => 'nodeadd',
963 name => $name,
964 label => $label,
965 ) || croak "can't create node $name ($label)";
966 }
967 }
968
969 $self ? return $self : return undef;
970 }
971
972
973 =head2 set_url
974
975 Specify URL to node server
976
977 $node->set_url('http://localhost:1978');
978
979 =cut
980
981 sub set_url {
982 my $self = shift;
983 $self->{url} = shift;
984 }
985
986
987 =head2 set_proxy
988
989 Specify proxy server to connect to node server
990
991 $node->set_proxy('proxy.example.com', 8080);
992
993 =cut
994
995 sub set_proxy {
996 my $self = shift;
997 my ($host,$port) = @_;
998 croak "proxy port must be number, not '$port'" unless ($port =~ m/^\d+$/);
999 $self->{pxhost} = $host;
1000 $self->{pxport} = $port;
1001 }
1002
1003
1004 =head2 set_timeout
1005
1006 Specify timeout of connection in seconds
1007
1008 $node->set_timeout( 15 );
1009
1010 =cut
1011
1012 sub set_timeout {
1013 my $self = shift;
1014 my $sec = shift;
1015 croak "timeout must be number, not '$sec'" unless ($sec =~ m/^\d+$/);
1016 $self->{timeout} = $sec;
1017 }
1018
1019
1020 =head2 set_auth
1021
1022 Specify name and password for authentication to node server.
1023
1024 $node->set_auth('clint','eastwood');
1025
1026 =cut
1027
1028 sub set_auth {
1029 my $self = shift;
1030 my ($login,$passwd) = @_;
1031 my $basic_auth = encode_base64( "$login:$passwd" );
1032 chomp($basic_auth);
1033 $self->{auth} = $basic_auth;
1034 }
1035
1036
1037 =head2 status
1038
1039 Return status code of last request.
1040
1041 print $node->status;
1042
1043 C<-1> means connection failure.
1044
1045 =cut
1046
1047 sub status {
1048 my $self = shift;
1049 return $self->{status};
1050 }
1051
1052
1053 =head2 put_doc
1054
1055 Add a document
1056
1057 $node->put_doc( $document_draft ) or die "can't add document";
1058
1059 Return true on success or false on failture.
1060
1061 =cut
1062
1063 sub put_doc {
1064 my $self = shift;
1065 my $doc = shift || return;
1066 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
1067 $self->shuttle_url( $self->{url} . '/put_doc',
1068 'text/x-estraier-draft',
1069 $doc->dump_draft,
1070 undef
1071 ) == 200;
1072 }
1073
1074
1075 =head2 out_doc
1076
1077 Remove a document
1078
1079 $node->out_doc( document_id ) or "can't remove document";
1080
1081 Return true on success or false on failture.
1082
1083 =cut
1084
1085 sub out_doc {
1086 my $self = shift;
1087 my $id = shift || return;
1088 return unless ($self->{url});
1089 croak "id must be number, not '$id'" unless ($id =~ m/^\d+$/);
1090 $self->shuttle_url( $self->{url} . '/out_doc',
1091 'application/x-www-form-urlencoded',
1092 "id=$id",
1093 undef
1094 ) == 200;
1095 }
1096
1097
1098 =head2 out_doc_by_uri
1099
1100 Remove a registrated document using it's uri
1101
1102 $node->out_doc_by_uri( 'file:///document/uri/42' ) or "can't remove document";
1103
1104 Return true on success or false on failture.
1105
1106 =cut
1107
1108 sub out_doc_by_uri {
1109 my $self = shift;
1110 my $uri = shift || return;
1111 return unless ($self->{url});
1112 $self->shuttle_url( $self->{url} . '/out_doc',
1113 'application/x-www-form-urlencoded',
1114 "uri=" . uri_escape($uri),
1115 undef
1116 ) == 200;
1117 }
1118
1119
1120 =head2 edit_doc
1121
1122 Edit attributes of a document
1123
1124 $node->edit_doc( $document_draft ) or die "can't edit document";
1125
1126 Return true on success or false on failture.
1127
1128 =cut
1129
1130 sub edit_doc {
1131 my $self = shift;
1132 my $doc = shift || return;
1133 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
1134 $self->shuttle_url( $self->{url} . '/edit_doc',
1135 'text/x-estraier-draft',
1136 $doc->dump_draft,
1137 undef
1138 ) == 200;
1139 }
1140
1141
1142 =head2 get_doc
1143
1144 Retreive document
1145
1146 my $doc = $node->get_doc( document_id ) or die "can't get document";
1147
1148 Return true on success or false on failture.
1149
1150 =cut
1151
1152 sub get_doc {
1153 my $self = shift;
1154 my $id = shift || return;
1155 return $self->_fetch_doc( id => $id );
1156 }
1157
1158
1159 =head2 get_doc_by_uri
1160
1161 Retreive document
1162
1163 my $doc = $node->get_doc_by_uri( 'file:///document/uri/42' ) or die "can't get document";
1164
1165 Return true on success or false on failture.
1166
1167 =cut
1168
1169 sub get_doc_by_uri {
1170 my $self = shift;
1171 my $uri = shift || return;
1172 return $self->_fetch_doc( uri => $uri );
1173 }
1174
1175
1176 =head2 get_doc_attr
1177
1178 Retrieve the value of an atribute from object
1179
1180 my $val = $node->get_doc_attr( document_id, 'attribute_name' ) or
1181 die "can't get document attribute";
1182
1183 =cut
1184
1185 sub get_doc_attr {
1186 my $self = shift;
1187 my ($id,$name) = @_;
1188 return unless ($id && $name);
1189 return $self->_fetch_doc( id => $id, attr => $name );
1190 }
1191
1192
1193 =head2 get_doc_attr_by_uri
1194
1195 Retrieve the value of an atribute from object
1196
1197 my $val = $node->get_doc_attr_by_uri( document_id, 'attribute_name' ) or
1198 die "can't get document attribute";
1199
1200 =cut
1201
1202 sub get_doc_attr_by_uri {
1203 my $self = shift;
1204 my ($uri,$name) = @_;
1205 return unless ($uri && $name);
1206 return $self->_fetch_doc( uri => $uri, attr => $name );
1207 }
1208
1209
1210 =head2 etch_doc
1211
1212 Exctract document keywords
1213
1214 my $keywords = $node->etch_doc( document_id ) or die "can't etch document";
1215
1216 =cut
1217
1218 sub etch_doc {
1219 my $self = shift;
1220 my $id = shift || return;
1221 return $self->_fetch_doc( id => $id, etch => 1 );
1222 }
1223
1224 =head2 etch_doc_by_uri
1225
1226 Retreive document
1227
1228 my $keywords = $node->etch_doc_by_uri( 'file:///document/uri/42' ) or die "can't etch document";
1229
1230 Return true on success or false on failture.
1231
1232 =cut
1233
1234 sub etch_doc_by_uri {
1235 my $self = shift;
1236 my $uri = shift || return;
1237 return $self->_fetch_doc( uri => $uri, etch => 1 );
1238 }
1239
1240
1241 =head2 uri_to_id
1242
1243 Get ID of document specified by URI
1244
1245 my $id = $node->uri_to_id( 'file:///document/uri/42' );
1246
1247 This method won't croak, even if using C<croak_on_error>.
1248
1249 =cut
1250
1251 sub uri_to_id {
1252 my $self = shift;
1253 my $uri = shift || return;
1254 return $self->_fetch_doc( uri => $uri, path => '/uri_to_id', chomp_resbody => 1, croak_on_error => 0 );
1255 }
1256
1257
1258 =head2 _fetch_doc
1259
1260 Private function used for implementing of C<get_doc>, C<get_doc_by_uri>,
1261 C<etch_doc>, C<etch_doc_by_uri>.
1262
1263 # this will decode received draft into Search::Estraier::Document object
1264 my $doc = $node->_fetch_doc( id => 42 );
1265 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42' );
1266
1267 # to extract keywords, add etch
1268 my $doc = $node->_fetch_doc( id => 42, etch => 1 );
1269 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', etch => 1 );
1270
1271 # to get document attrubute add attr
1272 my $doc = $node->_fetch_doc( id => 42, attr => '@mdate' );
1273 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', attr => '@mdate' );
1274
1275 # more general form which allows implementation of
1276 # uri_to_id
1277 my $id = $node->_fetch_doc(
1278 uri => 'file:///document/uri/42',
1279 path => '/uri_to_id',
1280 chomp_resbody => 1
1281 );
1282
1283 =cut
1284
1285 sub _fetch_doc {
1286 my $self = shift;
1287 my $a = {@_};
1288 return unless ( ($a->{id} || $a->{uri}) && $self->{url} );
1289
1290 my ($arg, $resbody);
1291
1292 my $path = $a->{path} || '/get_doc';
1293 $path = '/etch_doc' if ($a->{etch});
1294
1295 if ($a->{id}) {
1296 croak "id must be numberm not '$a->{id}'" unless ($a->{id} =~ m/^\d+$/);
1297 $arg = 'id=' . $a->{id};
1298 } elsif ($a->{uri}) {
1299 $arg = 'uri=' . uri_escape($a->{uri});
1300 } else {
1301 confess "unhandled argument. Need id or uri.";
1302 }
1303
1304 if ($a->{attr}) {
1305 $path = '/get_doc_attr';
1306 $arg .= '&attr=' . uri_escape($a->{attr});
1307 $a->{chomp_resbody} = 1;
1308 }
1309
1310 my $rv = $self->shuttle_url( $self->{url} . $path,
1311 'application/x-www-form-urlencoded',
1312 $arg,
1313 \$resbody,
1314 $a->{croak_on_error},
1315 );
1316
1317 return if ($rv != 200);
1318
1319 if ($a->{etch}) {
1320 $self->{kwords} = {};
1321 return +{} unless ($resbody);
1322 foreach my $l (split(/\n/, $resbody)) {
1323 my ($k,$v) = split(/\t/, $l, 2);
1324 $self->{kwords}->{$k} = $v if ($v);
1325 }
1326 return $self->{kwords};
1327 } elsif ($a->{chomp_resbody}) {
1328 return unless (defined($resbody));
1329 chomp($resbody);
1330 return $resbody;
1331 } else {
1332 return new Search::Estraier::Document($resbody);
1333 }
1334 }
1335
1336
1337 =head2 name
1338
1339 my $node_name = $node->name;
1340
1341 =cut
1342
1343 sub name {
1344 my $self = shift;
1345 $self->_set_info unless ($self->{inform}->{name});
1346 return $self->{inform}->{name};
1347 }
1348
1349
1350 =head2 label
1351
1352 my $node_label = $node->label;
1353
1354 =cut
1355
1356 sub label {
1357 my $self = shift;
1358 $self->_set_info unless ($self->{inform}->{label});
1359 return $self->{inform}->{label};
1360 }
1361
1362
1363 =head2 doc_num
1364
1365 my $documents_in_node = $node->doc_num;
1366
1367 =cut
1368
1369 sub doc_num {
1370 my $self = shift;
1371 $self->_set_info if ($self->{inform}->{dnum} < 0);
1372 return $self->{inform}->{dnum};
1373 }
1374
1375
1376 =head2 word_num
1377
1378 my $words_in_node = $node->word_num;
1379
1380 =cut
1381
1382 sub word_num {
1383 my $self = shift;
1384 $self->_set_info if ($self->{inform}->{wnum} < 0);
1385 return $self->{inform}->{wnum};
1386 }
1387
1388
1389 =head2 size
1390
1391 my $node_size = $node->size;
1392
1393 =cut
1394
1395 sub size {
1396 my $self = shift;
1397 $self->_set_info if ($self->{inform}->{size} < 0);
1398 return $self->{inform}->{size};
1399 }
1400
1401
1402 =head2 search
1403
1404 Search documents which match condition
1405
1406 my $nres = $node->search( $cond, $depth );
1407
1408 C<$cond> is C<Search::Estraier::Condition> object, while <$depth> specifies
1409 depth for meta search.
1410
1411 Function results C<Search::Estraier::NodeResult> object.
1412
1413 =cut
1414
1415 sub search {
1416 my $self = shift;
1417 my ($cond, $depth) = @_;
1418 return unless ($cond && defined($depth) && $self->{url});
1419 croak "cond mush be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1420 croak "depth needs number, not '$depth'" unless ($depth =~ m/^\d+$/);
1421
1422 my $resbody;
1423
1424 my $rv = $self->shuttle_url( $self->{url} . '/search',
1425 'application/x-www-form-urlencoded',
1426 $self->cond_to_query( $cond, $depth ),
1427 \$resbody,
1428 );
1429 return if ($rv != 200);
1430
1431 my @records = split /--------\[.*?\]--------(?::END)?\r?\n/, $resbody;
1432 my $hintsText = splice @records, 0, 2; # starts with empty record
1433 my $hints = { $hintsText =~ m/^(.*?)\t(.*?)$/gsm };
1434
1435 # process records
1436 my $docs = [];
1437 foreach my $record (@records)
1438 {
1439 # split into keys and snippets
1440 my ($keys, $snippet) = $record =~ m/^(.*?)\n\n(.*?)$/s;
1441
1442 # create document hash
1443 my $doc = { $keys =~ m/^(.*?)=(.*?)$/gsm };
1444 $doc->{'@keywords'} = $doc->{keywords};
1445 ($doc->{keywords}) = $keys =~ m/^%VECTOR\t(.*?)$/gm;
1446 $doc->{snippet} = $snippet;
1447
1448 push @$docs, new Search::Estraier::ResultDocument(
1449 attrs => $doc,
1450 uri => $doc->{'@uri'},
1451 snippet => $snippet,
1452 keywords => $doc->{'keywords'},
1453 );
1454 }
1455
1456 return new Search::Estraier::NodeResult( docs => $docs, hints => $hints );
1457 }
1458
1459
1460 =head2 cond_to_query
1461
1462 Return URI encoded string generated from Search::Estraier::Condition
1463
1464 my $args = $node->cond_to_query( $cond, $depth );
1465
1466 =cut
1467
1468 sub cond_to_query {
1469 my $self = shift;
1470
1471 my $cond = shift || return;
1472 croak "condition must be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1473 my $depth = shift;
1474
1475 my @args;
1476
1477 if (my $phrase = $cond->phrase) {
1478 push @args, 'phrase=' . uri_escape($phrase);
1479 }
1480
1481 if (my @attrs = $cond->attrs) {
1482 for my $i ( 0 .. $#attrs ) {
1483 push @args,'attr' . ($i+1) . '=' . uri_escape( $attrs[$i] ) if ($attrs[$i]);
1484 }
1485 }
1486
1487 if (my $order = $cond->order) {
1488 push @args, 'order=' . uri_escape($order);
1489 }
1490
1491 if (my $max = $cond->max) {
1492 push @args, 'max=' . $max;
1493 } else {
1494 push @args, 'max=' . (1 << 30);
1495 }
1496
1497 if (my $options = $cond->options) {
1498 push @args, 'options=' . $options;
1499 }
1500
1501 push @args, 'depth=' . $depth if ($depth);
1502 push @args, 'wwidth=' . $self->{wwidth};
1503 push @args, 'hwidth=' . $self->{hwidth};
1504 push @args, 'awidth=' . $self->{awidth};
1505 push @args, 'skip=' . $self->{skip} if ($self->{skip});
1506
1507 return join('&', @args);
1508 }
1509
1510
1511 =head2 shuttle_url
1512
1513 This is method which uses C<LWP::UserAgent> to communicate with Hyper Estraier node
1514 master.
1515
1516 my $rv = shuttle_url( $url, $content_type, $req_body, \$resbody );
1517
1518 C<$resheads> and C<$resbody> booleans controll if response headers and/or response
1519 body will be saved within object.
1520
1521 =cut
1522
1523 use LWP::UserAgent;
1524
1525 sub shuttle_url {
1526 my $self = shift;
1527
1528 my ($url, $content_type, $reqbody, $resbody, $croak_on_error) = @_;
1529
1530 $croak_on_error = $self->{croak_on_error} unless defined($croak_on_error);
1531
1532 $self->{status} = -1;
1533
1534 warn "## $url\n" if ($self->{debug});
1535
1536 $url = new URI($url);
1537 if (
1538 !$url || !$url->scheme || !$url->scheme eq 'http' ||
1539 !$url->host || !$url->port || $url->port < 1
1540 ) {
1541 carp "can't parse $url\n";
1542 return -1;
1543 }
1544
1545 my $ua = LWP::UserAgent->new;
1546 $ua->agent( "Search-Estraier/$Search::Estraier::VERSION" );
1547
1548 my $req;
1549 if ($reqbody) {
1550 $req = HTTP::Request->new(POST => $url);
1551 } else {
1552 $req = HTTP::Request->new(GET => $url);
1553 }
1554
1555 $req->headers->header( 'Host' => $url->host . ":" . $url->port );
1556 $req->headers->header( 'Connection', 'close' );
1557 $req->headers->header( 'Authorization', 'Basic ' . $self->{auth} ) if ($self->{auth});
1558 $req->content_type( $content_type );
1559
1560 warn $req->headers->as_string,"\n" if ($self->{debug});
1561
1562 if ($reqbody) {
1563 warn "$reqbody\n" if ($self->{debug});
1564 $req->content( $reqbody );
1565 }
1566
1567 my $res = $ua->request($req) || croak "can't make request to $url: $!";
1568
1569 warn "## response status: ",$res->status_line,"\n" if ($self->{debug});
1570
1571 ($self->{status}, $self->{status_message}) = split(/\s+/, $res->status_line, 2);
1572
1573 if (! $res->is_success) {
1574 if ($croak_on_error) {
1575 croak("can't get $url: ",$res->status_line);
1576 } else {
1577 return -1;
1578 }
1579 }
1580
1581 $$resbody .= $res->content;
1582
1583 warn "## response body:\n$$resbody\n" if ($resbody && $self->{debug});
1584
1585 return $self->{status};
1586 }
1587
1588
1589 =head2 set_snippet_width
1590
1591 Set width of snippets in results
1592
1593 $node->set_snippet_width( $wwidth, $hwidth, $awidth );
1594
1595 C<$wwidth> specifies whole width of snippet. It's C<480> by default. If it's C<0> snippet
1596 is not sent with results. If it is negative, whole document text is sent instead of snippet.
1597
1598 C<$hwidth> specified width of strings from beginning of string. Default
1599 value is C<96>. Negative or zero value keep previous value.
1600
1601 C<$awidth> specifies width of strings around each highlighted word. It's C<96> by default.
1602 If negative of zero value is provided previous value is kept unchanged.
1603
1604 =cut
1605
1606 sub set_snippet_width {
1607 my $self = shift;
1608
1609 my ($wwidth, $hwidth, $awidth) = @_;
1610 $self->{wwidth} = $wwidth;
1611 $self->{hwidth} = $hwidth if ($hwidth >= 0);
1612 $self->{awidth} = $awidth if ($awidth >= 0);
1613 }
1614
1615
1616 =head2 set_user
1617
1618 Manage users of node
1619
1620 $node->set_user( 'name', $mode );
1621
1622 C<$mode> can be one of:
1623
1624 =over 4
1625
1626 =item 0
1627
1628 delete account
1629
1630 =item 1
1631
1632 set administrative right for user
1633
1634 =item 2
1635
1636 set user account as guest
1637
1638 =back
1639
1640 Return true on success, otherwise false.
1641
1642 =cut
1643
1644 sub set_user {
1645 my $self = shift;
1646 my ($name, $mode) = @_;
1647
1648 return unless ($self->{url});
1649 croak "mode must be number, not '$mode'" unless ($mode =~ m/^\d+$/);
1650
1651 $self->shuttle_url( $self->{url} . '/_set_user',
1652 'text/plain',
1653 'name=' . uri_escape($name) . '&mode=' . $mode,
1654 undef
1655 ) == 200;
1656 }
1657
1658
1659 =head2 set_link
1660
1661 Manage node links
1662
1663 $node->set_link('http://localhost:1978/node/another', 'another node label', $credit);
1664
1665 If C<$credit> is negative, link is removed.
1666
1667 =cut
1668
1669 sub set_link {
1670 my $self = shift;
1671 my ($url, $label, $credit) = @_;
1672
1673 return unless ($self->{url});
1674 croak "mode credit be number, not '$credit'" unless ($credit =~ m/^\d+$/);
1675
1676 my $reqbody = 'url=' . uri_escape($url) . '&label=' . uri_escape($label);
1677 $reqbody .= '&credit=' . $credit if ($credit > 0);
1678
1679 if ($self->shuttle_url( $self->{url} . '/_set_link',
1680 'application/x-www-form-urlencoded',
1681 $reqbody,
1682 undef
1683 ) == 200) {
1684 # refresh node info after adding link
1685 $self->_set_info;
1686 return 1;
1687 }
1688 }
1689
1690 =head2 admins
1691
1692 my @admins = @{ $node->admins };
1693
1694 Return array of users with admin rights on node
1695
1696 =cut
1697
1698 sub admins {
1699 my $self = shift;
1700 $self->_set_info unless ($self->{inform}->{name});
1701 return $self->{inform}->{admins};
1702 }
1703
1704 =head2 guests
1705
1706 my @guests = @{ $node->guests };
1707
1708 Return array of users with guest rights on node
1709
1710 =cut
1711
1712 sub guests {
1713 my $self = shift;
1714 $self->_set_info unless ($self->{inform}->{name});
1715 return $self->{inform}->{guests};
1716 }
1717
1718 =head2 links
1719
1720 my $links = @{ $node->links };
1721
1722 Return array of links for this node
1723
1724 =cut
1725
1726 sub links {
1727 my $self = shift;
1728 $self->_set_info unless ($self->{inform}->{name});
1729 return $self->{inform}->{links};
1730 }
1731
1732 =head2 master
1733
1734 Set actions on Hyper Estraier node master (C<estmaster> process)
1735
1736 $node->master(
1737 action => 'sync'
1738 );
1739
1740 All available actions are documented in
1741 L<http://hyperestraier.sourceforge.net/nguide-en.html#protocol>
1742
1743 =cut
1744
1745 my $estmaster_rest = {
1746 shutdown => {
1747 status => 202,
1748 },
1749 sync => {
1750 status => 202,
1751 },
1752 backup => {
1753 status => 202,
1754 },
1755 userlist => {
1756 status => 200,
1757 returns => [ qw/name passwd flags fname misc/ ],
1758 },
1759 useradd => {
1760 required => [ qw/name passwd flags/ ],
1761 optional => [ qw/fname misc/ ],
1762 status => 200,
1763 },
1764 userdel => {
1765 required => [ qw/name/ ],
1766 status => 200,
1767 },
1768 nodelist => {
1769 status => 200,
1770 returns => [ qw/name label doc_num word_num size/ ],
1771 },
1772 nodeadd => {
1773 required => [ qw/name/ ],
1774 optional => [ qw/label/ ],
1775 status => 200,
1776 },
1777 nodedel => {
1778 required => [ qw/name/ ],
1779 status => 200,
1780 },
1781 nodeclr => {
1782 required => [ qw/name/ ],
1783 status => 200,
1784 },
1785 nodertt => {
1786 status => 200,
1787 },
1788 };
1789
1790 sub master {
1791 my $self = shift;
1792
1793 my $args = {@_};
1794
1795 # have action?
1796 my $action = $args->{action} || croak "need action, available: ",
1797 join(", ",keys %{ $estmaster_rest });
1798
1799 # check if action is valid
1800 my $rest = $estmaster_rest->{$action};
1801 croak "action '$action' is not supported, available actions: ",
1802 join(", ",keys %{ $estmaster_rest }) unless ($rest);
1803
1804 croak "BUG: action '$action' needs return status" unless ($rest->{status});
1805
1806 my @args;
1807
1808 if ($rest->{required} || $rest->{optional}) {
1809
1810 map {
1811 croak "need parametar '$_' for action '$action'" unless ($args->{$_});
1812 push @args, $_ . '=' . uri_escape( $args->{$_} );
1813 } ( @{ $rest->{required} } );
1814
1815 map {
1816 push @args, $_ . '=' . uri_escape( $args->{$_} ) if ($args->{$_});
1817 } ( @{ $rest->{optional} } );
1818
1819 }
1820
1821 my $uri = new URI( $self->{url} );
1822
1823 my $resbody;
1824
1825 my $status = $self->shuttle_url(
1826 'http://' . $uri->host_port . '/master?action=' . $action ,
1827 'application/x-www-form-urlencoded',
1828 join('&', @args),
1829 \$resbody,
1830 1,
1831 ) or confess "shuttle_url failed";
1832
1833 if ($status == $rest->{status}) {
1834 if ($rest->{returns} && wantarray) {
1835
1836 my @results;
1837 my $fields = $#{$rest->{returns}};
1838
1839 foreach my $line ( split(/[\r\n]/,$resbody) ) {
1840 my @e = split(/\t/, $line, $fields + 1);
1841 my $row;
1842 foreach my $i ( 0 .. $fields) {
1843 $row->{ $rest->{returns}->[$i] } = $e[ $i ];
1844 }
1845 push @results, $row;
1846 }
1847
1848 return @results;
1849
1850 } elsif ($resbody) {
1851 chomp $resbody;
1852 return $resbody;
1853 } else {
1854 return 0E0;
1855 }
1856 }
1857
1858 carp "expected status $rest->{status}, but got $status";
1859 return undef;
1860 }
1861
1862 =head1 PRIVATE METHODS
1863
1864 You could call those directly, but you don't have to. I hope.
1865
1866 =head2 _set_info
1867
1868 Set information for node
1869
1870 $node->_set_info;
1871
1872 =cut
1873
1874 sub _set_info {
1875 my $self = shift;
1876
1877 $self->{status} = -1;
1878 return unless ($self->{url});
1879
1880 my $resbody;
1881 my $rv = $self->shuttle_url( $self->{url} . '/inform',
1882 'text/plain',
1883 undef,
1884 \$resbody,
1885 );
1886
1887 return if ($rv != 200 || !$resbody);
1888
1889 my @lines = split(/[\r\n]/,$resbody);
1890
1891 $self->{inform} = {};
1892
1893 ( $self->{inform}->{name}, $self->{inform}->{label}, $self->{inform}->{dnum},
1894 $self->{inform}->{wnum}, $self->{inform}->{size} ) = split(/\t/, shift @lines, 5);
1895
1896 return $resbody unless (@lines);
1897
1898 shift @lines;
1899
1900 while(my $admin = shift @lines) {
1901 push @{$self->{inform}->{admins}}, $admin;
1902 }
1903
1904 while(my $guest = shift @lines) {
1905 push @{$self->{inform}->{guests}}, $guest;
1906 }
1907
1908 while(my $link = shift @lines) {
1909 push @{$self->{inform}->{links}}, $link;
1910 }
1911
1912 return $resbody;
1913
1914 }
1915
1916 ###
1917
1918 =head1 EXPORT
1919
1920 Nothing.
1921
1922 =head1 SEE ALSO
1923
1924 L<http://hyperestraier.sourceforge.net/>
1925
1926 Hyper Estraier Ruby interface on which this module is based.
1927
1928 =head1 AUTHOR
1929
1930 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
1931
1932 Robert Klep E<lt>robert@klep.nameE<gt> contributed refactored search code
1933
1934 =head1 COPYRIGHT AND LICENSE
1935
1936 Copyright (C) 2005-2006 by Dobrica Pavlinusic
1937
1938 This library is free software; you can redistribute it and/or modify
1939 it under the GPL v2 or later.
1940
1941 =cut
1942
1943 1;

  ViewVC Help
Powered by ViewVC 1.1.26