/[Search-Estraier]/trunk/Estraier.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/Estraier.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 128 - (show annotations)
Mon May 8 12:00:43 2006 UTC (17 years, 10 months ago) by dpavlin
File size: 31382 byte(s)
removed old implementation of search in favor of refactored code contributed by Robert Klep
1 package Search::Estraier;
2
3 use 5.008;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.06_1';
8
9 =head1 NAME
10
11 Search::Estraier - pure perl module to use Hyper Estraier search engine
12
13 =head1 SYNOPSIS
14
15 =head2 Simple indexer
16
17 use Search::Estraier;
18
19 # create and configure node
20 my $node = new Search::Estraier::Node(
21 url => 'http://localhost:1978/node/test',
22 user => 'admin',
23 passwd => 'admin'
24 );
25
26 # create document
27 my $doc = new Search::Estraier::Document;
28
29 # add attributes
30 $doc->add_attr('@uri', "http://estraier.gov/example.txt");
31 $doc->add_attr('@title', "Over the Rainbow");
32
33 # add body text to document
34 $doc->add_text("Somewhere over the rainbow. Way up high.");
35 $doc->add_text("There's a land that I heard of once in a lullaby.");
36
37 die "error: ", $node->status,"\n" unless (eval { $node->put_doc($doc) });
38
39 =head2 Simple searcher
40
41 use Search::Estraier;
42
43 # create and configure node
44 my $node = new Search::Estraier::Node(
45 url => 'http://localhost:1978/node/test',
46 user => 'admin',
47 passwd => 'admin',
48 croak_on_error => 1,
49 );
50
51 # create condition
52 my $cond = new Search::Estraier::Condition;
53
54 # set search phrase
55 $cond->set_phrase("rainbow AND lullaby");
56
57 my $nres = $node->search($cond, 0);
58
59 if (defined($nres)) {
60 print "Got ", $nres->hits, " results\n";
61
62 # for each document in results
63 for my $i ( 0 ... $nres->doc_num - 1 ) {
64 # get result document
65 my $rdoc = $nres->get_doc($i);
66 # display attribte
67 print "URI: ", $rdoc->attr('@uri'),"\n";
68 print "Title: ", $rdoc->attr('@title'),"\n";
69 print $rdoc->snippet,"\n";
70 }
71 } else {
72 die "error: ", $node->status,"\n";
73 }
74
75 =head1 DESCRIPTION
76
77 This module is implementation of node API of Hyper Estraier. Since it's
78 perl-only module with dependencies only on standard perl modules, it will
79 run on all platforms on which perl runs. It doesn't require compilation
80 or Hyper Estraier development files on target machine.
81
82 It is implemented as multiple packages which closly resamble Ruby
83 implementation. It also includes methods to manage nodes.
84
85 There are few examples in C<scripts> directory of this distribution.
86
87 =cut
88
89 =head1 Inheritable common methods
90
91 This methods should really move somewhere else.
92
93 =head2 _s
94
95 Remove multiple whitespaces from string, as well as whitespaces at beginning or end
96
97 my $text = $self->_s(" this is a text ");
98 $text = 'this is a text';
99
100 =cut
101
102 sub _s {
103 my $text = $_[1];
104 return unless defined($text);
105 $text =~ s/\s\s+/ /gs;
106 $text =~ s/^\s+//;
107 $text =~ s/\s+$//;
108 return $text;
109 }
110
111 package Search::Estraier::Document;
112
113 use Carp qw/croak confess/;
114
115 use Search::Estraier;
116 our @ISA = qw/Search::Estraier/;
117
118 =head1 Search::Estraier::Document
119
120 This class implements Document which is collection of attributes
121 (key=value), vectors (also key value) display text and hidden text.
122
123
124 =head2 new
125
126 Create new document, empty or from draft.
127
128 my $doc = new Search::HyperEstraier::Document;
129 my $doc2 = new Search::HyperEstraier::Document( $draft );
130
131 =cut
132
133 sub new {
134 my $class = shift;
135 my $self = {};
136 bless($self, $class);
137
138 $self->{id} = -1;
139
140 my $draft = shift;
141
142 if ($draft) {
143 my $in_text = 0;
144 foreach my $line (split(/\n/, $draft)) {
145
146 if ($in_text) {
147 if ($line =~ /^\t/) {
148 push @{ $self->{htexts} }, substr($line, 1);
149 } else {
150 push @{ $self->{dtexts} }, $line;
151 }
152 next;
153 }
154
155 if ($line =~ m/^%VECTOR\t(.+)$/) {
156 my @fields = split(/\t/, $1);
157 for my $i ( 0 .. ($#fields - 1) ) {
158 $self->{kwords}->{ $fields[ $i ] } = $fields[ $i + 1 ];
159 $i++;
160 }
161 next;
162 } elsif ($line =~ m/^%/) {
163 # What is this? comment?
164 #warn "$line\n";
165 next;
166 } elsif ($line =~ m/^$/) {
167 $in_text = 1;
168 next;
169 } elsif ($line =~ m/^(.+)=(.*)$/) {
170 $self->{attrs}->{ $1 } = $2;
171 next;
172 }
173
174 warn "draft ignored: '$line'\n";
175 }
176 }
177
178 $self ? return $self : return undef;
179 }
180
181
182 =head2 add_attr
183
184 Add an attribute.
185
186 $doc->add_attr( name => 'value' );
187
188 Delete attribute using
189
190 $doc->add_attr( name => undef );
191
192 =cut
193
194 sub add_attr {
195 my $self = shift;
196 my $attrs = {@_};
197
198 while (my ($name, $value) = each %{ $attrs }) {
199 if (! defined($value)) {
200 delete( $self->{attrs}->{ $self->_s($name) } );
201 } else {
202 $self->{attrs}->{ $self->_s($name) } = $self->_s($value);
203 }
204 }
205
206 return 1;
207 }
208
209
210 =head2 add_text
211
212 Add a sentence of text.
213
214 $doc->add_text('this is example text to display');
215
216 =cut
217
218 sub add_text {
219 my $self = shift;
220 my $text = shift;
221 return unless defined($text);
222
223 push @{ $self->{dtexts} }, $self->_s($text);
224 }
225
226
227 =head2 add_hidden_text
228
229 Add a hidden sentence.
230
231 $doc->add_hidden_text('this is example text just for search');
232
233 =cut
234
235 sub add_hidden_text {
236 my $self = shift;
237 my $text = shift;
238 return unless defined($text);
239
240 push @{ $self->{htexts} }, $self->_s($text);
241 }
242
243
244 =head2 id
245
246 Get the ID number of document. If the object has never been registred, C<-1> is returned.
247
248 print $doc->id;
249
250 =cut
251
252 sub id {
253 my $self = shift;
254 return $self->{id};
255 }
256
257
258 =head2 attr_names
259
260 Returns array with attribute names from document object.
261
262 my @attrs = $doc->attr_names;
263
264 =cut
265
266 sub attr_names {
267 my $self = shift;
268 return unless ($self->{attrs});
269 #croak "attr_names return array, not scalar" if (! wantarray);
270 return sort keys %{ $self->{attrs} };
271 }
272
273
274 =head2 attr
275
276 Returns value of an attribute.
277
278 my $value = $doc->attr( 'attribute' );
279
280 =cut
281
282 sub attr {
283 my $self = shift;
284 my $name = shift;
285 return unless (defined($name) && $self->{attrs});
286 return $self->{attrs}->{ $name };
287 }
288
289
290 =head2 texts
291
292 Returns array with text sentences.
293
294 my @texts = $doc->texts;
295
296 =cut
297
298 sub texts {
299 my $self = shift;
300 #confess "texts return array, not scalar" if (! wantarray);
301 return @{ $self->{dtexts} } if ($self->{dtexts});
302 }
303
304
305 =head2 cat_texts
306
307 Return whole text as single scalar.
308
309 my $text = $doc->cat_texts;
310
311 =cut
312
313 sub cat_texts {
314 my $self = shift;
315 return join(' ',@{ $self->{dtexts} }) if ($self->{dtexts});
316 }
317
318
319 =head2 dump_draft
320
321 Dump draft data from document object.
322
323 print $doc->dump_draft;
324
325 =cut
326
327 sub dump_draft {
328 my $self = shift;
329 my $draft;
330
331 foreach my $attr_name (sort keys %{ $self->{attrs} }) {
332 next unless defined(my $v = $self->{attrs}->{$attr_name});
333 $draft .= $attr_name . '=' . $v . "\n";
334 }
335
336 if ($self->{kwords}) {
337 $draft .= '%%VECTOR';
338 while (my ($key, $value) = each %{ $self->{kwords} }) {
339 $draft .= "\t$key\t$value";
340 }
341 $draft .= "\n";
342 }
343
344 $draft .= "\n";
345
346 $draft .= join("\n", @{ $self->{dtexts} }) . "\n" if ($self->{dtexts});
347 $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n" if ($self->{htexts});
348
349 return $draft;
350 }
351
352
353 =head2 delete
354
355 Empty document object
356
357 $doc->delete;
358
359 This function is addition to original Ruby API, and since it was included in C wrappers it's here as a
360 convinience. Document objects which go out of scope will be destroyed
361 automatically.
362
363 =cut
364
365 sub delete {
366 my $self = shift;
367
368 foreach my $data (qw/attrs dtexts stexts kwords/) {
369 delete($self->{$data});
370 }
371
372 $self->{id} = -1;
373
374 return 1;
375 }
376
377
378
379 package Search::Estraier::Condition;
380
381 use Carp qw/carp confess croak/;
382
383 use Search::Estraier;
384 our @ISA = qw/Search::Estraier/;
385
386 =head1 Search::Estraier::Condition
387
388 =head2 new
389
390 my $cond = new Search::HyperEstraier::Condition;
391
392 =cut
393
394 sub new {
395 my $class = shift;
396 my $self = {};
397 bless($self, $class);
398
399 $self->{max} = -1;
400 $self->{options} = 0;
401
402 $self ? return $self : return undef;
403 }
404
405
406 =head2 set_phrase
407
408 $cond->set_phrase('search phrase');
409
410 =cut
411
412 sub set_phrase {
413 my $self = shift;
414 $self->{phrase} = $self->_s( shift );
415 }
416
417
418 =head2 add_attr
419
420 $cond->add_attr('@URI STRINC /~dpavlin/');
421
422 =cut
423
424 sub add_attr {
425 my $self = shift;
426 my $attr = shift || return;
427 push @{ $self->{attrs} }, $self->_s( $attr );
428 }
429
430
431 =head2 set_order
432
433 $cond->set_order('@mdate NUMD');
434
435 =cut
436
437 sub set_order {
438 my $self = shift;
439 $self->{order} = shift;
440 }
441
442
443 =head2 set_max
444
445 $cond->set_max(42);
446
447 =cut
448
449 sub set_max {
450 my $self = shift;
451 my $max = shift;
452 croak "set_max needs number, not '$max'" unless ($max =~ m/^\d+$/);
453 $self->{max} = $max;
454 }
455
456
457 =head2 set_options
458
459 $cond->set_options( 'SURE' );
460
461 $cond->set_options( qw/AGITO NOIDF SIMPLE/ );
462
463 Possible options are:
464
465 =over 8
466
467 =item SURE
468
469 check every N-gram
470
471 =item USUAL
472
473 check every second N-gram
474
475 =item FAST
476
477 check every third N-gram
478
479 =item AGITO
480
481 check every fourth N-gram
482
483 =item NOIDF
484
485 don't perform TF-IDF tuning
486
487 =item SIMPLE
488
489 use simplified query phrase
490
491 =back
492
493 Skipping N-grams will speed up search, but reduce accuracy. Every call to C<set_options> will reset previous
494 options;
495
496 This option changed in version C<0.04> of this module. It's backwards compatibile.
497
498 =cut
499
500 my $options = {
501 SURE => 1 << 0,
502 USUAL => 1 << 1,
503 FAST => 1 << 2,
504 AGITO => 1 << 3,
505 NOIDF => 1 << 4,
506 SIMPLE => 1 << 10,
507 };
508
509 sub set_options {
510 my $self = shift;
511 my $opt = 0;
512 foreach my $option (@_) {
513 my $mask;
514 unless ($mask = $options->{$option}) {
515 if ($option eq '1') {
516 next;
517 } else {
518 croak "unknown option $option";
519 }
520 }
521 $opt += $mask;
522 }
523 $self->{options} = $opt;
524 }
525
526
527 =head2 phrase
528
529 Return search phrase.
530
531 print $cond->phrase;
532
533 =cut
534
535 sub phrase {
536 my $self = shift;
537 return $self->{phrase};
538 }
539
540
541 =head2 order
542
543 Return search result order.
544
545 print $cond->order;
546
547 =cut
548
549 sub order {
550 my $self = shift;
551 return $self->{order};
552 }
553
554
555 =head2 attrs
556
557 Return search result attrs.
558
559 my @cond_attrs = $cond->attrs;
560
561 =cut
562
563 sub attrs {
564 my $self = shift;
565 #croak "attrs return array, not scalar" if (! wantarray);
566 return @{ $self->{attrs} } if ($self->{attrs});
567 }
568
569
570 =head2 max
571
572 Return maximum number of results.
573
574 print $cond->max;
575
576 C<-1> is returned for unitialized value, C<0> is unlimited.
577
578 =cut
579
580 sub max {
581 my $self = shift;
582 return $self->{max};
583 }
584
585
586 =head2 options
587
588 Return options for this condition.
589
590 print $cond->options;
591
592 Options are returned in numerical form.
593
594 =cut
595
596 sub options {
597 my $self = shift;
598 return $self->{options};
599 }
600
601
602 =head2 set_skip
603
604 Set number of skipped documents from beginning of results
605
606 $cond->set_skip(42);
607
608 Similar to C<offset> in RDBMS.
609
610 =cut
611
612 sub set_skip {
613 my $self = shift;
614 $self->{skip} = shift;
615 }
616
617 =head2 skip
618
619 Return skip for this condition.
620
621 print $cond->skip;
622
623 =cut
624
625 sub skip {
626 my $self = shift;
627 return $self->{skip};
628 }
629
630
631 package Search::Estraier::ResultDocument;
632
633 use Carp qw/croak/;
634
635 #use Search::Estraier;
636 #our @ISA = qw/Search::Estraier/;
637
638 =head1 Search::Estraier::ResultDocument
639
640 =head2 new
641
642 my $rdoc = new Search::HyperEstraier::ResultDocument(
643 uri => 'http://localhost/document/uri/42',
644 attrs => {
645 foo => 1,
646 bar => 2,
647 },
648 snippet => 'this is a text of snippet'
649 keywords => 'this\tare\tkeywords'
650 );
651
652 =cut
653
654 sub new {
655 my $class = shift;
656 my $self = {@_};
657 bless($self, $class);
658
659 croak "missing uri for ResultDocument" unless defined($self->{uri});
660
661 $self ? return $self : return undef;
662 }
663
664
665 =head2 uri
666
667 Return URI of result document
668
669 print $rdoc->uri;
670
671 =cut
672
673 sub uri {
674 my $self = shift;
675 return $self->{uri};
676 }
677
678
679 =head2 attr_names
680
681 Returns array with attribute names from result document object.
682
683 my @attrs = $rdoc->attr_names;
684
685 =cut
686
687 sub attr_names {
688 my $self = shift;
689 croak "attr_names return array, not scalar" if (! wantarray);
690 return sort keys %{ $self->{attrs} };
691 }
692
693
694 =head2 attr
695
696 Returns value of an attribute.
697
698 my $value = $rdoc->attr( 'attribute' );
699
700 =cut
701
702 sub attr {
703 my $self = shift;
704 my $name = shift || return;
705 return $self->{attrs}->{ $name };
706 }
707
708
709 =head2 snippet
710
711 Return snippet from result document
712
713 print $rdoc->snippet;
714
715 =cut
716
717 sub snippet {
718 my $self = shift;
719 return $self->{snippet};
720 }
721
722
723 =head2 keywords
724
725 Return keywords from result document
726
727 print $rdoc->keywords;
728
729 =cut
730
731 sub keywords {
732 my $self = shift;
733 return $self->{keywords};
734 }
735
736
737 package Search::Estraier::NodeResult;
738
739 use Carp qw/croak/;
740
741 #use Search::Estraier;
742 #our @ISA = qw/Search::Estraier/;
743
744 =head1 Search::Estraier::NodeResult
745
746 =head2 new
747
748 my $res = new Search::HyperEstraier::NodeResult(
749 docs => @array_of_rdocs,
750 hits => %hash_with_hints,
751 );
752
753 =cut
754
755 sub new {
756 my $class = shift;
757 my $self = {@_};
758 bless($self, $class);
759
760 foreach my $f (qw/docs hints/) {
761 croak "missing $f for ResultDocument" unless defined($self->{$f});
762 }
763
764 $self ? return $self : return undef;
765 }
766
767
768 =head2 doc_num
769
770 Return number of documents
771
772 print $res->doc_num;
773
774 This will return real number of documents (limited by C<max>).
775 If you want to get total number of hits, see C<hits>.
776
777 =cut
778
779 sub doc_num {
780 my $self = shift;
781 return $#{$self->{docs}} + 1;
782 }
783
784
785 =head2 get_doc
786
787 Return single document
788
789 my $doc = $res->get_doc( 42 );
790
791 Returns undef if document doesn't exist.
792
793 =cut
794
795 sub get_doc {
796 my $self = shift;
797 my $num = shift;
798 croak "expect number as argument, not '$num'" unless ($num =~ m/^\d+$/);
799 return undef if ($num < 0 || $num > $self->{docs});
800 return $self->{docs}->[$num];
801 }
802
803
804 =head2 hint
805
806 Return specific hint from results.
807
808 print $res->hint( 'VERSION' );
809
810 Possible hints are: C<VERSION>, C<NODE>, C<HIT>, C<HINT#n>, C<DOCNUM>, C<WORDNUM>,
811 C<TIME>, C<LINK#n>, C<VIEW>.
812
813 =cut
814
815 sub hint {
816 my $self = shift;
817 my $key = shift || return;
818 return $self->{hints}->{$key};
819 }
820
821 =head2 hints
822
823 More perlish version of C<hint>. This one returns hash.
824
825 my %hints = $res->hints;
826
827 =cut
828
829 sub hints {
830 my $self = shift;
831 return $self->{hints};
832 }
833
834 =head2 hits
835
836 Syntaxtic sugar for total number of hits for this query
837
838 print $res->hits;
839
840 It's same as
841
842 print $res->hint('HIT');
843
844 but shorter.
845
846 =cut
847
848 sub hits {
849 my $self = shift;
850 return $self->{hints}->{'HIT'} || 0;
851 }
852
853 package Search::Estraier::Node;
854
855 use Carp qw/carp croak confess/;
856 use URI;
857 use MIME::Base64;
858 use IO::Socket::INET;
859 use URI::Escape qw/uri_escape/;
860
861 =head1 Search::Estraier::Node
862
863 =head2 new
864
865 my $node = new Search::HyperEstraier::Node;
866
867 or optionally with C<url> as parametar
868
869 my $node = new Search::HyperEstraier::Node( 'http://localhost:1978/node/test' );
870
871 or in more verbose form
872
873 my $node = new Search::HyperEstraier::Node(
874 url => 'http://localhost:1978/node/test',
875 debug => 1,
876 croak_on_error => 1
877 );
878
879 with following arguments:
880
881 =over 4
882
883 =item url
884
885 URL to node
886
887 =item debug
888
889 dumps a B<lot> of debugging output
890
891 =item croak_on_error
892
893 very helpful during development. It will croak on all errors instead of
894 silently returning C<-1> (which is convention of Hyper Estraier API in other
895 languages).
896
897 =back
898
899 =cut
900
901 sub new {
902 my $class = shift;
903 my $self = {
904 pxport => -1,
905 timeout => 0, # this used to be -1
906 wwidth => 480,
907 hwidth => 96,
908 awidth => 96,
909 status => -1,
910 };
911
912 bless($self, $class);
913
914 if ($#_ == 0) {
915 $self->{url} = shift;
916 } else {
917 my $args = {@_};
918
919 %$self = ( %$self, @_ );
920
921 warn "## Node debug on\n" if ($self->{debug});
922 }
923
924 $self->{inform} = {
925 dnum => -1,
926 wnum => -1,
927 size => -1.0,
928 };
929
930 $self ? return $self : return undef;
931 }
932
933
934 =head2 set_url
935
936 Specify URL to node server
937
938 $node->set_url('http://localhost:1978');
939
940 =cut
941
942 sub set_url {
943 my $self = shift;
944 $self->{url} = shift;
945 }
946
947
948 =head2 set_proxy
949
950 Specify proxy server to connect to node server
951
952 $node->set_proxy('proxy.example.com', 8080);
953
954 =cut
955
956 sub set_proxy {
957 my $self = shift;
958 my ($host,$port) = @_;
959 croak "proxy port must be number, not '$port'" unless ($port =~ m/^\d+$/);
960 $self->{pxhost} = $host;
961 $self->{pxport} = $port;
962 }
963
964
965 =head2 set_timeout
966
967 Specify timeout of connection in seconds
968
969 $node->set_timeout( 15 );
970
971 =cut
972
973 sub set_timeout {
974 my $self = shift;
975 my $sec = shift;
976 croak "timeout must be number, not '$sec'" unless ($sec =~ m/^\d+$/);
977 $self->{timeout} = $sec;
978 }
979
980
981 =head2 set_auth
982
983 Specify name and password for authentication to node server.
984
985 $node->set_auth('clint','eastwood');
986
987 =cut
988
989 sub set_auth {
990 my $self = shift;
991 my ($login,$passwd) = @_;
992 my $basic_auth = encode_base64( "$login:$passwd" );
993 chomp($basic_auth);
994 $self->{auth} = $basic_auth;
995 }
996
997
998 =head2 status
999
1000 Return status code of last request.
1001
1002 print $node->status;
1003
1004 C<-1> means connection failure.
1005
1006 =cut
1007
1008 sub status {
1009 my $self = shift;
1010 return $self->{status};
1011 }
1012
1013
1014 =head2 put_doc
1015
1016 Add a document
1017
1018 $node->put_doc( $document_draft ) or die "can't add document";
1019
1020 Return true on success or false on failture.
1021
1022 =cut
1023
1024 sub put_doc {
1025 my $self = shift;
1026 my $doc = shift || return;
1027 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
1028 $self->shuttle_url( $self->{url} . '/put_doc',
1029 'text/x-estraier-draft',
1030 $doc->dump_draft,
1031 undef
1032 ) == 200;
1033 }
1034
1035
1036 =head2 out_doc
1037
1038 Remove a document
1039
1040 $node->out_doc( document_id ) or "can't remove document";
1041
1042 Return true on success or false on failture.
1043
1044 =cut
1045
1046 sub out_doc {
1047 my $self = shift;
1048 my $id = shift || return;
1049 return unless ($self->{url});
1050 croak "id must be number, not '$id'" unless ($id =~ m/^\d+$/);
1051 $self->shuttle_url( $self->{url} . '/out_doc',
1052 'application/x-www-form-urlencoded',
1053 "id=$id",
1054 undef
1055 ) == 200;
1056 }
1057
1058
1059 =head2 out_doc_by_uri
1060
1061 Remove a registrated document using it's uri
1062
1063 $node->out_doc_by_uri( 'file:///document/uri/42' ) or "can't remove document";
1064
1065 Return true on success or false on failture.
1066
1067 =cut
1068
1069 sub out_doc_by_uri {
1070 my $self = shift;
1071 my $uri = shift || return;
1072 return unless ($self->{url});
1073 $self->shuttle_url( $self->{url} . '/out_doc',
1074 'application/x-www-form-urlencoded',
1075 "uri=" . uri_escape($uri),
1076 undef
1077 ) == 200;
1078 }
1079
1080
1081 =head2 edit_doc
1082
1083 Edit attributes of a document
1084
1085 $node->edit_doc( $document_draft ) or die "can't edit document";
1086
1087 Return true on success or false on failture.
1088
1089 =cut
1090
1091 sub edit_doc {
1092 my $self = shift;
1093 my $doc = shift || return;
1094 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
1095 $self->shuttle_url( $self->{url} . '/edit_doc',
1096 'text/x-estraier-draft',
1097 $doc->dump_draft,
1098 undef
1099 ) == 200;
1100 }
1101
1102
1103 =head2 get_doc
1104
1105 Retreive document
1106
1107 my $doc = $node->get_doc( document_id ) or die "can't get document";
1108
1109 Return true on success or false on failture.
1110
1111 =cut
1112
1113 sub get_doc {
1114 my $self = shift;
1115 my $id = shift || return;
1116 return $self->_fetch_doc( id => $id );
1117 }
1118
1119
1120 =head2 get_doc_by_uri
1121
1122 Retreive document
1123
1124 my $doc = $node->get_doc_by_uri( 'file:///document/uri/42' ) or die "can't get document";
1125
1126 Return true on success or false on failture.
1127
1128 =cut
1129
1130 sub get_doc_by_uri {
1131 my $self = shift;
1132 my $uri = shift || return;
1133 return $self->_fetch_doc( uri => $uri );
1134 }
1135
1136
1137 =head2 get_doc_attr
1138
1139 Retrieve the value of an atribute from object
1140
1141 my $val = $node->get_doc_attr( document_id, 'attribute_name' ) or
1142 die "can't get document attribute";
1143
1144 =cut
1145
1146 sub get_doc_attr {
1147 my $self = shift;
1148 my ($id,$name) = @_;
1149 return unless ($id && $name);
1150 return $self->_fetch_doc( id => $id, attr => $name );
1151 }
1152
1153
1154 =head2 get_doc_attr_by_uri
1155
1156 Retrieve the value of an atribute from object
1157
1158 my $val = $node->get_doc_attr_by_uri( document_id, 'attribute_name' ) or
1159 die "can't get document attribute";
1160
1161 =cut
1162
1163 sub get_doc_attr_by_uri {
1164 my $self = shift;
1165 my ($uri,$name) = @_;
1166 return unless ($uri && $name);
1167 return $self->_fetch_doc( uri => $uri, attr => $name );
1168 }
1169
1170
1171 =head2 etch_doc
1172
1173 Exctract document keywords
1174
1175 my $keywords = $node->etch_doc( document_id ) or die "can't etch document";
1176
1177 =cut
1178
1179 sub etch_doc {
1180 my $self = shift;
1181 my $id = shift || return;
1182 return $self->_fetch_doc( id => $id, etch => 1 );
1183 }
1184
1185 =head2 etch_doc_by_uri
1186
1187 Retreive document
1188
1189 my $keywords = $node->etch_doc_by_uri( 'file:///document/uri/42' ) or die "can't etch document";
1190
1191 Return true on success or false on failture.
1192
1193 =cut
1194
1195 sub etch_doc_by_uri {
1196 my $self = shift;
1197 my $uri = shift || return;
1198 return $self->_fetch_doc( uri => $uri, etch => 1 );
1199 }
1200
1201
1202 =head2 uri_to_id
1203
1204 Get ID of document specified by URI
1205
1206 my $id = $node->uri_to_id( 'file:///document/uri/42' );
1207
1208 This method won't croak, even if using C<croak_on_error>.
1209
1210 =cut
1211
1212 sub uri_to_id {
1213 my $self = shift;
1214 my $uri = shift || return;
1215 return $self->_fetch_doc( uri => $uri, path => '/uri_to_id', chomp_resbody => 1, croak_on_error => 0 );
1216 }
1217
1218
1219 =head2 _fetch_doc
1220
1221 Private function used for implementing of C<get_doc>, C<get_doc_by_uri>,
1222 C<etch_doc>, C<etch_doc_by_uri>.
1223
1224 # this will decode received draft into Search::Estraier::Document object
1225 my $doc = $node->_fetch_doc( id => 42 );
1226 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42' );
1227
1228 # to extract keywords, add etch
1229 my $doc = $node->_fetch_doc( id => 42, etch => 1 );
1230 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', etch => 1 );
1231
1232 # to get document attrubute add attr
1233 my $doc = $node->_fetch_doc( id => 42, attr => '@mdate' );
1234 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', attr => '@mdate' );
1235
1236 # more general form which allows implementation of
1237 # uri_to_id
1238 my $id = $node->_fetch_doc(
1239 uri => 'file:///document/uri/42',
1240 path => '/uri_to_id',
1241 chomp_resbody => 1
1242 );
1243
1244 =cut
1245
1246 sub _fetch_doc {
1247 my $self = shift;
1248 my $a = {@_};
1249 return unless ( ($a->{id} || $a->{uri}) && $self->{url} );
1250
1251 my ($arg, $resbody);
1252
1253 my $path = $a->{path} || '/get_doc';
1254 $path = '/etch_doc' if ($a->{etch});
1255
1256 if ($a->{id}) {
1257 croak "id must be numberm not '$a->{id}'" unless ($a->{id} =~ m/^\d+$/);
1258 $arg = 'id=' . $a->{id};
1259 } elsif ($a->{uri}) {
1260 $arg = 'uri=' . uri_escape($a->{uri});
1261 } else {
1262 confess "unhandled argument. Need id or uri.";
1263 }
1264
1265 if ($a->{attr}) {
1266 $path = '/get_doc_attr';
1267 $arg .= '&attr=' . uri_escape($a->{attr});
1268 $a->{chomp_resbody} = 1;
1269 }
1270
1271 my $rv = $self->shuttle_url( $self->{url} . $path,
1272 'application/x-www-form-urlencoded',
1273 $arg,
1274 \$resbody,
1275 $a->{croak_on_error},
1276 );
1277
1278 return if ($rv != 200);
1279
1280 if ($a->{etch}) {
1281 $self->{kwords} = {};
1282 return +{} unless ($resbody);
1283 foreach my $l (split(/\n/, $resbody)) {
1284 my ($k,$v) = split(/\t/, $l, 2);
1285 $self->{kwords}->{$k} = $v if ($v);
1286 }
1287 return $self->{kwords};
1288 } elsif ($a->{chomp_resbody}) {
1289 return unless (defined($resbody));
1290 chomp($resbody);
1291 return $resbody;
1292 } else {
1293 return new Search::Estraier::Document($resbody);
1294 }
1295 }
1296
1297
1298 =head2 name
1299
1300 my $node_name = $node->name;
1301
1302 =cut
1303
1304 sub name {
1305 my $self = shift;
1306 $self->_set_info unless ($self->{inform}->{name});
1307 return $self->{inform}->{name};
1308 }
1309
1310
1311 =head2 label
1312
1313 my $node_label = $node->label;
1314
1315 =cut
1316
1317 sub label {
1318 my $self = shift;
1319 $self->_set_info unless ($self->{inform}->{label});
1320 return $self->{inform}->{label};
1321 }
1322
1323
1324 =head2 doc_num
1325
1326 my $documents_in_node = $node->doc_num;
1327
1328 =cut
1329
1330 sub doc_num {
1331 my $self = shift;
1332 $self->_set_info if ($self->{inform}->{dnum} < 0);
1333 return $self->{inform}->{dnum};
1334 }
1335
1336
1337 =head2 word_num
1338
1339 my $words_in_node = $node->word_num;
1340
1341 =cut
1342
1343 sub word_num {
1344 my $self = shift;
1345 $self->_set_info if ($self->{inform}->{wnum} < 0);
1346 return $self->{inform}->{wnum};
1347 }
1348
1349
1350 =head2 size
1351
1352 my $node_size = $node->size;
1353
1354 =cut
1355
1356 sub size {
1357 my $self = shift;
1358 $self->_set_info if ($self->{inform}->{size} < 0);
1359 return $self->{inform}->{size};
1360 }
1361
1362
1363 =head2 search
1364
1365 Search documents which match condition
1366
1367 my $nres = $node->search( $cond, $depth );
1368
1369 C<$cond> is C<Search::Estraier::Condition> object, while <$depth> specifies
1370 depth for meta search.
1371
1372 Function results C<Search::Estraier::NodeResult> object.
1373
1374 =cut
1375
1376 sub search {
1377 my $self = shift;
1378 my ($cond, $depth) = @_;
1379 return unless ($cond && defined($depth) && $self->{url});
1380 croak "cond mush be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1381 croak "depth needs number, not '$depth'" unless ($depth =~ m/^\d+$/);
1382
1383 my $resbody;
1384
1385 my $rv = $self->shuttle_url( $self->{url} . '/search',
1386 'application/x-www-form-urlencoded',
1387 $self->cond_to_query( $cond, $depth ),
1388 \$resbody,
1389 );
1390 return if ($rv != 200);
1391
1392 my @records = split /--------\[.*?\]--------(?::END)?\r?\n/, $resbody;
1393 my $hintsText = splice @records, 0, 2; # starts with empty record
1394 my $hints = { $hintsText =~ m/^(.*?)\t(.*?)$/gsm };
1395
1396 # process records
1397 my $docs = [];
1398 foreach my $record (@records)
1399 {
1400 # split into keys and snippets
1401 my ($keys, $snippet) = $record =~ m/^(.*?)\n\n(.*?)$/s;
1402
1403 # create document hash
1404 my $doc = { $keys =~ m/^(.*?)=(.*?)$/gsm };
1405 $doc->{'@keywords'} = $doc->{keywords};
1406 ($doc->{keywords}) = $keys =~ m/^%VECTOR\t(.*?)$/gm;
1407 $doc->{snippet} = $snippet;
1408
1409 push @$docs, new Search::Estraier::ResultDocument(
1410 attrs => $doc,
1411 uri => $doc->{'@uri'},
1412 snippet => $snippet,
1413 keywords => $doc->{'keywords'},
1414 );
1415 }
1416
1417 return new Search::Estraier::NodeResult( docs => $docs, hints => $hints );
1418 }
1419
1420
1421 =head2 cond_to_query
1422
1423 Return URI encoded string generated from Search::Estraier::Condition
1424
1425 my $args = $node->cond_to_query( $cond, $depth );
1426
1427 =cut
1428
1429 sub cond_to_query {
1430 my $self = shift;
1431
1432 my $cond = shift || return;
1433 croak "condition must be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1434 my $depth = shift;
1435
1436 my @args;
1437
1438 if (my $phrase = $cond->phrase) {
1439 push @args, 'phrase=' . uri_escape($phrase);
1440 }
1441
1442 if (my @attrs = $cond->attrs) {
1443 for my $i ( 0 .. $#attrs ) {
1444 push @args,'attr' . ($i+1) . '=' . uri_escape( $attrs[$i] ) if ($attrs[$i]);
1445 }
1446 }
1447
1448 if (my $order = $cond->order) {
1449 push @args, 'order=' . uri_escape($order);
1450 }
1451
1452 if (my $max = $cond->max) {
1453 push @args, 'max=' . $max;
1454 } else {
1455 push @args, 'max=' . (1 << 30);
1456 }
1457
1458 if (my $options = $cond->options) {
1459 push @args, 'options=' . $options;
1460 }
1461
1462 push @args, 'depth=' . $depth if ($depth);
1463 push @args, 'wwidth=' . $self->{wwidth};
1464 push @args, 'hwidth=' . $self->{hwidth};
1465 push @args, 'awidth=' . $self->{awidth};
1466 push @args, 'skip=' . $self->{skip} if ($self->{skip});
1467
1468 return join('&', @args);
1469 }
1470
1471
1472 =head2 shuttle_url
1473
1474 This is method which uses C<LWP::UserAgent> to communicate with Hyper Estraier node
1475 master.
1476
1477 my $rv = shuttle_url( $url, $content_type, $req_body, \$resbody );
1478
1479 C<$resheads> and C<$resbody> booleans controll if response headers and/or response
1480 body will be saved within object.
1481
1482 =cut
1483
1484 use LWP::UserAgent;
1485
1486 sub shuttle_url {
1487 my $self = shift;
1488
1489 my ($url, $content_type, $reqbody, $resbody, $croak_on_error) = @_;
1490
1491 $croak_on_error = $self->{croak_on_error} unless defined($croak_on_error);
1492
1493 $self->{status} = -1;
1494
1495 warn "## $url\n" if ($self->{debug});
1496
1497 $url = new URI($url);
1498 if (
1499 !$url || !$url->scheme || !$url->scheme eq 'http' ||
1500 !$url->host || !$url->port || $url->port < 1
1501 ) {
1502 carp "can't parse $url\n";
1503 return -1;
1504 }
1505
1506 my $ua = LWP::UserAgent->new;
1507 $ua->agent( "Search-Estraier/$Search::Estraier::VERSION" );
1508
1509 my $req;
1510 if ($reqbody) {
1511 $req = HTTP::Request->new(POST => $url);
1512 } else {
1513 $req = HTTP::Request->new(GET => $url);
1514 }
1515
1516 $req->headers->header( 'Host' => $url->host . ":" . $url->port );
1517 $req->headers->header( 'Connection', 'close' );
1518 $req->headers->header( 'Authorization', 'Basic ' . $self->{auth} ) if ($self->{auth});
1519 $req->content_type( $content_type );
1520
1521 warn $req->headers->as_string,"\n" if ($self->{debug});
1522
1523 if ($reqbody) {
1524 warn "$reqbody\n" if ($self->{debug});
1525 $req->content( $reqbody );
1526 }
1527
1528 my $res = $ua->request($req) || croak "can't make request to $url: $!";
1529
1530 warn "## response status: ",$res->status_line,"\n" if ($self->{debug});
1531
1532 ($self->{status}, $self->{status_message}) = split(/\s+/, $res->status_line, 2);
1533
1534 if (! $res->is_success) {
1535 if ($croak_on_error) {
1536 croak("can't get $url: ",$res->status_line);
1537 } else {
1538 return -1;
1539 }
1540 }
1541
1542 $$resbody .= $res->content;
1543
1544 warn "## response body:\n$$resbody\n" if ($resbody && $self->{debug});
1545
1546 return $self->{status};
1547 }
1548
1549
1550 =head2 set_snippet_width
1551
1552 Set width of snippets in results
1553
1554 $node->set_snippet_width( $wwidth, $hwidth, $awidth );
1555
1556 C<$wwidth> specifies whole width of snippet. It's C<480> by default. If it's C<0> snippet
1557 is not sent with results. If it is negative, whole document text is sent instead of snippet.
1558
1559 C<$hwidth> specified width of strings from beginning of string. Default
1560 value is C<96>. Negative or zero value keep previous value.
1561
1562 C<$awidth> specifies width of strings around each highlighted word. It's C<96> by default.
1563 If negative of zero value is provided previous value is kept unchanged.
1564
1565 =cut
1566
1567 sub set_snippet_width {
1568 my $self = shift;
1569
1570 my ($wwidth, $hwidth, $awidth) = @_;
1571 $self->{wwidth} = $wwidth;
1572 $self->{hwidth} = $hwidth if ($hwidth >= 0);
1573 $self->{awidth} = $awidth if ($awidth >= 0);
1574 }
1575
1576
1577 =head2 set_user
1578
1579 Manage users of node
1580
1581 $node->set_user( 'name', $mode );
1582
1583 C<$mode> can be one of:
1584
1585 =over 4
1586
1587 =item 0
1588
1589 delete account
1590
1591 =item 1
1592
1593 set administrative right for user
1594
1595 =item 2
1596
1597 set user account as guest
1598
1599 =back
1600
1601 Return true on success, otherwise false.
1602
1603 =cut
1604
1605 sub set_user {
1606 my $self = shift;
1607 my ($name, $mode) = @_;
1608
1609 return unless ($self->{url});
1610 croak "mode must be number, not '$mode'" unless ($mode =~ m/^\d+$/);
1611
1612 $self->shuttle_url( $self->{url} . '/_set_user',
1613 'text/plain',
1614 'name=' . uri_escape($name) . '&mode=' . $mode,
1615 undef
1616 ) == 200;
1617 }
1618
1619
1620 =head2 set_link
1621
1622 Manage node links
1623
1624 $node->set_link('http://localhost:1978/node/another', 'another node label', $credit);
1625
1626 If C<$credit> is negative, link is removed.
1627
1628 =cut
1629
1630 sub set_link {
1631 my $self = shift;
1632 my ($url, $label, $credit) = @_;
1633
1634 return unless ($self->{url});
1635 croak "mode credit be number, not '$credit'" unless ($credit =~ m/^\d+$/);
1636
1637 my $reqbody = 'url=' . uri_escape($url) . '&label=' . uri_escape($label);
1638 $reqbody .= '&credit=' . $credit if ($credit > 0);
1639
1640 if ($self->shuttle_url( $self->{url} . '/_set_link',
1641 'application/x-www-form-urlencoded',
1642 $reqbody,
1643 undef
1644 ) == 200) {
1645 # refresh node info after adding link
1646 $self->_set_info;
1647 return 1;
1648 }
1649 }
1650
1651 =head2 admins
1652
1653 my @admins = @{ $node->admins };
1654
1655 Return array of users with admin rights on node
1656
1657 =cut
1658
1659 sub admins {
1660 my $self = shift;
1661 $self->_set_info unless ($self->{inform}->{name});
1662 return $self->{inform}->{admins};
1663 }
1664
1665 =head2 guests
1666
1667 my @guests = @{ $node->guests };
1668
1669 Return array of users with guest rights on node
1670
1671 =cut
1672
1673 sub guests {
1674 my $self = shift;
1675 $self->_set_info unless ($self->{inform}->{name});
1676 return $self->{inform}->{guests};
1677 }
1678
1679 =head2 links
1680
1681 my $links = @{ $node->links };
1682
1683 Return array of links for this node
1684
1685 =cut
1686
1687 sub links {
1688 my $self = shift;
1689 $self->_set_info unless ($self->{inform}->{name});
1690 return $self->{inform}->{links};
1691 }
1692
1693
1694 =head1 PRIVATE METHODS
1695
1696 You could call those directly, but you don't have to. I hope.
1697
1698 =head2 _set_info
1699
1700 Set information for node
1701
1702 $node->_set_info;
1703
1704 =cut
1705
1706 sub _set_info {
1707 my $self = shift;
1708
1709 $self->{status} = -1;
1710 return unless ($self->{url});
1711
1712 my $resbody;
1713 my $rv = $self->shuttle_url( $self->{url} . '/inform',
1714 'text/plain',
1715 undef,
1716 \$resbody,
1717 );
1718
1719 return if ($rv != 200 || !$resbody);
1720
1721 my @lines = split(/[\r\n]/,$resbody);
1722
1723 $self->{inform} = {};
1724
1725 ( $self->{inform}->{name}, $self->{inform}->{label}, $self->{inform}->{dnum},
1726 $self->{inform}->{wnum}, $self->{inform}->{size} ) = split(/\t/, shift @lines, 5);
1727
1728 return $resbody unless (@lines);
1729
1730 shift @lines;
1731
1732 while(my $admin = shift @lines) {
1733 push @{$self->{inform}->{admins}}, $admin;
1734 }
1735
1736 while(my $guest = shift @lines) {
1737 push @{$self->{inform}->{guests}}, $guest;
1738 }
1739
1740 while(my $link = shift @lines) {
1741 push @{$self->{inform}->{links}}, $link;
1742 }
1743
1744 return $resbody;
1745
1746 }
1747
1748 ###
1749
1750 =head1 EXPORT
1751
1752 Nothing.
1753
1754 =head1 SEE ALSO
1755
1756 L<http://hyperestraier.sourceforge.net/>
1757
1758 Hyper Estraier Ruby interface on which this module is based.
1759
1760 =head1 AUTHOR
1761
1762 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
1763
1764 Robert Klep E<lt>robert@klep.nameE<gt> contributed refactored search code
1765
1766 =head1 COPYRIGHT AND LICENSE
1767
1768 Copyright (C) 2005-2006 by Dobrica Pavlinusic
1769
1770 This library is free software; you can redistribute it and/or modify
1771 it under the GPL v2 or later.
1772
1773 =cut
1774
1775 1;

  ViewVC Help
Powered by ViewVC 1.1.26