/[Search-Estraier]/trunk/Estraier.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/Estraier.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 126 - (show annotations)
Sat May 6 21:38:14 2006 UTC (17 years, 11 months ago) by dpavlin
File size: 33661 byte(s)
Better implementation of search by Robert Klep <robert@klep.name>
1 package Search::Estraier;
2
3 use 5.008;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.06_1';
8
9 =head1 NAME
10
11 Search::Estraier - pure perl module to use Hyper Estraier search engine
12
13 =head1 SYNOPSIS
14
15 =head2 Simple indexer
16
17 use Search::Estraier;
18
19 # create and configure node
20 my $node = new Search::Estraier::Node(
21 url => 'http://localhost:1978/node/test',
22 user => 'admin',
23 passwd => 'admin'
24 );
25
26 # create document
27 my $doc = new Search::Estraier::Document;
28
29 # add attributes
30 $doc->add_attr('@uri', "http://estraier.gov/example.txt");
31 $doc->add_attr('@title', "Over the Rainbow");
32
33 # add body text to document
34 $doc->add_text("Somewhere over the rainbow. Way up high.");
35 $doc->add_text("There's a land that I heard of once in a lullaby.");
36
37 die "error: ", $node->status,"\n" unless (eval { $node->put_doc($doc) });
38
39 =head2 Simple searcher
40
41 use Search::Estraier;
42
43 # create and configure node
44 my $node = new Search::Estraier::Node(
45 url => 'http://localhost:1978/node/test',
46 user => 'admin',
47 passwd => 'admin',
48 croak_on_error => 1,
49 );
50
51 # create condition
52 my $cond = new Search::Estraier::Condition;
53
54 # set search phrase
55 $cond->set_phrase("rainbow AND lullaby");
56
57 my $nres = $node->search($cond, 0);
58
59 if (defined($nres)) {
60 print "Got ", $nres->hits, " results\n";
61
62 # for each document in results
63 for my $i ( 0 ... $nres->doc_num - 1 ) {
64 # get result document
65 my $rdoc = $nres->get_doc($i);
66 # display attribte
67 print "URI: ", $rdoc->attr('@uri'),"\n";
68 print "Title: ", $rdoc->attr('@title'),"\n";
69 print $rdoc->snippet,"\n";
70 }
71 } else {
72 die "error: ", $node->status,"\n";
73 }
74
75 =head1 DESCRIPTION
76
77 This module is implementation of node API of Hyper Estraier. Since it's
78 perl-only module with dependencies only on standard perl modules, it will
79 run on all platforms on which perl runs. It doesn't require compilation
80 or Hyper Estraier development files on target machine.
81
82 It is implemented as multiple packages which closly resamble Ruby
83 implementation. It also includes methods to manage nodes.
84
85 There are few examples in C<scripts> directory of this distribution.
86
87 =cut
88
89 =head1 Inheritable common methods
90
91 This methods should really move somewhere else.
92
93 =head2 _s
94
95 Remove multiple whitespaces from string, as well as whitespaces at beginning or end
96
97 my $text = $self->_s(" this is a text ");
98 $text = 'this is a text';
99
100 =cut
101
102 sub _s {
103 my $text = $_[1];
104 return unless defined($text);
105 $text =~ s/\s\s+/ /gs;
106 $text =~ s/^\s+//;
107 $text =~ s/\s+$//;
108 return $text;
109 }
110
111 package Search::Estraier::Document;
112
113 use Carp qw/croak confess/;
114
115 use Search::Estraier;
116 our @ISA = qw/Search::Estraier/;
117
118 =head1 Search::Estraier::Document
119
120 This class implements Document which is collection of attributes
121 (key=value), vectors (also key value) display text and hidden text.
122
123
124 =head2 new
125
126 Create new document, empty or from draft.
127
128 my $doc = new Search::HyperEstraier::Document;
129 my $doc2 = new Search::HyperEstraier::Document( $draft );
130
131 =cut
132
133 sub new {
134 my $class = shift;
135 my $self = {};
136 bless($self, $class);
137
138 $self->{id} = -1;
139
140 my $draft = shift;
141
142 if ($draft) {
143 my $in_text = 0;
144 foreach my $line (split(/\n/, $draft)) {
145
146 if ($in_text) {
147 if ($line =~ /^\t/) {
148 push @{ $self->{htexts} }, substr($line, 1);
149 } else {
150 push @{ $self->{dtexts} }, $line;
151 }
152 next;
153 }
154
155 if ($line =~ m/^%VECTOR\t(.+)$/) {
156 my @fields = split(/\t/, $1);
157 for my $i ( 0 .. ($#fields - 1) ) {
158 $self->{kwords}->{ $fields[ $i ] } = $fields[ $i + 1 ];
159 $i++;
160 }
161 next;
162 } elsif ($line =~ m/^%/) {
163 # What is this? comment?
164 #warn "$line\n";
165 next;
166 } elsif ($line =~ m/^$/) {
167 $in_text = 1;
168 next;
169 } elsif ($line =~ m/^(.+)=(.*)$/) {
170 $self->{attrs}->{ $1 } = $2;
171 next;
172 }
173
174 warn "draft ignored: '$line'\n";
175 }
176 }
177
178 $self ? return $self : return undef;
179 }
180
181
182 =head2 add_attr
183
184 Add an attribute.
185
186 $doc->add_attr( name => 'value' );
187
188 Delete attribute using
189
190 $doc->add_attr( name => undef );
191
192 =cut
193
194 sub add_attr {
195 my $self = shift;
196 my $attrs = {@_};
197
198 while (my ($name, $value) = each %{ $attrs }) {
199 if (! defined($value)) {
200 delete( $self->{attrs}->{ $self->_s($name) } );
201 } else {
202 $self->{attrs}->{ $self->_s($name) } = $self->_s($value);
203 }
204 }
205
206 return 1;
207 }
208
209
210 =head2 add_text
211
212 Add a sentence of text.
213
214 $doc->add_text('this is example text to display');
215
216 =cut
217
218 sub add_text {
219 my $self = shift;
220 my $text = shift;
221 return unless defined($text);
222
223 push @{ $self->{dtexts} }, $self->_s($text);
224 }
225
226
227 =head2 add_hidden_text
228
229 Add a hidden sentence.
230
231 $doc->add_hidden_text('this is example text just for search');
232
233 =cut
234
235 sub add_hidden_text {
236 my $self = shift;
237 my $text = shift;
238 return unless defined($text);
239
240 push @{ $self->{htexts} }, $self->_s($text);
241 }
242
243
244 =head2 id
245
246 Get the ID number of document. If the object has never been registred, C<-1> is returned.
247
248 print $doc->id;
249
250 =cut
251
252 sub id {
253 my $self = shift;
254 return $self->{id};
255 }
256
257
258 =head2 attr_names
259
260 Returns array with attribute names from document object.
261
262 my @attrs = $doc->attr_names;
263
264 =cut
265
266 sub attr_names {
267 my $self = shift;
268 return unless ($self->{attrs});
269 #croak "attr_names return array, not scalar" if (! wantarray);
270 return sort keys %{ $self->{attrs} };
271 }
272
273
274 =head2 attr
275
276 Returns value of an attribute.
277
278 my $value = $doc->attr( 'attribute' );
279
280 =cut
281
282 sub attr {
283 my $self = shift;
284 my $name = shift;
285 return unless (defined($name) && $self->{attrs});
286 return $self->{attrs}->{ $name };
287 }
288
289
290 =head2 texts
291
292 Returns array with text sentences.
293
294 my @texts = $doc->texts;
295
296 =cut
297
298 sub texts {
299 my $self = shift;
300 #confess "texts return array, not scalar" if (! wantarray);
301 return @{ $self->{dtexts} } if ($self->{dtexts});
302 }
303
304
305 =head2 cat_texts
306
307 Return whole text as single scalar.
308
309 my $text = $doc->cat_texts;
310
311 =cut
312
313 sub cat_texts {
314 my $self = shift;
315 return join(' ',@{ $self->{dtexts} }) if ($self->{dtexts});
316 }
317
318
319 =head2 dump_draft
320
321 Dump draft data from document object.
322
323 print $doc->dump_draft;
324
325 =cut
326
327 sub dump_draft {
328 my $self = shift;
329 my $draft;
330
331 foreach my $attr_name (sort keys %{ $self->{attrs} }) {
332 next unless defined(my $v = $self->{attrs}->{$attr_name});
333 $draft .= $attr_name . '=' . $v . "\n";
334 }
335
336 if ($self->{kwords}) {
337 $draft .= '%%VECTOR';
338 while (my ($key, $value) = each %{ $self->{kwords} }) {
339 $draft .= "\t$key\t$value";
340 }
341 $draft .= "\n";
342 }
343
344 $draft .= "\n";
345
346 $draft .= join("\n", @{ $self->{dtexts} }) . "\n" if ($self->{dtexts});
347 $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n" if ($self->{htexts});
348
349 return $draft;
350 }
351
352
353 =head2 delete
354
355 Empty document object
356
357 $doc->delete;
358
359 This function is addition to original Ruby API, and since it was included in C wrappers it's here as a
360 convinience. Document objects which go out of scope will be destroyed
361 automatically.
362
363 =cut
364
365 sub delete {
366 my $self = shift;
367
368 foreach my $data (qw/attrs dtexts stexts kwords/) {
369 delete($self->{$data});
370 }
371
372 $self->{id} = -1;
373
374 return 1;
375 }
376
377
378
379 package Search::Estraier::Condition;
380
381 use Carp qw/carp confess croak/;
382
383 use Search::Estraier;
384 our @ISA = qw/Search::Estraier/;
385
386 =head1 Search::Estraier::Condition
387
388 =head2 new
389
390 my $cond = new Search::HyperEstraier::Condition;
391
392 =cut
393
394 sub new {
395 my $class = shift;
396 my $self = {};
397 bless($self, $class);
398
399 $self->{max} = -1;
400 $self->{options} = 0;
401
402 $self ? return $self : return undef;
403 }
404
405
406 =head2 set_phrase
407
408 $cond->set_phrase('search phrase');
409
410 =cut
411
412 sub set_phrase {
413 my $self = shift;
414 $self->{phrase} = $self->_s( shift );
415 }
416
417
418 =head2 add_attr
419
420 $cond->add_attr('@URI STRINC /~dpavlin/');
421
422 =cut
423
424 sub add_attr {
425 my $self = shift;
426 my $attr = shift || return;
427 push @{ $self->{attrs} }, $self->_s( $attr );
428 }
429
430
431 =head2 set_order
432
433 $cond->set_order('@mdate NUMD');
434
435 =cut
436
437 sub set_order {
438 my $self = shift;
439 $self->{order} = shift;
440 }
441
442
443 =head2 set_max
444
445 $cond->set_max(42);
446
447 =cut
448
449 sub set_max {
450 my $self = shift;
451 my $max = shift;
452 croak "set_max needs number, not '$max'" unless ($max =~ m/^\d+$/);
453 $self->{max} = $max;
454 }
455
456
457 =head2 set_options
458
459 $cond->set_options( 'SURE' );
460
461 $cond->set_options( qw/AGITO NOIDF SIMPLE/ );
462
463 Possible options are:
464
465 =over 8
466
467 =item SURE
468
469 check every N-gram
470
471 =item USUAL
472
473 check every second N-gram
474
475 =item FAST
476
477 check every third N-gram
478
479 =item AGITO
480
481 check every fourth N-gram
482
483 =item NOIDF
484
485 don't perform TF-IDF tuning
486
487 =item SIMPLE
488
489 use simplified query phrase
490
491 =back
492
493 Skipping N-grams will speed up search, but reduce accuracy. Every call to C<set_options> will reset previous
494 options;
495
496 This option changed in version C<0.04> of this module. It's backwards compatibile.
497
498 =cut
499
500 my $options = {
501 SURE => 1 << 0,
502 USUAL => 1 << 1,
503 FAST => 1 << 2,
504 AGITO => 1 << 3,
505 NOIDF => 1 << 4,
506 SIMPLE => 1 << 10,
507 };
508
509 sub set_options {
510 my $self = shift;
511 my $opt = 0;
512 foreach my $option (@_) {
513 my $mask;
514 unless ($mask = $options->{$option}) {
515 if ($option eq '1') {
516 next;
517 } else {
518 croak "unknown option $option";
519 }
520 }
521 $opt += $mask;
522 }
523 $self->{options} = $opt;
524 }
525
526
527 =head2 phrase
528
529 Return search phrase.
530
531 print $cond->phrase;
532
533 =cut
534
535 sub phrase {
536 my $self = shift;
537 return $self->{phrase};
538 }
539
540
541 =head2 order
542
543 Return search result order.
544
545 print $cond->order;
546
547 =cut
548
549 sub order {
550 my $self = shift;
551 return $self->{order};
552 }
553
554
555 =head2 attrs
556
557 Return search result attrs.
558
559 my @cond_attrs = $cond->attrs;
560
561 =cut
562
563 sub attrs {
564 my $self = shift;
565 #croak "attrs return array, not scalar" if (! wantarray);
566 return @{ $self->{attrs} } if ($self->{attrs});
567 }
568
569
570 =head2 max
571
572 Return maximum number of results.
573
574 print $cond->max;
575
576 C<-1> is returned for unitialized value, C<0> is unlimited.
577
578 =cut
579
580 sub max {
581 my $self = shift;
582 return $self->{max};
583 }
584
585
586 =head2 options
587
588 Return options for this condition.
589
590 print $cond->options;
591
592 Options are returned in numerical form.
593
594 =cut
595
596 sub options {
597 my $self = shift;
598 return $self->{options};
599 }
600
601
602 =head2 set_skip
603
604 Set number of skipped documents from beginning of results
605
606 $cond->set_skip(42);
607
608 Similar to C<offset> in RDBMS.
609
610 =cut
611
612 sub set_skip {
613 my $self = shift;
614 $self->{skip} = shift;
615 }
616
617 =head2 skip
618
619 Return skip for this condition.
620
621 print $cond->skip;
622
623 =cut
624
625 sub skip {
626 my $self = shift;
627 return $self->{skip};
628 }
629
630
631 package Search::Estraier::ResultDocument;
632
633 use Carp qw/croak/;
634
635 #use Search::Estraier;
636 #our @ISA = qw/Search::Estraier/;
637
638 =head1 Search::Estraier::ResultDocument
639
640 =head2 new
641
642 my $rdoc = new Search::HyperEstraier::ResultDocument(
643 uri => 'http://localhost/document/uri/42',
644 attrs => {
645 foo => 1,
646 bar => 2,
647 },
648 snippet => 'this is a text of snippet'
649 keywords => 'this\tare\tkeywords'
650 );
651
652 =cut
653
654 sub new {
655 my $class = shift;
656 my $self = {@_};
657 bless($self, $class);
658
659 croak "missing uri for ResultDocument" unless defined($self->{uri});
660
661 $self ? return $self : return undef;
662 }
663
664
665 =head2 uri
666
667 Return URI of result document
668
669 print $rdoc->uri;
670
671 =cut
672
673 sub uri {
674 my $self = shift;
675 return $self->{uri};
676 }
677
678
679 =head2 attr_names
680
681 Returns array with attribute names from result document object.
682
683 my @attrs = $rdoc->attr_names;
684
685 =cut
686
687 sub attr_names {
688 my $self = shift;
689 croak "attr_names return array, not scalar" if (! wantarray);
690 return sort keys %{ $self->{attrs} };
691 }
692
693
694 =head2 attr
695
696 Returns value of an attribute.
697
698 my $value = $rdoc->attr( 'attribute' );
699
700 =cut
701
702 sub attr {
703 my $self = shift;
704 my $name = shift || return;
705 return $self->{attrs}->{ $name };
706 }
707
708
709 =head2 snippet
710
711 Return snippet from result document
712
713 print $rdoc->snippet;
714
715 =cut
716
717 sub snippet {
718 my $self = shift;
719 return $self->{snippet};
720 }
721
722
723 =head2 keywords
724
725 Return keywords from result document
726
727 print $rdoc->keywords;
728
729 =cut
730
731 sub keywords {
732 my $self = shift;
733 return $self->{keywords};
734 }
735
736
737 package Search::Estraier::NodeResult;
738
739 use Carp qw/croak/;
740
741 #use Search::Estraier;
742 #our @ISA = qw/Search::Estraier/;
743
744 =head1 Search::Estraier::NodeResult
745
746 =head2 new
747
748 my $res = new Search::HyperEstraier::NodeResult(
749 docs => @array_of_rdocs,
750 hits => %hash_with_hints,
751 );
752
753 =cut
754
755 sub new {
756 my $class = shift;
757 my $self = {@_};
758 bless($self, $class);
759
760 foreach my $f (qw/docs hints/) {
761 croak "missing $f for ResultDocument" unless defined($self->{$f});
762 }
763
764 $self ? return $self : return undef;
765 }
766
767
768 =head2 doc_num
769
770 Return number of documents
771
772 print $res->doc_num;
773
774 This will return real number of documents (limited by C<max>).
775 If you want to get total number of hits, see C<hits>.
776
777 =cut
778
779 sub doc_num {
780 my $self = shift;
781 return $#{$self->{docs}} + 1;
782 }
783
784
785 =head2 get_doc
786
787 Return single document
788
789 my $doc = $res->get_doc( 42 );
790
791 Returns undef if document doesn't exist.
792
793 =cut
794
795 sub get_doc {
796 my $self = shift;
797 my $num = shift;
798 croak "expect number as argument, not '$num'" unless ($num =~ m/^\d+$/);
799 return undef if ($num < 0 || $num > $self->{docs});
800 return $self->{docs}->[$num];
801 }
802
803
804 =head2 hint
805
806 Return specific hint from results.
807
808 print $res->hint( 'VERSION' );
809
810 Possible hints are: C<VERSION>, C<NODE>, C<HIT>, C<HINT#n>, C<DOCNUM>, C<WORDNUM>,
811 C<TIME>, C<LINK#n>, C<VIEW>.
812
813 =cut
814
815 sub hint {
816 my $self = shift;
817 my $key = shift || return;
818 return $self->{hints}->{$key};
819 }
820
821 =head2 hints
822
823 More perlish version of C<hint>. This one returns hash.
824
825 my %hints = $res->hints;
826
827 =cut
828
829 sub hints {
830 my $self = shift;
831 return $self->{hints};
832 }
833
834 =head2 hits
835
836 Syntaxtic sugar for total number of hits for this query
837
838 print $res->hits;
839
840 It's same as
841
842 print $res->hint('HIT');
843
844 but shorter.
845
846 =cut
847
848 sub hits {
849 my $self = shift;
850 return $self->{hints}->{'HIT'} || 0;
851 }
852
853 package Search::Estraier::Node;
854
855 use Carp qw/carp croak confess/;
856 use URI;
857 use MIME::Base64;
858 use IO::Socket::INET;
859 use URI::Escape qw/uri_escape/;
860
861 =head1 Search::Estraier::Node
862
863 =head2 new
864
865 my $node = new Search::HyperEstraier::Node;
866
867 or optionally with C<url> as parametar
868
869 my $node = new Search::HyperEstraier::Node( 'http://localhost:1978/node/test' );
870
871 or in more verbose form
872
873 my $node = new Search::HyperEstraier::Node(
874 url => 'http://localhost:1978/node/test',
875 debug => 1,
876 croak_on_error => 1
877 );
878
879 with following arguments:
880
881 =over 4
882
883 =item url
884
885 URL to node
886
887 =item debug
888
889 dumps a B<lot> of debugging output
890
891 =item croak_on_error
892
893 very helpful during development. It will croak on all errors instead of
894 silently returning C<-1> (which is convention of Hyper Estraier API in other
895 languages).
896
897 =back
898
899 =cut
900
901 sub new {
902 my $class = shift;
903 my $self = {
904 pxport => -1,
905 timeout => 0, # this used to be -1
906 wwidth => 480,
907 hwidth => 96,
908 awidth => 96,
909 status => -1,
910 };
911
912 bless($self, $class);
913
914 if ($#_ == 0) {
915 $self->{url} = shift;
916 } else {
917 my $args = {@_};
918
919 %$self = ( %$self, @_ );
920
921 warn "## Node debug on\n" if ($self->{debug});
922 }
923
924 $self->{inform} = {
925 dnum => -1,
926 wnum => -1,
927 size => -1.0,
928 };
929
930 $self ? return $self : return undef;
931 }
932
933
934 =head2 set_url
935
936 Specify URL to node server
937
938 $node->set_url('http://localhost:1978');
939
940 =cut
941
942 sub set_url {
943 my $self = shift;
944 $self->{url} = shift;
945 }
946
947
948 =head2 set_proxy
949
950 Specify proxy server to connect to node server
951
952 $node->set_proxy('proxy.example.com', 8080);
953
954 =cut
955
956 sub set_proxy {
957 my $self = shift;
958 my ($host,$port) = @_;
959 croak "proxy port must be number, not '$port'" unless ($port =~ m/^\d+$/);
960 $self->{pxhost} = $host;
961 $self->{pxport} = $port;
962 }
963
964
965 =head2 set_timeout
966
967 Specify timeout of connection in seconds
968
969 $node->set_timeout( 15 );
970
971 =cut
972
973 sub set_timeout {
974 my $self = shift;
975 my $sec = shift;
976 croak "timeout must be number, not '$sec'" unless ($sec =~ m/^\d+$/);
977 $self->{timeout} = $sec;
978 }
979
980
981 =head2 set_auth
982
983 Specify name and password for authentication to node server.
984
985 $node->set_auth('clint','eastwood');
986
987 =cut
988
989 sub set_auth {
990 my $self = shift;
991 my ($login,$passwd) = @_;
992 my $basic_auth = encode_base64( "$login:$passwd" );
993 chomp($basic_auth);
994 $self->{auth} = $basic_auth;
995 }
996
997
998 =head2 status
999
1000 Return status code of last request.
1001
1002 print $node->status;
1003
1004 C<-1> means connection failure.
1005
1006 =cut
1007
1008 sub status {
1009 my $self = shift;
1010 return $self->{status};
1011 }
1012
1013
1014 =head2 put_doc
1015
1016 Add a document
1017
1018 $node->put_doc( $document_draft ) or die "can't add document";
1019
1020 Return true on success or false on failture.
1021
1022 =cut
1023
1024 sub put_doc {
1025 my $self = shift;
1026 my $doc = shift || return;
1027 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
1028 $self->shuttle_url( $self->{url} . '/put_doc',
1029 'text/x-estraier-draft',
1030 $doc->dump_draft,
1031 undef
1032 ) == 200;
1033 }
1034
1035
1036 =head2 out_doc
1037
1038 Remove a document
1039
1040 $node->out_doc( document_id ) or "can't remove document";
1041
1042 Return true on success or false on failture.
1043
1044 =cut
1045
1046 sub out_doc {
1047 my $self = shift;
1048 my $id = shift || return;
1049 return unless ($self->{url});
1050 croak "id must be number, not '$id'" unless ($id =~ m/^\d+$/);
1051 $self->shuttle_url( $self->{url} . '/out_doc',
1052 'application/x-www-form-urlencoded',
1053 "id=$id",
1054 undef
1055 ) == 200;
1056 }
1057
1058
1059 =head2 out_doc_by_uri
1060
1061 Remove a registrated document using it's uri
1062
1063 $node->out_doc_by_uri( 'file:///document/uri/42' ) or "can't remove document";
1064
1065 Return true on success or false on failture.
1066
1067 =cut
1068
1069 sub out_doc_by_uri {
1070 my $self = shift;
1071 my $uri = shift || return;
1072 return unless ($self->{url});
1073 $self->shuttle_url( $self->{url} . '/out_doc',
1074 'application/x-www-form-urlencoded',
1075 "uri=" . uri_escape($uri),
1076 undef
1077 ) == 200;
1078 }
1079
1080
1081 =head2 edit_doc
1082
1083 Edit attributes of a document
1084
1085 $node->edit_doc( $document_draft ) or die "can't edit document";
1086
1087 Return true on success or false on failture.
1088
1089 =cut
1090
1091 sub edit_doc {
1092 my $self = shift;
1093 my $doc = shift || return;
1094 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
1095 $self->shuttle_url( $self->{url} . '/edit_doc',
1096 'text/x-estraier-draft',
1097 $doc->dump_draft,
1098 undef
1099 ) == 200;
1100 }
1101
1102
1103 =head2 get_doc
1104
1105 Retreive document
1106
1107 my $doc = $node->get_doc( document_id ) or die "can't get document";
1108
1109 Return true on success or false on failture.
1110
1111 =cut
1112
1113 sub get_doc {
1114 my $self = shift;
1115 my $id = shift || return;
1116 return $self->_fetch_doc( id => $id );
1117 }
1118
1119
1120 =head2 get_doc_by_uri
1121
1122 Retreive document
1123
1124 my $doc = $node->get_doc_by_uri( 'file:///document/uri/42' ) or die "can't get document";
1125
1126 Return true on success or false on failture.
1127
1128 =cut
1129
1130 sub get_doc_by_uri {
1131 my $self = shift;
1132 my $uri = shift || return;
1133 return $self->_fetch_doc( uri => $uri );
1134 }
1135
1136
1137 =head2 get_doc_attr
1138
1139 Retrieve the value of an atribute from object
1140
1141 my $val = $node->get_doc_attr( document_id, 'attribute_name' ) or
1142 die "can't get document attribute";
1143
1144 =cut
1145
1146 sub get_doc_attr {
1147 my $self = shift;
1148 my ($id,$name) = @_;
1149 return unless ($id && $name);
1150 return $self->_fetch_doc( id => $id, attr => $name );
1151 }
1152
1153
1154 =head2 get_doc_attr_by_uri
1155
1156 Retrieve the value of an atribute from object
1157
1158 my $val = $node->get_doc_attr_by_uri( document_id, 'attribute_name' ) or
1159 die "can't get document attribute";
1160
1161 =cut
1162
1163 sub get_doc_attr_by_uri {
1164 my $self = shift;
1165 my ($uri,$name) = @_;
1166 return unless ($uri && $name);
1167 return $self->_fetch_doc( uri => $uri, attr => $name );
1168 }
1169
1170
1171 =head2 etch_doc
1172
1173 Exctract document keywords
1174
1175 my $keywords = $node->etch_doc( document_id ) or die "can't etch document";
1176
1177 =cut
1178
1179 sub etch_doc {
1180 my $self = shift;
1181 my $id = shift || return;
1182 return $self->_fetch_doc( id => $id, etch => 1 );
1183 }
1184
1185 =head2 etch_doc_by_uri
1186
1187 Retreive document
1188
1189 my $keywords = $node->etch_doc_by_uri( 'file:///document/uri/42' ) or die "can't etch document";
1190
1191 Return true on success or false on failture.
1192
1193 =cut
1194
1195 sub etch_doc_by_uri {
1196 my $self = shift;
1197 my $uri = shift || return;
1198 return $self->_fetch_doc( uri => $uri, etch => 1 );
1199 }
1200
1201
1202 =head2 uri_to_id
1203
1204 Get ID of document specified by URI
1205
1206 my $id = $node->uri_to_id( 'file:///document/uri/42' );
1207
1208 This method won't croak, even if using C<croak_on_error>.
1209
1210 =cut
1211
1212 sub uri_to_id {
1213 my $self = shift;
1214 my $uri = shift || return;
1215 return $self->_fetch_doc( uri => $uri, path => '/uri_to_id', chomp_resbody => 1, croak_on_error => 0 );
1216 }
1217
1218
1219 =head2 _fetch_doc
1220
1221 Private function used for implementing of C<get_doc>, C<get_doc_by_uri>,
1222 C<etch_doc>, C<etch_doc_by_uri>.
1223
1224 # this will decode received draft into Search::Estraier::Document object
1225 my $doc = $node->_fetch_doc( id => 42 );
1226 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42' );
1227
1228 # to extract keywords, add etch
1229 my $doc = $node->_fetch_doc( id => 42, etch => 1 );
1230 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', etch => 1 );
1231
1232 # to get document attrubute add attr
1233 my $doc = $node->_fetch_doc( id => 42, attr => '@mdate' );
1234 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', attr => '@mdate' );
1235
1236 # more general form which allows implementation of
1237 # uri_to_id
1238 my $id = $node->_fetch_doc(
1239 uri => 'file:///document/uri/42',
1240 path => '/uri_to_id',
1241 chomp_resbody => 1
1242 );
1243
1244 =cut
1245
1246 sub _fetch_doc {
1247 my $self = shift;
1248 my $a = {@_};
1249 return unless ( ($a->{id} || $a->{uri}) && $self->{url} );
1250
1251 my ($arg, $resbody);
1252
1253 my $path = $a->{path} || '/get_doc';
1254 $path = '/etch_doc' if ($a->{etch});
1255
1256 if ($a->{id}) {
1257 croak "id must be numberm not '$a->{id}'" unless ($a->{id} =~ m/^\d+$/);
1258 $arg = 'id=' . $a->{id};
1259 } elsif ($a->{uri}) {
1260 $arg = 'uri=' . uri_escape($a->{uri});
1261 } else {
1262 confess "unhandled argument. Need id or uri.";
1263 }
1264
1265 if ($a->{attr}) {
1266 $path = '/get_doc_attr';
1267 $arg .= '&attr=' . uri_escape($a->{attr});
1268 $a->{chomp_resbody} = 1;
1269 }
1270
1271 my $rv = $self->shuttle_url( $self->{url} . $path,
1272 'application/x-www-form-urlencoded',
1273 $arg,
1274 \$resbody,
1275 $a->{croak_on_error},
1276 );
1277
1278 return if ($rv != 200);
1279
1280 if ($a->{etch}) {
1281 $self->{kwords} = {};
1282 return +{} unless ($resbody);
1283 foreach my $l (split(/\n/, $resbody)) {
1284 my ($k,$v) = split(/\t/, $l, 2);
1285 $self->{kwords}->{$k} = $v if ($v);
1286 }
1287 return $self->{kwords};
1288 } elsif ($a->{chomp_resbody}) {
1289 return unless (defined($resbody));
1290 chomp($resbody);
1291 return $resbody;
1292 } else {
1293 return new Search::Estraier::Document($resbody);
1294 }
1295 }
1296
1297
1298 =head2 name
1299
1300 my $node_name = $node->name;
1301
1302 =cut
1303
1304 sub name {
1305 my $self = shift;
1306 $self->_set_info unless ($self->{inform}->{name});
1307 return $self->{inform}->{name};
1308 }
1309
1310
1311 =head2 label
1312
1313 my $node_label = $node->label;
1314
1315 =cut
1316
1317 sub label {
1318 my $self = shift;
1319 $self->_set_info unless ($self->{inform}->{label});
1320 return $self->{inform}->{label};
1321 }
1322
1323
1324 =head2 doc_num
1325
1326 my $documents_in_node = $node->doc_num;
1327
1328 =cut
1329
1330 sub doc_num {
1331 my $self = shift;
1332 $self->_set_info if ($self->{inform}->{dnum} < 0);
1333 return $self->{inform}->{dnum};
1334 }
1335
1336
1337 =head2 word_num
1338
1339 my $words_in_node = $node->word_num;
1340
1341 =cut
1342
1343 sub word_num {
1344 my $self = shift;
1345 $self->_set_info if ($self->{inform}->{wnum} < 0);
1346 return $self->{inform}->{wnum};
1347 }
1348
1349
1350 =head2 size
1351
1352 my $node_size = $node->size;
1353
1354 =cut
1355
1356 sub size {
1357 my $self = shift;
1358 $self->_set_info if ($self->{inform}->{size} < 0);
1359 return $self->{inform}->{size};
1360 }
1361
1362
1363 =head2 search
1364
1365 Search documents which match condition
1366
1367 my $nres = $node->search( $cond, $depth );
1368
1369 C<$cond> is C<Search::Estraier::Condition> object, while <$depth> specifies
1370 depth for meta search.
1371
1372 Function results C<Search::Estraier::NodeResult> object.
1373
1374 =cut
1375
1376 sub search {
1377 my $self = shift;
1378 my ($cond, $depth) = @_;
1379 return unless ($cond && defined($depth) && $self->{url});
1380 croak "cond mush be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1381 croak "depth needs number, not '$depth'" unless ($depth =~ m/^\d+$/);
1382
1383 my $resbody;
1384
1385 my $rv = $self->shuttle_url( $self->{url} . '/search',
1386 'application/x-www-form-urlencoded',
1387 $self->cond_to_query( $cond, $depth ),
1388 \$resbody,
1389 );
1390 return if ($rv != 200);
1391
1392 my (@docs, $hints);
1393
1394 my @lines = split(/\n/, $resbody);
1395 return unless (@lines);
1396
1397 my $border = $lines[0];
1398 my $isend = 0;
1399 my $lnum = 1;
1400
1401 while ( $lnum <= $#lines ) {
1402 my $line = $lines[$lnum];
1403 $lnum++;
1404
1405 #warn "## $line\n";
1406 if ($line && $line =~ m/^\Q$border\E(:END)*$/) {
1407 $isend = $1;
1408 last;
1409 }
1410
1411 if ($line =~ /\t/) {
1412 my ($k,$v) = split(/\t/, $line, 2);
1413 $hints->{$k} = $v;
1414 }
1415 }
1416
1417 my $snum = $lnum;
1418
1419 while( ! $isend && $lnum <= $#lines ) {
1420 my $line = $lines[$lnum];
1421 #warn "# $lnum: $line\n";
1422 $lnum++;
1423
1424 if ($line && $line =~ m/^\Q$border\E/) {
1425 if ($lnum > $snum) {
1426 my $rdattrs;
1427 my $rdvector;
1428 my $rdsnippet;
1429
1430 my $rlnum = $snum;
1431 while ($rlnum < $lnum - 1 ) {
1432 #my $rdline = $self->_s($lines[$rlnum]);
1433 my $rdline = $lines[$rlnum];
1434 $rlnum++;
1435 last unless ($rdline);
1436 if ($rdline =~ /^%/) {
1437 $rdvector = $1 if ($rdline =~ /^%VECTOR\t(.+)$/);
1438 } elsif($rdline =~ /=/) {
1439 $rdattrs->{$1} = $2 if ($rdline =~ /^(.+)=(.+)$/);
1440 } else {
1441 confess "invalid format of response";
1442 }
1443 }
1444 while($rlnum < $lnum - 1) {
1445 my $rdline = $lines[$rlnum];
1446 $rlnum++;
1447 $rdsnippet .= "$rdline\n";
1448 }
1449 #warn Dumper($rdvector, $rdattrs, $rdsnippet);
1450 if (my $rduri = $rdattrs->{'@uri'}) {
1451 push @docs, new Search::Estraier::ResultDocument(
1452 uri => $rduri,
1453 attrs => $rdattrs,
1454 snippet => $rdsnippet,
1455 keywords => $rdvector,
1456 );
1457 }
1458 }
1459 $snum = $lnum;
1460 #warn "### $line\n";
1461 $isend = 1 if ($line =~ /:END$/);
1462 }
1463
1464 }
1465
1466 if (! $isend) {
1467 warn "received result doesn't have :END\n$resbody";
1468 return;
1469 }
1470
1471 #warn Dumper(\@docs, $hints);
1472
1473 return new Search::Estraier::NodeResult( docs => \@docs, hints => $hints );
1474 }
1475
1476 =head2 search_new
1477
1478 Better implementation of search by Robert Klep <robert@klep.name>
1479
1480 =cut
1481
1482 sub search_new {
1483 my $self = shift;
1484 my ($cond, $depth) = @_;
1485 return unless ($cond && defined($depth) && $self->{url});
1486 croak "cond mush be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1487 croak "depth needs number, not '$depth'" unless ($depth =~ m/^\d+$/);
1488
1489 my $resbody;
1490
1491 my $rv = $self->shuttle_url( $self->{url} . '/search',
1492 'application/x-www-form-urlencoded',
1493 $self->cond_to_query( $cond, $depth ),
1494 \$resbody,
1495 );
1496 return if ($rv != 200);
1497
1498 my @records = split /--------\[.*?\]--------(?::END)?\r?\n/, $resbody;
1499 my $hintsText = splice @records, 0, 2; # starts with empty record
1500 my $hints = { $hintsText =~ m/^(.*?)\t(.*?)$/gsm };
1501
1502 # process records
1503 my $docs;
1504 foreach my $record (@records)
1505 {
1506 # split into keys and snippets
1507 my ($keys, $snippet) = $record =~ m/^(.*?)\n\n(.*?)$/s;
1508
1509 # create document hash
1510 my $doc = { $keys =~ m/^(.*?)=(.*?)$/gsm };
1511 $doc->{'@keywords'} = $doc->{keywords};
1512 ($doc->{keywords}) = $keys =~ m/^%VECTOR\t(.*?)$/gm;
1513 $doc->{snippet} = $snippet;
1514
1515 push @$docs, new Search::Estraier::ResultDocument(
1516 attrs => $doc,
1517 uri => $doc->{'@uri'},
1518 snippet => $snippet,
1519 keywords => $doc->{'keywords'},
1520 );
1521 }
1522
1523 return new Search::Estraier::NodeResult( docs => $docs, hints => $hints );
1524 }
1525
1526
1527 =head2 cond_to_query
1528
1529 Return URI encoded string generated from Search::Estraier::Condition
1530
1531 my $args = $node->cond_to_query( $cond, $depth );
1532
1533 =cut
1534
1535 sub cond_to_query {
1536 my $self = shift;
1537
1538 my $cond = shift || return;
1539 croak "condition must be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1540 my $depth = shift;
1541
1542 my @args;
1543
1544 if (my $phrase = $cond->phrase) {
1545 push @args, 'phrase=' . uri_escape($phrase);
1546 }
1547
1548 if (my @attrs = $cond->attrs) {
1549 for my $i ( 0 .. $#attrs ) {
1550 push @args,'attr' . ($i+1) . '=' . uri_escape( $attrs[$i] ) if ($attrs[$i]);
1551 }
1552 }
1553
1554 if (my $order = $cond->order) {
1555 push @args, 'order=' . uri_escape($order);
1556 }
1557
1558 if (my $max = $cond->max) {
1559 push @args, 'max=' . $max;
1560 } else {
1561 push @args, 'max=' . (1 << 30);
1562 }
1563
1564 if (my $options = $cond->options) {
1565 push @args, 'options=' . $options;
1566 }
1567
1568 push @args, 'depth=' . $depth if ($depth);
1569 push @args, 'wwidth=' . $self->{wwidth};
1570 push @args, 'hwidth=' . $self->{hwidth};
1571 push @args, 'awidth=' . $self->{awidth};
1572 push @args, 'skip=' . $self->{skip} if ($self->{skip});
1573
1574 return join('&', @args);
1575 }
1576
1577
1578 =head2 shuttle_url
1579
1580 This is method which uses C<LWP::UserAgent> to communicate with Hyper Estraier node
1581 master.
1582
1583 my $rv = shuttle_url( $url, $content_type, $req_body, \$resbody );
1584
1585 C<$resheads> and C<$resbody> booleans controll if response headers and/or response
1586 body will be saved within object.
1587
1588 =cut
1589
1590 use LWP::UserAgent;
1591
1592 sub shuttle_url {
1593 my $self = shift;
1594
1595 my ($url, $content_type, $reqbody, $resbody, $croak_on_error) = @_;
1596
1597 $croak_on_error = $self->{croak_on_error} unless defined($croak_on_error);
1598
1599 $self->{status} = -1;
1600
1601 warn "## $url\n" if ($self->{debug});
1602
1603 $url = new URI($url);
1604 if (
1605 !$url || !$url->scheme || !$url->scheme eq 'http' ||
1606 !$url->host || !$url->port || $url->port < 1
1607 ) {
1608 carp "can't parse $url\n";
1609 return -1;
1610 }
1611
1612 my $ua = LWP::UserAgent->new;
1613 $ua->agent( "Search-Estraier/$Search::Estraier::VERSION" );
1614
1615 my $req;
1616 if ($reqbody) {
1617 $req = HTTP::Request->new(POST => $url);
1618 } else {
1619 $req = HTTP::Request->new(GET => $url);
1620 }
1621
1622 $req->headers->header( 'Host' => $url->host . ":" . $url->port );
1623 $req->headers->header( 'Connection', 'close' );
1624 $req->headers->header( 'Authorization', 'Basic ' . $self->{auth} ) if ($self->{auth});
1625 $req->content_type( $content_type );
1626
1627 warn $req->headers->as_string,"\n" if ($self->{debug});
1628
1629 if ($reqbody) {
1630 warn "$reqbody\n" if ($self->{debug});
1631 $req->content( $reqbody );
1632 }
1633
1634 my $res = $ua->request($req) || croak "can't make request to $url: $!";
1635
1636 warn "## response status: ",$res->status_line,"\n" if ($self->{debug});
1637
1638 ($self->{status}, $self->{status_message}) = split(/\s+/, $res->status_line, 2);
1639
1640 if (! $res->is_success) {
1641 if ($croak_on_error) {
1642 croak("can't get $url: ",$res->status_line);
1643 } else {
1644 return -1;
1645 }
1646 }
1647
1648 $$resbody .= $res->content;
1649
1650 warn "## response body:\n$$resbody\n" if ($resbody && $self->{debug});
1651
1652 return $self->{status};
1653 }
1654
1655
1656 =head2 set_snippet_width
1657
1658 Set width of snippets in results
1659
1660 $node->set_snippet_width( $wwidth, $hwidth, $awidth );
1661
1662 C<$wwidth> specifies whole width of snippet. It's C<480> by default. If it's C<0> snippet
1663 is not sent with results. If it is negative, whole document text is sent instead of snippet.
1664
1665 C<$hwidth> specified width of strings from beginning of string. Default
1666 value is C<96>. Negative or zero value keep previous value.
1667
1668 C<$awidth> specifies width of strings around each highlighted word. It's C<96> by default.
1669 If negative of zero value is provided previous value is kept unchanged.
1670
1671 =cut
1672
1673 sub set_snippet_width {
1674 my $self = shift;
1675
1676 my ($wwidth, $hwidth, $awidth) = @_;
1677 $self->{wwidth} = $wwidth;
1678 $self->{hwidth} = $hwidth if ($hwidth >= 0);
1679 $self->{awidth} = $awidth if ($awidth >= 0);
1680 }
1681
1682
1683 =head2 set_user
1684
1685 Manage users of node
1686
1687 $node->set_user( 'name', $mode );
1688
1689 C<$mode> can be one of:
1690
1691 =over 4
1692
1693 =item 0
1694
1695 delete account
1696
1697 =item 1
1698
1699 set administrative right for user
1700
1701 =item 2
1702
1703 set user account as guest
1704
1705 =back
1706
1707 Return true on success, otherwise false.
1708
1709 =cut
1710
1711 sub set_user {
1712 my $self = shift;
1713 my ($name, $mode) = @_;
1714
1715 return unless ($self->{url});
1716 croak "mode must be number, not '$mode'" unless ($mode =~ m/^\d+$/);
1717
1718 $self->shuttle_url( $self->{url} . '/_set_user',
1719 'text/plain',
1720 'name=' . uri_escape($name) . '&mode=' . $mode,
1721 undef
1722 ) == 200;
1723 }
1724
1725
1726 =head2 set_link
1727
1728 Manage node links
1729
1730 $node->set_link('http://localhost:1978/node/another', 'another node label', $credit);
1731
1732 If C<$credit> is negative, link is removed.
1733
1734 =cut
1735
1736 sub set_link {
1737 my $self = shift;
1738 my ($url, $label, $credit) = @_;
1739
1740 return unless ($self->{url});
1741 croak "mode credit be number, not '$credit'" unless ($credit =~ m/^\d+$/);
1742
1743 my $reqbody = 'url=' . uri_escape($url) . '&label=' . uri_escape($label);
1744 $reqbody .= '&credit=' . $credit if ($credit > 0);
1745
1746 if ($self->shuttle_url( $self->{url} . '/_set_link',
1747 'application/x-www-form-urlencoded',
1748 $reqbody,
1749 undef
1750 ) == 200) {
1751 # refresh node info after adding link
1752 $self->_set_info;
1753 return 1;
1754 }
1755 }
1756
1757 =head2 admins
1758
1759 my @admins = @{ $node->admins };
1760
1761 Return array of users with admin rights on node
1762
1763 =cut
1764
1765 sub admins {
1766 my $self = shift;
1767 $self->_set_info unless ($self->{inform}->{name});
1768 return $self->{inform}->{admins};
1769 }
1770
1771 =head2 guests
1772
1773 my @guests = @{ $node->guests };
1774
1775 Return array of users with guest rights on node
1776
1777 =cut
1778
1779 sub guests {
1780 my $self = shift;
1781 $self->_set_info unless ($self->{inform}->{name});
1782 return $self->{inform}->{guests};
1783 }
1784
1785 =head2 links
1786
1787 my $links = @{ $node->links };
1788
1789 Return array of links for this node
1790
1791 =cut
1792
1793 sub links {
1794 my $self = shift;
1795 $self->_set_info unless ($self->{inform}->{name});
1796 return $self->{inform}->{links};
1797 }
1798
1799
1800 =head1 PRIVATE METHODS
1801
1802 You could call those directly, but you don't have to. I hope.
1803
1804 =head2 _set_info
1805
1806 Set information for node
1807
1808 $node->_set_info;
1809
1810 =cut
1811
1812 sub _set_info {
1813 my $self = shift;
1814
1815 $self->{status} = -1;
1816 return unless ($self->{url});
1817
1818 my $resbody;
1819 my $rv = $self->shuttle_url( $self->{url} . '/inform',
1820 'text/plain',
1821 undef,
1822 \$resbody,
1823 );
1824
1825 return if ($rv != 200 || !$resbody);
1826
1827 my @lines = split(/[\r\n]/,$resbody);
1828
1829 $self->{inform} = {};
1830
1831 ( $self->{inform}->{name}, $self->{inform}->{label}, $self->{inform}->{dnum},
1832 $self->{inform}->{wnum}, $self->{inform}->{size} ) = split(/\t/, shift @lines, 5);
1833
1834 return $resbody unless (@lines);
1835
1836 shift @lines;
1837
1838 while(my $admin = shift @lines) {
1839 push @{$self->{inform}->{admins}}, $admin;
1840 }
1841
1842 while(my $guest = shift @lines) {
1843 push @{$self->{inform}->{guests}}, $guest;
1844 }
1845
1846 while(my $link = shift @lines) {
1847 push @{$self->{inform}->{links}}, $link;
1848 }
1849
1850 return $resbody;
1851
1852 }
1853
1854 ###
1855
1856 =head1 EXPORT
1857
1858 Nothing.
1859
1860 =head1 SEE ALSO
1861
1862 L<http://hyperestraier.sourceforge.net/>
1863
1864 Hyper Estraier Ruby interface on which this module is based.
1865
1866 =head1 AUTHOR
1867
1868 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
1869
1870
1871 =head1 COPYRIGHT AND LICENSE
1872
1873 Copyright (C) 2005-2006 by Dobrica Pavlinusic
1874
1875 This library is free software; you can redistribute it and/or modify
1876 it under the GPL v2 or later.
1877
1878 =cut
1879
1880 1;

  ViewVC Help
Powered by ViewVC 1.1.26