/[Search-Estraier]/trunk/lib/Search/Estraier.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/Search/Estraier.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 112 - (show annotations)
Sun Mar 12 15:20:06 2006 UTC (18 years ago) by dpavlin
Original Path: trunk/Estraier.pm
File size: 31869 byte(s)
version 0.04 ready for CPAN
1 package Search::Estraier;
2
3 use 5.008;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.04';
8
9 =head1 NAME
10
11 Search::Estraier - pure perl module to use Hyper Estraier search engine
12
13 =head1 SYNOPSIS
14
15 =head2 Simple indexer
16
17 use Search::Estraier;
18
19 # create and configure node
20 my $node = new Search::Estraier::Node(
21 url => 'http://localhost:1978/node/test',
22 user => 'admin',
23 passwd => 'admin'
24 );
25
26 # create document
27 my $doc = new Search::Estraier::Document;
28
29 # add attributes
30 $doc->add_attr('@uri', "http://estraier.gov/example.txt");
31 $doc->add_attr('@title', "Over the Rainbow");
32
33 # add body text to document
34 $doc->add_text("Somewhere over the rainbow. Way up high.");
35 $doc->add_text("There's a land that I heard of once in a lullaby.");
36
37 die "error: ", $node->status,"\n" unless (eval { $node->put_doc($doc) });
38
39 =head2 Simple searcher
40
41 use Search::Estraier;
42
43 # create and configure node
44 my $node = new Search::Estraier::Node(
45 url => 'http://localhost:1978/node/test',
46 user => 'admin',
47 passwd => 'admin',
48 croak_on_error => 1,
49 );
50
51 # create condition
52 my $cond = new Search::Estraier::Condition;
53
54 # set search phrase
55 $cond->set_phrase("rainbow AND lullaby");
56
57 my $nres = $node->search($cond, 0);
58
59 if (defined($nres)) {
60 print "Got ", $nres->hits, " results\n";
61
62 # for each document in results
63 for my $i ( 0 ... $nres->doc_num - 1 ) {
64 # get result document
65 my $rdoc = $nres->get_doc($i);
66 # display attribte
67 print "URI: ", $rdoc->attr('@uri'),"\n";
68 print "Title: ", $rdoc->attr('@title'),"\n";
69 print $rdoc->snippet,"\n";
70 }
71 } else {
72 die "error: ", $node->status,"\n";
73 }
74
75 =head1 DESCRIPTION
76
77 This module is implementation of node API of Hyper Estraier. Since it's
78 perl-only module with dependencies only on standard perl modules, it will
79 run on all platforms on which perl runs. It doesn't require compilation
80 or Hyper Estraier development files on target machine.
81
82 It is implemented as multiple packages which closly resamble Ruby
83 implementation. It also includes methods to manage nodes.
84
85 There are few examples in C<scripts> directory of this distribution.
86
87 =cut
88
89 =head1 Inheritable common methods
90
91 This methods should really move somewhere else.
92
93 =head2 _s
94
95 Remove multiple whitespaces from string, as well as whitespaces at beginning or end
96
97 my $text = $self->_s(" this is a text ");
98 $text = 'this is a text';
99
100 =cut
101
102 sub _s {
103 my $text = $_[1];
104 return unless defined($text);
105 $text =~ s/\s\s+/ /gs;
106 $text =~ s/^\s+//;
107 $text =~ s/\s+$//;
108 return $text;
109 }
110
111 package Search::Estraier::Document;
112
113 use Carp qw/croak confess/;
114
115 use Search::Estraier;
116 our @ISA = qw/Search::Estraier/;
117
118 =head1 Search::Estraier::Document
119
120 This class implements Document which is collection of attributes
121 (key=value), vectors (also key value) display text and hidden text.
122
123
124 =head2 new
125
126 Create new document, empty or from draft.
127
128 my $doc = new Search::HyperEstraier::Document;
129 my $doc2 = new Search::HyperEstraier::Document( $draft );
130
131 =cut
132
133 sub new {
134 my $class = shift;
135 my $self = {};
136 bless($self, $class);
137
138 $self->{id} = -1;
139
140 my $draft = shift;
141
142 if ($draft) {
143 my $in_text = 0;
144 foreach my $line (split(/\n/, $draft)) {
145
146 if ($in_text) {
147 if ($line =~ /^\t/) {
148 push @{ $self->{htexts} }, substr($line, 1);
149 } else {
150 push @{ $self->{dtexts} }, $line;
151 }
152 next;
153 }
154
155 if ($line =~ m/^%VECTOR\t(.+)$/) {
156 my @fields = split(/\t/, $1);
157 for my $i ( 0 .. ($#fields - 1) ) {
158 $self->{kwords}->{ $fields[ $i ] } = $fields[ $i + 1 ];
159 $i++;
160 }
161 next;
162 } elsif ($line =~ m/^%/) {
163 # What is this? comment?
164 #warn "$line\n";
165 next;
166 } elsif ($line =~ m/^$/) {
167 $in_text = 1;
168 next;
169 } elsif ($line =~ m/^(.+)=(.*)$/) {
170 $self->{attrs}->{ $1 } = $2;
171 next;
172 }
173
174 warn "draft ignored: '$line'\n";
175 }
176 }
177
178 $self ? return $self : return undef;
179 }
180
181
182 =head2 add_attr
183
184 Add an attribute.
185
186 $doc->add_attr( name => 'value' );
187
188 Delete attribute using
189
190 $doc->add_attr( name => undef );
191
192 =cut
193
194 sub add_attr {
195 my $self = shift;
196 my $attrs = {@_};
197
198 while (my ($name, $value) = each %{ $attrs }) {
199 if (! defined($value)) {
200 delete( $self->{attrs}->{ $self->_s($name) } );
201 } else {
202 $self->{attrs}->{ $self->_s($name) } = $self->_s($value);
203 }
204 }
205
206 return 1;
207 }
208
209
210 =head2 add_text
211
212 Add a sentence of text.
213
214 $doc->add_text('this is example text to display');
215
216 =cut
217
218 sub add_text {
219 my $self = shift;
220 my $text = shift;
221 return unless defined($text);
222
223 push @{ $self->{dtexts} }, $self->_s($text);
224 }
225
226
227 =head2 add_hidden_text
228
229 Add a hidden sentence.
230
231 $doc->add_hidden_text('this is example text just for search');
232
233 =cut
234
235 sub add_hidden_text {
236 my $self = shift;
237 my $text = shift;
238 return unless defined($text);
239
240 push @{ $self->{htexts} }, $self->_s($text);
241 }
242
243
244 =head2 id
245
246 Get the ID number of document. If the object has never been registred, C<-1> is returned.
247
248 print $doc->id;
249
250 =cut
251
252 sub id {
253 my $self = shift;
254 return $self->{id};
255 }
256
257
258 =head2 attr_names
259
260 Returns array with attribute names from document object.
261
262 my @attrs = $doc->attr_names;
263
264 =cut
265
266 sub attr_names {
267 my $self = shift;
268 return unless ($self->{attrs});
269 #croak "attr_names return array, not scalar" if (! wantarray);
270 return sort keys %{ $self->{attrs} };
271 }
272
273
274 =head2 attr
275
276 Returns value of an attribute.
277
278 my $value = $doc->attr( 'attribute' );
279
280 =cut
281
282 sub attr {
283 my $self = shift;
284 my $name = shift;
285 return unless (defined($name) && $self->{attrs});
286 return $self->{attrs}->{ $name };
287 }
288
289
290 =head2 texts
291
292 Returns array with text sentences.
293
294 my @texts = $doc->texts;
295
296 =cut
297
298 sub texts {
299 my $self = shift;
300 #confess "texts return array, not scalar" if (! wantarray);
301 return @{ $self->{dtexts} } if ($self->{dtexts});
302 }
303
304
305 =head2 cat_texts
306
307 Return whole text as single scalar.
308
309 my $text = $doc->cat_texts;
310
311 =cut
312
313 sub cat_texts {
314 my $self = shift;
315 return join(' ',@{ $self->{dtexts} }) if ($self->{dtexts});
316 }
317
318
319 =head2 dump_draft
320
321 Dump draft data from document object.
322
323 print $doc->dump_draft;
324
325 =cut
326
327 sub dump_draft {
328 my $self = shift;
329 my $draft;
330
331 foreach my $attr_name (sort keys %{ $self->{attrs} }) {
332 next unless defined(my $v = $self->{attrs}->{$attr_name});
333 $draft .= $attr_name . '=' . $v . "\n";
334 }
335
336 if ($self->{kwords}) {
337 $draft .= '%%VECTOR';
338 while (my ($key, $value) = each %{ $self->{kwords} }) {
339 $draft .= "\t$key\t$value";
340 }
341 $draft .= "\n";
342 }
343
344 $draft .= "\n";
345
346 $draft .= join("\n", @{ $self->{dtexts} }) . "\n" if ($self->{dtexts});
347 $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n" if ($self->{htexts});
348
349 return $draft;
350 }
351
352
353 =head2 delete
354
355 Empty document object
356
357 $doc->delete;
358
359 This function is addition to original Ruby API, and since it was included in C wrappers it's here as a
360 convinience. Document objects which go out of scope will be destroyed
361 automatically.
362
363 =cut
364
365 sub delete {
366 my $self = shift;
367
368 foreach my $data (qw/attrs dtexts stexts kwords/) {
369 delete($self->{$data});
370 }
371
372 $self->{id} = -1;
373
374 return 1;
375 }
376
377
378
379 package Search::Estraier::Condition;
380
381 use Carp qw/carp confess croak/;
382
383 use Search::Estraier;
384 our @ISA = qw/Search::Estraier/;
385
386 =head1 Search::Estraier::Condition
387
388 =head2 new
389
390 my $cond = new Search::HyperEstraier::Condition;
391
392 =cut
393
394 sub new {
395 my $class = shift;
396 my $self = {};
397 bless($self, $class);
398
399 $self->{max} = -1;
400 $self->{options} = 0;
401
402 $self ? return $self : return undef;
403 }
404
405
406 =head2 set_phrase
407
408 $cond->set_phrase('search phrase');
409
410 =cut
411
412 sub set_phrase {
413 my $self = shift;
414 $self->{phrase} = $self->_s( shift );
415 }
416
417
418 =head2 add_attr
419
420 $cond->add_attr('@URI STRINC /~dpavlin/');
421
422 =cut
423
424 sub add_attr {
425 my $self = shift;
426 my $attr = shift || return;
427 push @{ $self->{attrs} }, $self->_s( $attr );
428 }
429
430
431 =head2 set_order
432
433 $cond->set_order('@mdate NUMD');
434
435 =cut
436
437 sub set_order {
438 my $self = shift;
439 $self->{order} = shift;
440 }
441
442
443 =head2 set_max
444
445 $cond->set_max(42);
446
447 =cut
448
449 sub set_max {
450 my $self = shift;
451 my $max = shift;
452 croak "set_max needs number, not '$max'" unless ($max =~ m/^\d+$/);
453 $self->{max} = $max;
454 }
455
456
457 =head2 set_options
458
459 $cond->set_options( 'SURE' );
460
461 $cond->set_options( qw/AGITO NOIDF SIMPLE/ );
462
463 Possible options are:
464
465 =over 8
466
467 =item SURE
468
469 check every N-gram
470
471 =item USUAL
472
473 check every second N-gram
474
475 =item FAST
476
477 check every third N-gram
478
479 =item AGITO
480
481 check every fourth N-gram
482
483 =item NOIDF
484
485 don't perform TF-IDF tuning
486
487 =item SIMPLE
488
489 use simplified query phrase
490
491 =back
492
493 Skipping N-grams will speed up search, but reduce accuracy. Every call to C<set_options> will reset previous
494 options;
495
496 This option changed in version C<0.04> of this module. It's backwards compatibile.
497
498 =cut
499
500 my $options = {
501 SURE => 1 << 0,
502 USUAL => 1 << 1,
503 FAST => 1 << 2,
504 AGITO => 1 << 3,
505 NOIDF => 1 << 4,
506 SIMPLE => 1 << 10,
507 };
508
509 sub set_options {
510 my $self = shift;
511 my $opt = 0;
512 foreach my $option (@_) {
513 my $mask;
514 unless ($mask = $options->{$option}) {
515 if ($option eq '1') {
516 next;
517 } else {
518 croak "unknown option $option";
519 }
520 }
521 $opt += $mask;
522 }
523 $self->{options} = $opt;
524 }
525
526
527 =head2 phrase
528
529 Return search phrase.
530
531 print $cond->phrase;
532
533 =cut
534
535 sub phrase {
536 my $self = shift;
537 return $self->{phrase};
538 }
539
540
541 =head2 order
542
543 Return search result order.
544
545 print $cond->order;
546
547 =cut
548
549 sub order {
550 my $self = shift;
551 return $self->{order};
552 }
553
554
555 =head2 attrs
556
557 Return search result attrs.
558
559 my @cond_attrs = $cond->attrs;
560
561 =cut
562
563 sub attrs {
564 my $self = shift;
565 #croak "attrs return array, not scalar" if (! wantarray);
566 return @{ $self->{attrs} } if ($self->{attrs});
567 }
568
569
570 =head2 max
571
572 Return maximum number of results.
573
574 print $cond->max;
575
576 C<-1> is returned for unitialized value, C<0> is unlimited.
577
578 =cut
579
580 sub max {
581 my $self = shift;
582 return $self->{max};
583 }
584
585
586 =head2 options
587
588 Return options for this condition.
589
590 print $cond->options;
591
592 Options are returned in numerical form.
593
594 =cut
595
596 sub options {
597 my $self = shift;
598 return $self->{options};
599 }
600
601
602 package Search::Estraier::ResultDocument;
603
604 use Carp qw/croak/;
605
606 #use Search::Estraier;
607 #our @ISA = qw/Search::Estraier/;
608
609 =head1 Search::Estraier::ResultDocument
610
611 =head2 new
612
613 my $rdoc = new Search::HyperEstraier::ResultDocument(
614 uri => 'http://localhost/document/uri/42',
615 attrs => {
616 foo => 1,
617 bar => 2,
618 },
619 snippet => 'this is a text of snippet'
620 keywords => 'this\tare\tkeywords'
621 );
622
623 =cut
624
625 sub new {
626 my $class = shift;
627 my $self = {@_};
628 bless($self, $class);
629
630 croak "missing uri for ResultDocument" unless defined($self->{uri});
631
632 $self ? return $self : return undef;
633 }
634
635
636 =head2 uri
637
638 Return URI of result document
639
640 print $rdoc->uri;
641
642 =cut
643
644 sub uri {
645 my $self = shift;
646 return $self->{uri};
647 }
648
649
650 =head2 attr_names
651
652 Returns array with attribute names from result document object.
653
654 my @attrs = $rdoc->attr_names;
655
656 =cut
657
658 sub attr_names {
659 my $self = shift;
660 croak "attr_names return array, not scalar" if (! wantarray);
661 return sort keys %{ $self->{attrs} };
662 }
663
664
665 =head2 attr
666
667 Returns value of an attribute.
668
669 my $value = $rdoc->attr( 'attribute' );
670
671 =cut
672
673 sub attr {
674 my $self = shift;
675 my $name = shift || return;
676 return $self->{attrs}->{ $name };
677 }
678
679
680 =head2 snippet
681
682 Return snippet from result document
683
684 print $rdoc->snippet;
685
686 =cut
687
688 sub snippet {
689 my $self = shift;
690 return $self->{snippet};
691 }
692
693
694 =head2 keywords
695
696 Return keywords from result document
697
698 print $rdoc->keywords;
699
700 =cut
701
702 sub keywords {
703 my $self = shift;
704 return $self->{keywords};
705 }
706
707
708 package Search::Estraier::NodeResult;
709
710 use Carp qw/croak/;
711
712 #use Search::Estraier;
713 #our @ISA = qw/Search::Estraier/;
714
715 =head1 Search::Estraier::NodeResult
716
717 =head2 new
718
719 my $res = new Search::HyperEstraier::NodeResult(
720 docs => @array_of_rdocs,
721 hits => %hash_with_hints,
722 );
723
724 =cut
725
726 sub new {
727 my $class = shift;
728 my $self = {@_};
729 bless($self, $class);
730
731 foreach my $f (qw/docs hints/) {
732 croak "missing $f for ResultDocument" unless defined($self->{$f});
733 }
734
735 $self ? return $self : return undef;
736 }
737
738
739 =head2 doc_num
740
741 Return number of documents
742
743 print $res->doc_num;
744
745 This will return real number of documents (limited by C<max>).
746 If you want to get total number of hits, see C<hits>.
747
748 =cut
749
750 sub doc_num {
751 my $self = shift;
752 return $#{$self->{docs}} + 1;
753 }
754
755
756 =head2 get_doc
757
758 Return single document
759
760 my $doc = $res->get_doc( 42 );
761
762 Returns undef if document doesn't exist.
763
764 =cut
765
766 sub get_doc {
767 my $self = shift;
768 my $num = shift;
769 croak "expect number as argument, not '$num'" unless ($num =~ m/^\d+$/);
770 return undef if ($num < 0 || $num > $self->{docs});
771 return $self->{docs}->[$num];
772 }
773
774
775 =head2 hint
776
777 Return specific hint from results.
778
779 print $res->hint( 'VERSION' );
780
781 Possible hints are: C<VERSION>, C<NODE>, C<HIT>, C<HINT#n>, C<DOCNUM>, C<WORDNUM>,
782 C<TIME>, C<LINK#n>, C<VIEW>.
783
784 =cut
785
786 sub hint {
787 my $self = shift;
788 my $key = shift || return;
789 return $self->{hints}->{$key};
790 }
791
792 =head2 hints
793
794 More perlish version of C<hint>. This one returns hash.
795
796 my %hints = $res->hints;
797
798 =cut
799
800 sub hints {
801 my $self = shift;
802 return $self->{hints};
803 }
804
805 =head2 hits
806
807 Syntaxtic sugar for total number of hits for this query
808
809 print $res->hits;
810
811 It's same as
812
813 print $res->hint('HIT');
814
815 but shorter.
816
817 =cut
818
819 sub hits {
820 my $self = shift;
821 return $self->{hints}->{'HIT'} || 0;
822 }
823
824 package Search::Estraier::Node;
825
826 use Carp qw/carp croak confess/;
827 use URI;
828 use MIME::Base64;
829 use IO::Socket::INET;
830 use URI::Escape qw/uri_escape/;
831
832 =head1 Search::Estraier::Node
833
834 =head2 new
835
836 my $node = new Search::HyperEstraier::Node;
837
838 or optionally with C<url> as parametar
839
840 my $node = new Search::HyperEstraier::Node( 'http://localhost:1978/node/test' );
841
842 or in more verbose form
843
844 my $node = new Search::HyperEstraier::Node(
845 url => 'http://localhost:1978/node/test',
846 debug => 1,
847 croak_on_error => 1
848 );
849
850 with following arguments:
851
852 =over 4
853
854 =item url
855
856 URL to node
857
858 =item debug
859
860 dumps a B<lot> of debugging output
861
862 =item croak_on_error
863
864 very helpful during development. It will croak on all errors instead of
865 silently returning C<-1> (which is convention of Hyper Estraier API in other
866 languages).
867
868 =back
869
870 =cut
871
872 sub new {
873 my $class = shift;
874 my $self = {
875 pxport => -1,
876 timeout => 0, # this used to be -1
877 wwidth => 480,
878 hwidth => 96,
879 awidth => 96,
880 status => -1,
881 };
882
883 bless($self, $class);
884
885 if ($#_ == 0) {
886 $self->{url} = shift;
887 } else {
888 my $args = {@_};
889
890 %$self = ( %$self, @_ );
891
892 warn "## Node debug on\n" if ($self->{debug});
893 }
894
895 $self->{inform} = {
896 dnum => -1,
897 wnum => -1,
898 size => -1.0,
899 };
900
901 $self ? return $self : return undef;
902 }
903
904
905 =head2 set_url
906
907 Specify URL to node server
908
909 $node->set_url('http://localhost:1978');
910
911 =cut
912
913 sub set_url {
914 my $self = shift;
915 $self->{url} = shift;
916 }
917
918
919 =head2 set_proxy
920
921 Specify proxy server to connect to node server
922
923 $node->set_proxy('proxy.example.com', 8080);
924
925 =cut
926
927 sub set_proxy {
928 my $self = shift;
929 my ($host,$port) = @_;
930 croak "proxy port must be number, not '$port'" unless ($port =~ m/^\d+$/);
931 $self->{pxhost} = $host;
932 $self->{pxport} = $port;
933 }
934
935
936 =head2 set_timeout
937
938 Specify timeout of connection in seconds
939
940 $node->set_timeout( 15 );
941
942 =cut
943
944 sub set_timeout {
945 my $self = shift;
946 my $sec = shift;
947 croak "timeout must be number, not '$sec'" unless ($sec =~ m/^\d+$/);
948 $self->{timeout} = $sec;
949 }
950
951
952 =head2 set_auth
953
954 Specify name and password for authentication to node server.
955
956 $node->set_auth('clint','eastwood');
957
958 =cut
959
960 sub set_auth {
961 my $self = shift;
962 my ($login,$passwd) = @_;
963 my $basic_auth = encode_base64( "$login:$passwd" );
964 chomp($basic_auth);
965 $self->{auth} = $basic_auth;
966 }
967
968
969 =head2 status
970
971 Return status code of last request.
972
973 print $node->status;
974
975 C<-1> means connection failure.
976
977 =cut
978
979 sub status {
980 my $self = shift;
981 return $self->{status};
982 }
983
984
985 =head2 put_doc
986
987 Add a document
988
989 $node->put_doc( $document_draft ) or die "can't add document";
990
991 Return true on success or false on failture.
992
993 =cut
994
995 sub put_doc {
996 my $self = shift;
997 my $doc = shift || return;
998 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
999 $self->shuttle_url( $self->{url} . '/put_doc',
1000 'text/x-estraier-draft',
1001 $doc->dump_draft,
1002 undef
1003 ) == 200;
1004 }
1005
1006
1007 =head2 out_doc
1008
1009 Remove a document
1010
1011 $node->out_doc( document_id ) or "can't remove document";
1012
1013 Return true on success or false on failture.
1014
1015 =cut
1016
1017 sub out_doc {
1018 my $self = shift;
1019 my $id = shift || return;
1020 return unless ($self->{url});
1021 croak "id must be number, not '$id'" unless ($id =~ m/^\d+$/);
1022 $self->shuttle_url( $self->{url} . '/out_doc',
1023 'application/x-www-form-urlencoded',
1024 "id=$id",
1025 undef
1026 ) == 200;
1027 }
1028
1029
1030 =head2 out_doc_by_uri
1031
1032 Remove a registrated document using it's uri
1033
1034 $node->out_doc_by_uri( 'file:///document/uri/42' ) or "can't remove document";
1035
1036 Return true on success or false on failture.
1037
1038 =cut
1039
1040 sub out_doc_by_uri {
1041 my $self = shift;
1042 my $uri = shift || return;
1043 return unless ($self->{url});
1044 $self->shuttle_url( $self->{url} . '/out_doc',
1045 'application/x-www-form-urlencoded',
1046 "uri=" . uri_escape($uri),
1047 undef
1048 ) == 200;
1049 }
1050
1051
1052 =head2 edit_doc
1053
1054 Edit attributes of a document
1055
1056 $node->edit_doc( $document_draft ) or die "can't edit document";
1057
1058 Return true on success or false on failture.
1059
1060 =cut
1061
1062 sub edit_doc {
1063 my $self = shift;
1064 my $doc = shift || return;
1065 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
1066 $self->shuttle_url( $self->{url} . '/edit_doc',
1067 'text/x-estraier-draft',
1068 $doc->dump_draft,
1069 undef
1070 ) == 200;
1071 }
1072
1073
1074 =head2 get_doc
1075
1076 Retreive document
1077
1078 my $doc = $node->get_doc( document_id ) or die "can't get document";
1079
1080 Return true on success or false on failture.
1081
1082 =cut
1083
1084 sub get_doc {
1085 my $self = shift;
1086 my $id = shift || return;
1087 return $self->_fetch_doc( id => $id );
1088 }
1089
1090
1091 =head2 get_doc_by_uri
1092
1093 Retreive document
1094
1095 my $doc = $node->get_doc_by_uri( 'file:///document/uri/42' ) or die "can't get document";
1096
1097 Return true on success or false on failture.
1098
1099 =cut
1100
1101 sub get_doc_by_uri {
1102 my $self = shift;
1103 my $uri = shift || return;
1104 return $self->_fetch_doc( uri => $uri );
1105 }
1106
1107
1108 =head2 get_doc_attr
1109
1110 Retrieve the value of an atribute from object
1111
1112 my $val = $node->get_doc_attr( document_id, 'attribute_name' ) or
1113 die "can't get document attribute";
1114
1115 =cut
1116
1117 sub get_doc_attr {
1118 my $self = shift;
1119 my ($id,$name) = @_;
1120 return unless ($id && $name);
1121 return $self->_fetch_doc( id => $id, attr => $name );
1122 }
1123
1124
1125 =head2 get_doc_attr_by_uri
1126
1127 Retrieve the value of an atribute from object
1128
1129 my $val = $node->get_doc_attr_by_uri( document_id, 'attribute_name' ) or
1130 die "can't get document attribute";
1131
1132 =cut
1133
1134 sub get_doc_attr_by_uri {
1135 my $self = shift;
1136 my ($uri,$name) = @_;
1137 return unless ($uri && $name);
1138 return $self->_fetch_doc( uri => $uri, attr => $name );
1139 }
1140
1141
1142 =head2 etch_doc
1143
1144 Exctract document keywords
1145
1146 my $keywords = $node->etch_doc( document_id ) or die "can't etch document";
1147
1148 =cut
1149
1150 sub etch_doc {
1151 my $self = shift;
1152 my $id = shift || return;
1153 return $self->_fetch_doc( id => $id, etch => 1 );
1154 }
1155
1156 =head2 etch_doc_by_uri
1157
1158 Retreive document
1159
1160 my $keywords = $node->etch_doc_by_uri( 'file:///document/uri/42' ) or die "can't etch document";
1161
1162 Return true on success or false on failture.
1163
1164 =cut
1165
1166 sub etch_doc_by_uri {
1167 my $self = shift;
1168 my $uri = shift || return;
1169 return $self->_fetch_doc( uri => $uri, etch => 1 );
1170 }
1171
1172
1173 =head2 uri_to_id
1174
1175 Get ID of document specified by URI
1176
1177 my $id = $node->uri_to_id( 'file:///document/uri/42' );
1178
1179 This method won't croak, even if using C<croak_on_error>.
1180
1181 =cut
1182
1183 sub uri_to_id {
1184 my $self = shift;
1185 my $uri = shift || return;
1186 return $self->_fetch_doc( uri => $uri, path => '/uri_to_id', chomp_resbody => 1, croak_on_error => 0 );
1187 }
1188
1189
1190 =head2 _fetch_doc
1191
1192 Private function used for implementing of C<get_doc>, C<get_doc_by_uri>,
1193 C<etch_doc>, C<etch_doc_by_uri>.
1194
1195 # this will decode received draft into Search::Estraier::Document object
1196 my $doc = $node->_fetch_doc( id => 42 );
1197 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42' );
1198
1199 # to extract keywords, add etch
1200 my $doc = $node->_fetch_doc( id => 42, etch => 1 );
1201 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', etch => 1 );
1202
1203 # to get document attrubute add attr
1204 my $doc = $node->_fetch_doc( id => 42, attr => '@mdate' );
1205 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', attr => '@mdate' );
1206
1207 # more general form which allows implementation of
1208 # uri_to_id
1209 my $id = $node->_fetch_doc(
1210 uri => 'file:///document/uri/42',
1211 path => '/uri_to_id',
1212 chomp_resbody => 1
1213 );
1214
1215 =cut
1216
1217 sub _fetch_doc {
1218 my $self = shift;
1219 my $a = {@_};
1220 return unless ( ($a->{id} || $a->{uri}) && $self->{url} );
1221
1222 my ($arg, $resbody);
1223
1224 my $path = $a->{path} || '/get_doc';
1225 $path = '/etch_doc' if ($a->{etch});
1226
1227 if ($a->{id}) {
1228 croak "id must be numberm not '$a->{id}'" unless ($a->{id} =~ m/^\d+$/);
1229 $arg = 'id=' . $a->{id};
1230 } elsif ($a->{uri}) {
1231 $arg = 'uri=' . uri_escape($a->{uri});
1232 } else {
1233 confess "unhandled argument. Need id or uri.";
1234 }
1235
1236 if ($a->{attr}) {
1237 $path = '/get_doc_attr';
1238 $arg .= '&attr=' . uri_escape($a->{attr});
1239 $a->{chomp_resbody} = 1;
1240 }
1241
1242 my $rv = $self->shuttle_url( $self->{url} . $path,
1243 'application/x-www-form-urlencoded',
1244 $arg,
1245 \$resbody,
1246 $a->{croak_on_error},
1247 );
1248
1249 return if ($rv != 200);
1250
1251 if ($a->{etch}) {
1252 $self->{kwords} = {};
1253 return +{} unless ($resbody);
1254 foreach my $l (split(/\n/, $resbody)) {
1255 my ($k,$v) = split(/\t/, $l, 2);
1256 $self->{kwords}->{$k} = $v if ($v);
1257 }
1258 return $self->{kwords};
1259 } elsif ($a->{chomp_resbody}) {
1260 return unless (defined($resbody));
1261 chomp($resbody);
1262 return $resbody;
1263 } else {
1264 return new Search::Estraier::Document($resbody);
1265 }
1266 }
1267
1268
1269 =head2 name
1270
1271 my $node_name = $node->name;
1272
1273 =cut
1274
1275 sub name {
1276 my $self = shift;
1277 $self->_set_info unless ($self->{inform}->{name});
1278 return $self->{inform}->{name};
1279 }
1280
1281
1282 =head2 label
1283
1284 my $node_label = $node->label;
1285
1286 =cut
1287
1288 sub label {
1289 my $self = shift;
1290 $self->_set_info unless ($self->{inform}->{label});
1291 return $self->{inform}->{label};
1292 }
1293
1294
1295 =head2 doc_num
1296
1297 my $documents_in_node = $node->doc_num;
1298
1299 =cut
1300
1301 sub doc_num {
1302 my $self = shift;
1303 $self->_set_info if ($self->{inform}->{dnum} < 0);
1304 return $self->{inform}->{dnum};
1305 }
1306
1307
1308 =head2 word_num
1309
1310 my $words_in_node = $node->word_num;
1311
1312 =cut
1313
1314 sub word_num {
1315 my $self = shift;
1316 $self->_set_info if ($self->{inform}->{wnum} < 0);
1317 return $self->{inform}->{wnum};
1318 }
1319
1320
1321 =head2 size
1322
1323 my $node_size = $node->size;
1324
1325 =cut
1326
1327 sub size {
1328 my $self = shift;
1329 $self->_set_info if ($self->{inform}->{size} < 0);
1330 return $self->{inform}->{size};
1331 }
1332
1333
1334 =head2 search
1335
1336 Search documents which match condition
1337
1338 my $nres = $node->search( $cond, $depth );
1339
1340 C<$cond> is C<Search::Estraier::Condition> object, while <$depth> specifies
1341 depth for meta search.
1342
1343 Function results C<Search::Estraier::NodeResult> object.
1344
1345 =cut
1346
1347 sub search {
1348 my $self = shift;
1349 my ($cond, $depth) = @_;
1350 return unless ($cond && defined($depth) && $self->{url});
1351 croak "cond mush be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1352 croak "depth needs number, not '$depth'" unless ($depth =~ m/^\d+$/);
1353
1354 my $resbody;
1355
1356 my $rv = $self->shuttle_url( $self->{url} . '/search',
1357 'application/x-www-form-urlencoded',
1358 $self->cond_to_query( $cond, $depth ),
1359 \$resbody,
1360 );
1361 return if ($rv != 200);
1362
1363 my (@docs, $hints);
1364
1365 my @lines = split(/\n/, $resbody);
1366 return unless (@lines);
1367
1368 my $border = $lines[0];
1369 my $isend = 0;
1370 my $lnum = 1;
1371
1372 while ( $lnum <= $#lines ) {
1373 my $line = $lines[$lnum];
1374 $lnum++;
1375
1376 #warn "## $line\n";
1377 if ($line && $line =~ m/^\Q$border\E(:END)*$/) {
1378 $isend = $1;
1379 last;
1380 }
1381
1382 if ($line =~ /\t/) {
1383 my ($k,$v) = split(/\t/, $line, 2);
1384 $hints->{$k} = $v;
1385 }
1386 }
1387
1388 my $snum = $lnum;
1389
1390 while( ! $isend && $lnum <= $#lines ) {
1391 my $line = $lines[$lnum];
1392 #warn "# $lnum: $line\n";
1393 $lnum++;
1394
1395 if ($line && $line =~ m/^\Q$border\E/) {
1396 if ($lnum > $snum) {
1397 my $rdattrs;
1398 my $rdvector;
1399 my $rdsnippet;
1400
1401 my $rlnum = $snum;
1402 while ($rlnum < $lnum - 1 ) {
1403 #my $rdline = $self->_s($lines[$rlnum]);
1404 my $rdline = $lines[$rlnum];
1405 $rlnum++;
1406 last unless ($rdline);
1407 if ($rdline =~ /^%/) {
1408 $rdvector = $1 if ($rdline =~ /^%VECTOR\t(.+)$/);
1409 } elsif($rdline =~ /=/) {
1410 $rdattrs->{$1} = $2 if ($rdline =~ /^(.+)=(.+)$/);
1411 } else {
1412 confess "invalid format of response";
1413 }
1414 }
1415 while($rlnum < $lnum - 1) {
1416 my $rdline = $lines[$rlnum];
1417 $rlnum++;
1418 $rdsnippet .= "$rdline\n";
1419 }
1420 #warn Dumper($rdvector, $rdattrs, $rdsnippet);
1421 if (my $rduri = $rdattrs->{'@uri'}) {
1422 push @docs, new Search::Estraier::ResultDocument(
1423 uri => $rduri,
1424 attrs => $rdattrs,
1425 snippet => $rdsnippet,
1426 keywords => $rdvector,
1427 );
1428 }
1429 }
1430 $snum = $lnum;
1431 #warn "### $line\n";
1432 $isend = 1 if ($line =~ /:END$/);
1433 }
1434
1435 }
1436
1437 if (! $isend) {
1438 warn "received result doesn't have :END\n$resbody";
1439 return;
1440 }
1441
1442 #warn Dumper(\@docs, $hints);
1443
1444 return new Search::Estraier::NodeResult( docs => \@docs, hints => $hints );
1445 }
1446
1447
1448 =head2 cond_to_query
1449
1450 Return URI encoded string generated from Search::Estraier::Condition
1451
1452 my $args = $node->cond_to_query( $cond, $depth );
1453
1454 =cut
1455
1456 sub cond_to_query {
1457 my $self = shift;
1458
1459 my $cond = shift || return;
1460 croak "condition must be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1461 my $depth = shift;
1462
1463 my @args;
1464
1465 if (my $phrase = $cond->phrase) {
1466 push @args, 'phrase=' . uri_escape($phrase);
1467 }
1468
1469 if (my @attrs = $cond->attrs) {
1470 for my $i ( 0 .. $#attrs ) {
1471 push @args,'attr' . ($i+1) . '=' . uri_escape( $attrs[$i] ) if ($attrs[$i]);
1472 }
1473 }
1474
1475 if (my $order = $cond->order) {
1476 push @args, 'order=' . uri_escape($order);
1477 }
1478
1479 if (my $max = $cond->max) {
1480 push @args, 'max=' . $max;
1481 } else {
1482 push @args, 'max=' . (1 << 30);
1483 }
1484
1485 if (my $options = $cond->options) {
1486 push @args, 'options=' . $options;
1487 }
1488
1489 push @args, 'depth=' . $depth if ($depth);
1490 push @args, 'wwidth=' . $self->{wwidth};
1491 push @args, 'hwidth=' . $self->{hwidth};
1492 push @args, 'awidth=' . $self->{awidth};
1493
1494 return join('&', @args);
1495 }
1496
1497
1498 =head2 shuttle_url
1499
1500 This is method which uses C<LWP::UserAgent> to communicate with Hyper Estraier node
1501 master.
1502
1503 my $rv = shuttle_url( $url, $content_type, $req_body, \$resbody );
1504
1505 C<$resheads> and C<$resbody> booleans controll if response headers and/or response
1506 body will be saved within object.
1507
1508 =cut
1509
1510 use LWP::UserAgent;
1511
1512 sub shuttle_url {
1513 my $self = shift;
1514
1515 my ($url, $content_type, $reqbody, $resbody, $croak_on_error) = @_;
1516
1517 $croak_on_error = $self->{croak_on_error} unless defined($croak_on_error);
1518
1519 $self->{status} = -1;
1520
1521 warn "## $url\n" if ($self->{debug});
1522
1523 $url = new URI($url);
1524 if (
1525 !$url || !$url->scheme || !$url->scheme eq 'http' ||
1526 !$url->host || !$url->port || $url->port < 1
1527 ) {
1528 carp "can't parse $url\n";
1529 return -1;
1530 }
1531
1532 my $ua = LWP::UserAgent->new;
1533 $ua->agent( "Search-Estraier/$Search::Estraier::VERSION" );
1534
1535 my $req;
1536 if ($reqbody) {
1537 $req = HTTP::Request->new(POST => $url);
1538 } else {
1539 $req = HTTP::Request->new(GET => $url);
1540 }
1541
1542 $req->headers->header( 'Host' => $url->host . ":" . $url->port );
1543 $req->headers->header( 'Connection', 'close' );
1544 $req->headers->header( 'Authorization', 'Basic ' . $self->{auth} ) if ($self->{auth});
1545 $req->content_type( $content_type );
1546
1547 warn $req->headers->as_string,"\n" if ($self->{debug});
1548
1549 if ($reqbody) {
1550 warn "$reqbody\n" if ($self->{debug});
1551 $req->content( $reqbody );
1552 }
1553
1554 my $res = $ua->request($req) || croak "can't make request to $url: $!";
1555
1556 warn "## response status: ",$res->status_line,"\n" if ($self->{debug});
1557
1558 ($self->{status}, $self->{status_message}) = split(/\s+/, $res->status_line, 2);
1559
1560 if (! $res->is_success) {
1561 if ($croak_on_error) {
1562 croak("can't get $url: ",$res->status_line);
1563 } else {
1564 return -1;
1565 }
1566 }
1567
1568 $$resbody .= $res->content;
1569
1570 warn "## response body:\n$$resbody\n" if ($resbody && $self->{debug});
1571
1572 return $self->{status};
1573 }
1574
1575
1576 =head2 set_snippet_width
1577
1578 Set width of snippets in results
1579
1580 $node->set_snippet_width( $wwidth, $hwidth, $awidth );
1581
1582 C<$wwidth> specifies whole width of snippet. It's C<480> by default. If it's C<0> snippet
1583 is not sent with results. If it is negative, whole document text is sent instead of snippet.
1584
1585 C<$hwidth> specified width of strings from beginning of string. Default
1586 value is C<96>. Negative or zero value keep previous value.
1587
1588 C<$awidth> specifies width of strings around each highlighted word. It's C<96> by default.
1589 If negative of zero value is provided previous value is kept unchanged.
1590
1591 =cut
1592
1593 sub set_snippet_width {
1594 my $self = shift;
1595
1596 my ($wwidth, $hwidth, $awidth) = @_;
1597 $self->{wwidth} = $wwidth;
1598 $self->{hwidth} = $hwidth if ($hwidth >= 0);
1599 $self->{awidth} = $awidth if ($awidth >= 0);
1600 }
1601
1602
1603 =head2 set_user
1604
1605 Manage users of node
1606
1607 $node->set_user( 'name', $mode );
1608
1609 C<$mode> can be one of:
1610
1611 =over 4
1612
1613 =item 0
1614
1615 delete account
1616
1617 =item 1
1618
1619 set administrative right for user
1620
1621 =item 2
1622
1623 set user account as guest
1624
1625 =back
1626
1627 Return true on success, otherwise false.
1628
1629 =cut
1630
1631 sub set_user {
1632 my $self = shift;
1633 my ($name, $mode) = @_;
1634
1635 return unless ($self->{url});
1636 croak "mode must be number, not '$mode'" unless ($mode =~ m/^\d+$/);
1637
1638 $self->shuttle_url( $self->{url} . '/_set_user',
1639 'text/plain',
1640 'name=' . uri_escape($name) . '&mode=' . $mode,
1641 undef
1642 ) == 200;
1643 }
1644
1645
1646 =head2 set_link
1647
1648 Manage node links
1649
1650 $node->set_link('http://localhost:1978/node/another', 'another node label', $credit);
1651
1652 If C<$credit> is negative, link is removed.
1653
1654 =cut
1655
1656 sub set_link {
1657 my $self = shift;
1658 my ($url, $label, $credit) = @_;
1659
1660 return unless ($self->{url});
1661 croak "mode credit be number, not '$credit'" unless ($credit =~ m/^\d+$/);
1662
1663 my $reqbody = 'url=' . uri_escape($url) . '&label=' . uri_escape($label);
1664 $reqbody .= '&credit=' . $credit if ($credit > 0);
1665
1666 if ($self->shuttle_url( $self->{url} . '/_set_link',
1667 'application/x-www-form-urlencoded',
1668 $reqbody,
1669 undef
1670 ) == 200) {
1671 # refresh node info after adding link
1672 $self->_set_info;
1673 return 1;
1674 }
1675 }
1676
1677 =head2 admins
1678
1679 my @admins = @{ $node->admins };
1680
1681 Return array of users with admin rights on node
1682
1683 =cut
1684
1685 sub admins {
1686 my $self = shift;
1687 $self->_set_info unless ($self->{inform}->{name});
1688 return $self->{inform}->{admins};
1689 }
1690
1691 =head2 guests
1692
1693 my @guests = @{ $node->guests };
1694
1695 Return array of users with guest rights on node
1696
1697 =cut
1698
1699 sub guests {
1700 my $self = shift;
1701 $self->_set_info unless ($self->{inform}->{name});
1702 return $self->{inform}->{guests};
1703 }
1704
1705 =head2 links
1706
1707 my $links = @{ $node->links };
1708
1709 Return array of links for this node
1710
1711 =cut
1712
1713 sub links {
1714 my $self = shift;
1715 $self->_set_info unless ($self->{inform}->{name});
1716 return $self->{inform}->{links};
1717 }
1718
1719
1720 =head1 PRIVATE METHODS
1721
1722 You could call those directly, but you don't have to. I hope.
1723
1724 =head2 _set_info
1725
1726 Set information for node
1727
1728 $node->_set_info;
1729
1730 =cut
1731
1732 sub _set_info {
1733 my $self = shift;
1734
1735 $self->{status} = -1;
1736 return unless ($self->{url});
1737
1738 my $resbody;
1739 my $rv = $self->shuttle_url( $self->{url} . '/inform',
1740 'text/plain',
1741 undef,
1742 \$resbody,
1743 );
1744
1745 return if ($rv != 200 || !$resbody);
1746
1747 my @lines = split(/[\r\n]/,$resbody);
1748
1749 $self->{inform} = {};
1750
1751 ( $self->{inform}->{name}, $self->{inform}->{label}, $self->{inform}->{dnum},
1752 $self->{inform}->{wnum}, $self->{inform}->{size} ) = split(/\t/, shift @lines, 5);
1753
1754 return $resbody unless (@lines);
1755
1756 shift @lines;
1757
1758 while(my $admin = shift @lines) {
1759 push @{$self->{inform}->{admins}}, $admin;
1760 }
1761
1762 while(my $guest = shift @lines) {
1763 push @{$self->{inform}->{guests}}, $guest;
1764 }
1765
1766 while(my $link = shift @lines) {
1767 push @{$self->{inform}->{links}}, $link;
1768 }
1769
1770 return $resbody;
1771
1772 }
1773
1774 ###
1775
1776 =head1 EXPORT
1777
1778 Nothing.
1779
1780 =head1 SEE ALSO
1781
1782 L<http://hyperestraier.sourceforge.net/>
1783
1784 Hyper Estraier Ruby interface on which this module is based.
1785
1786 =head1 AUTHOR
1787
1788 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
1789
1790
1791 =head1 COPYRIGHT AND LICENSE
1792
1793 Copyright (C) 2005-2006 by Dobrica Pavlinusic
1794
1795 This library is free software; you can redistribute it and/or modify
1796 it under the GPL v2 or later.
1797
1798 =cut
1799
1800 1;

  ViewVC Help
Powered by ViewVC 1.1.26