/[Search-Estraier]/trunk/lib/Search/Estraier.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/Search/Estraier.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 102 - (show annotations)
Sat Jan 28 19:46:20 2006 UTC (18 years, 2 months ago) by dpavlin
Original Path: trunk/Estraier.pm
File size: 30456 byte(s)
more documentation update
1 package Search::Estraier;
2
3 use 5.008;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.04_1';
8
9 =head1 NAME
10
11 Search::Estraier - pure perl module to use Hyper Estraier search engine
12
13 =head1 SYNOPSIS
14
15 =head2 Simple indexer
16
17 use Search::Estraier;
18
19 # create and configure node
20 my $node = new Search::Estraier::Node(
21 url => 'http://localhost:1978/node/test',
22 user => 'admin',
23 passwd => 'admin'
24 );
25
26 # create document
27 my $doc = new Search::Estraier::Document;
28
29 # add attributes
30 $doc->add_attr('@uri', "http://estraier.gov/example.txt");
31 $doc->add_attr('@title', "Over the Rainbow");
32
33 # add body text to document
34 $doc->add_text("Somewhere over the rainbow. Way up high.");
35 $doc->add_text("There's a land that I heard of once in a lullaby.");
36
37 die "error: ", $node->status,"\n" unless (eval { $node->put_doc($doc) });
38
39 =head2 Simple searcher
40
41 use Search::Estraier;
42
43 # create and configure node
44 my $node = new Search::Estraier::Node(
45 url => 'http://localhost:1978/node/test',
46 user => 'admin',
47 passwd => 'admin',
48 croak_on_error => 1,
49 );
50
51 # create condition
52 my $cond = new Search::Estraier::Condition;
53
54 # set search phrase
55 $cond->set_phrase("rainbow AND lullaby");
56
57 my $nres = $node->search($cond, 0);
58
59 if (defined($nres)) {
60 print "Got ", $nres->hits, " results\n";
61
62 # for each document in results
63 for my $i ( 0 ... $nres->doc_num - 1 ) {
64 # get result document
65 my $rdoc = $nres->get_doc($i);
66 # display attribte
67 print "URI: ", $rdoc->attr('@uri'),"\n";
68 print "Title: ", $rdoc->attr('@title'),"\n";
69 print $rdoc->snippet,"\n";
70 }
71 } else {
72 die "error: ", $node->status,"\n";
73 }
74
75 =head1 DESCRIPTION
76
77 This module is implementation of node API of Hyper Estraier. Since it's
78 perl-only module with dependencies only on standard perl modules, it will
79 run on all platforms on which perl runs. It doesn't require compilation
80 or Hyper Estraier development files on target machine.
81
82 It is implemented as multiple packages which closly resamble Ruby
83 implementation. It also includes methods to manage nodes.
84
85 There are few examples in C<scripts> directory of this distribution.
86
87 =cut
88
89 =head1 Inheritable common methods
90
91 This methods should really move somewhere else.
92
93 =head2 _s
94
95 Remove multiple whitespaces from string, as well as whitespaces at beginning or end
96
97 my $text = $self->_s(" this is a text ");
98 $text = 'this is a text';
99
100 =cut
101
102 sub _s {
103 my $text = $_[1];
104 return unless defined($text);
105 $text =~ s/\s\s+/ /gs;
106 $text =~ s/^\s+//;
107 $text =~ s/\s+$//;
108 return $text;
109 }
110
111 package Search::Estraier::Document;
112
113 use Carp qw/croak confess/;
114
115 use Search::Estraier;
116 our @ISA = qw/Search::Estraier/;
117
118 =head1 Search::Estraier::Document
119
120 This class implements Document which is collection of attributes
121 (key=value), vectors (also key value) display text and hidden text.
122
123
124 =head2 new
125
126 Create new document, empty or from draft.
127
128 my $doc = new Search::HyperEstraier::Document;
129 my $doc2 = new Search::HyperEstraier::Document( $draft );
130
131 =cut
132
133 sub new {
134 my $class = shift;
135 my $self = {};
136 bless($self, $class);
137
138 $self->{id} = -1;
139
140 my $draft = shift;
141
142 if ($draft) {
143 my $in_text = 0;
144 foreach my $line (split(/\n/, $draft)) {
145
146 if ($in_text) {
147 if ($line =~ /^\t/) {
148 push @{ $self->{htexts} }, substr($line, 1);
149 } else {
150 push @{ $self->{dtexts} }, $line;
151 }
152 next;
153 }
154
155 if ($line =~ m/^%VECTOR\t(.+)$/) {
156 my @fields = split(/\t/, $1);
157 for my $i ( 0 .. ($#fields - 1) ) {
158 $self->{kwords}->{ $fields[ $i ] } = $fields[ $i + 1 ];
159 $i++;
160 }
161 next;
162 } elsif ($line =~ m/^%/) {
163 # What is this? comment?
164 #warn "$line\n";
165 next;
166 } elsif ($line =~ m/^$/) {
167 $in_text = 1;
168 next;
169 } elsif ($line =~ m/^(.+)=(.*)$/) {
170 $self->{attrs}->{ $1 } = $2;
171 next;
172 }
173
174 warn "draft ignored: '$line'\n";
175 }
176 }
177
178 $self ? return $self : return undef;
179 }
180
181
182 =head2 add_attr
183
184 Add an attribute.
185
186 $doc->add_attr( name => 'value' );
187
188 Delete attribute using
189
190 $doc->add_attr( name => undef );
191
192 =cut
193
194 sub add_attr {
195 my $self = shift;
196 my $attrs = {@_};
197
198 while (my ($name, $value) = each %{ $attrs }) {
199 if (! defined($value)) {
200 delete( $self->{attrs}->{ $self->_s($name) } );
201 } else {
202 $self->{attrs}->{ $self->_s($name) } = $self->_s($value);
203 }
204 }
205
206 return 1;
207 }
208
209
210 =head2 add_text
211
212 Add a sentence of text.
213
214 $doc->add_text('this is example text to display');
215
216 =cut
217
218 sub add_text {
219 my $self = shift;
220 my $text = shift;
221 return unless defined($text);
222
223 push @{ $self->{dtexts} }, $self->_s($text);
224 }
225
226
227 =head2 add_hidden_text
228
229 Add a hidden sentence.
230
231 $doc->add_hidden_text('this is example text just for search');
232
233 =cut
234
235 sub add_hidden_text {
236 my $self = shift;
237 my $text = shift;
238 return unless defined($text);
239
240 push @{ $self->{htexts} }, $self->_s($text);
241 }
242
243
244 =head2 id
245
246 Get the ID number of document. If the object has never been registred, C<-1> is returned.
247
248 print $doc->id;
249
250 =cut
251
252 sub id {
253 my $self = shift;
254 return $self->{id};
255 }
256
257
258 =head2 attr_names
259
260 Returns array with attribute names from document object.
261
262 my @attrs = $doc->attr_names;
263
264 =cut
265
266 sub attr_names {
267 my $self = shift;
268 return unless ($self->{attrs});
269 #croak "attr_names return array, not scalar" if (! wantarray);
270 return sort keys %{ $self->{attrs} };
271 }
272
273
274 =head2 attr
275
276 Returns value of an attribute.
277
278 my $value = $doc->attr( 'attribute' );
279
280 =cut
281
282 sub attr {
283 my $self = shift;
284 my $name = shift;
285 return unless (defined($name) && $self->{attrs});
286 return $self->{attrs}->{ $name };
287 }
288
289
290 =head2 texts
291
292 Returns array with text sentences.
293
294 my @texts = $doc->texts;
295
296 =cut
297
298 sub texts {
299 my $self = shift;
300 #confess "texts return array, not scalar" if (! wantarray);
301 return @{ $self->{dtexts} } if ($self->{dtexts});
302 }
303
304
305 =head2 cat_texts
306
307 Return whole text as single scalar.
308
309 my $text = $doc->cat_texts;
310
311 =cut
312
313 sub cat_texts {
314 my $self = shift;
315 return join(' ',@{ $self->{dtexts} }) if ($self->{dtexts});
316 }
317
318
319 =head2 dump_draft
320
321 Dump draft data from document object.
322
323 print $doc->dump_draft;
324
325 =cut
326
327 sub dump_draft {
328 my $self = shift;
329 my $draft;
330
331 foreach my $attr_name (sort keys %{ $self->{attrs} }) {
332 next unless defined(my $v = $self->{attrs}->{$attr_name});
333 $draft .= $attr_name . '=' . $v . "\n";
334 }
335
336 if ($self->{kwords}) {
337 $draft .= '%%VECTOR';
338 while (my ($key, $value) = each %{ $self->{kwords} }) {
339 $draft .= "\t$key\t$value";
340 }
341 $draft .= "\n";
342 }
343
344 $draft .= "\n";
345
346 $draft .= join("\n", @{ $self->{dtexts} }) . "\n" if ($self->{dtexts});
347 $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n" if ($self->{htexts});
348
349 return $draft;
350 }
351
352
353 =head2 delete
354
355 Empty document object
356
357 $doc->delete;
358
359 This function is addition to original Ruby API, and since it was included in C wrappers it's here as a
360 convinience. Document objects which go out of scope will be destroyed
361 automatically.
362
363 =cut
364
365 sub delete {
366 my $self = shift;
367
368 foreach my $data (qw/attrs dtexts stexts kwords/) {
369 delete($self->{$data});
370 }
371
372 $self->{id} = -1;
373
374 return 1;
375 }
376
377
378
379 package Search::Estraier::Condition;
380
381 use Carp qw/carp confess croak/;
382
383 use Search::Estraier;
384 our @ISA = qw/Search::Estraier/;
385
386 =head1 Search::Estraier::Condition
387
388 =head2 new
389
390 my $cond = new Search::HyperEstraier::Condition;
391
392 =cut
393
394 sub new {
395 my $class = shift;
396 my $self = {};
397 bless($self, $class);
398
399 $self->{max} = -1;
400 $self->{options} = 0;
401
402 $self ? return $self : return undef;
403 }
404
405
406 =head2 set_phrase
407
408 $cond->set_phrase('search phrase');
409
410 =cut
411
412 sub set_phrase {
413 my $self = shift;
414 $self->{phrase} = $self->_s( shift );
415 }
416
417
418 =head2 add_attr
419
420 $cond->add_attr('@URI STRINC /~dpavlin/');
421
422 =cut
423
424 sub add_attr {
425 my $self = shift;
426 my $attr = shift || return;
427 push @{ $self->{attrs} }, $self->_s( $attr );
428 }
429
430
431 =head2 set_order
432
433 $cond->set_order('@mdate NUMD');
434
435 =cut
436
437 sub set_order {
438 my $self = shift;
439 $self->{order} = shift;
440 }
441
442
443 =head2 set_max
444
445 $cond->set_max(42);
446
447 =cut
448
449 sub set_max {
450 my $self = shift;
451 my $max = shift;
452 croak "set_max needs number, not '$max'" unless ($max =~ m/^\d+$/);
453 $self->{max} = $max;
454 }
455
456
457 =head2 set_options
458
459 $cond->set_options( 'SURE' );
460
461 $cond->set_options( qw/AGITO NOIDF SIMPLE/ );
462
463 Possible options are:
464
465 =over 8
466
467 =item SURE
468
469 check every N-gram
470
471 =item USUAL
472
473 check every second N-gram
474
475 =item FAST
476
477 check every third N-gram
478
479 =item AGITO
480
481 check every fourth N-gram
482
483 =item NOIDF
484
485 don't perform TF-IDF tuning
486
487 =item SIMPLE
488
489 use simplified query phrase
490
491 =back
492
493 Skipping N-grams will speed up search, but reduce accuracy. Every call to C<set_options> will reset previous
494 options;
495
496 This option changed in version C<0.04> of this module. It's backwards compatibile.
497
498 =cut
499
500 my $options = {
501 SURE => 1 << 0,
502 USUAL => 1 << 1,
503 FAST => 1 << 2,
504 AGITO => 1 << 3,
505 NOIDF => 1 << 4,
506 SIMPLE => 1 << 10,
507 };
508
509 sub set_options {
510 my $self = shift;
511 my $opt = 0;
512 foreach my $option (@_) {
513 my $mask;
514 unless ($mask = $options->{$option}) {
515 if ($option eq '1') {
516 next;
517 } else {
518 croak "unknown option $option";
519 }
520 }
521 $opt += $mask;
522 }
523 $self->{options} = $opt;
524 }
525
526
527 =head2 phrase
528
529 Return search phrase.
530
531 print $cond->phrase;
532
533 =cut
534
535 sub phrase {
536 my $self = shift;
537 return $self->{phrase};
538 }
539
540
541 =head2 order
542
543 Return search result order.
544
545 print $cond->order;
546
547 =cut
548
549 sub order {
550 my $self = shift;
551 return $self->{order};
552 }
553
554
555 =head2 attrs
556
557 Return search result attrs.
558
559 my @cond_attrs = $cond->attrs;
560
561 =cut
562
563 sub attrs {
564 my $self = shift;
565 #croak "attrs return array, not scalar" if (! wantarray);
566 return @{ $self->{attrs} } if ($self->{attrs});
567 }
568
569
570 =head2 max
571
572 Return maximum number of results.
573
574 print $cond->max;
575
576 C<-1> is returned for unitialized value, C<0> is unlimited.
577
578 =cut
579
580 sub max {
581 my $self = shift;
582 return $self->{max};
583 }
584
585
586 =head2 options
587
588 Return options for this condition.
589
590 print $cond->options;
591
592 Options are returned in numerical form.
593
594 =cut
595
596 sub options {
597 my $self = shift;
598 return $self->{options};
599 }
600
601
602 package Search::Estraier::ResultDocument;
603
604 use Carp qw/croak/;
605
606 #use Search::Estraier;
607 #our @ISA = qw/Search::Estraier/;
608
609 =head1 Search::Estraier::ResultDocument
610
611 =head2 new
612
613 my $rdoc = new Search::HyperEstraier::ResultDocument(
614 uri => 'http://localhost/document/uri/42',
615 attrs => {
616 foo => 1,
617 bar => 2,
618 },
619 snippet => 'this is a text of snippet'
620 keywords => 'this\tare\tkeywords'
621 );
622
623 =cut
624
625 sub new {
626 my $class = shift;
627 my $self = {@_};
628 bless($self, $class);
629
630 croak "missing uri for ResultDocument" unless defined($self->{uri});
631
632 $self ? return $self : return undef;
633 }
634
635
636 =head2 uri
637
638 Return URI of result document
639
640 print $rdoc->uri;
641
642 =cut
643
644 sub uri {
645 my $self = shift;
646 return $self->{uri};
647 }
648
649
650 =head2 attr_names
651
652 Returns array with attribute names from result document object.
653
654 my @attrs = $rdoc->attr_names;
655
656 =cut
657
658 sub attr_names {
659 my $self = shift;
660 croak "attr_names return array, not scalar" if (! wantarray);
661 return sort keys %{ $self->{attrs} };
662 }
663
664
665 =head2 attr
666
667 Returns value of an attribute.
668
669 my $value = $rdoc->attr( 'attribute' );
670
671 =cut
672
673 sub attr {
674 my $self = shift;
675 my $name = shift || return;
676 return $self->{attrs}->{ $name };
677 }
678
679
680 =head2 snippet
681
682 Return snippet from result document
683
684 print $rdoc->snippet;
685
686 =cut
687
688 sub snippet {
689 my $self = shift;
690 return $self->{snippet};
691 }
692
693
694 =head2 keywords
695
696 Return keywords from result document
697
698 print $rdoc->keywords;
699
700 =cut
701
702 sub keywords {
703 my $self = shift;
704 return $self->{keywords};
705 }
706
707
708 package Search::Estraier::NodeResult;
709
710 use Carp qw/croak/;
711
712 #use Search::Estraier;
713 #our @ISA = qw/Search::Estraier/;
714
715 =head1 Search::Estraier::NodeResult
716
717 =head2 new
718
719 my $res = new Search::HyperEstraier::NodeResult(
720 docs => @array_of_rdocs,
721 hits => %hash_with_hints,
722 );
723
724 =cut
725
726 sub new {
727 my $class = shift;
728 my $self = {@_};
729 bless($self, $class);
730
731 foreach my $f (qw/docs hints/) {
732 croak "missing $f for ResultDocument" unless defined($self->{$f});
733 }
734
735 $self ? return $self : return undef;
736 }
737
738
739 =head2 doc_num
740
741 Return number of documents
742
743 print $res->doc_num;
744
745 This will return real number of documents (limited by C<max>).
746 If you want to get total number of hits, see C<hits>.
747
748 =cut
749
750 sub doc_num {
751 my $self = shift;
752 return $#{$self->{docs}} + 1;
753 }
754
755
756 =head2 get_doc
757
758 Return single document
759
760 my $doc = $res->get_doc( 42 );
761
762 Returns undef if document doesn't exist.
763
764 =cut
765
766 sub get_doc {
767 my $self = shift;
768 my $num = shift;
769 croak "expect number as argument, not '$num'" unless ($num =~ m/^\d+$/);
770 return undef if ($num < 0 || $num > $self->{docs});
771 return $self->{docs}->[$num];
772 }
773
774
775 =head2 hint
776
777 Return specific hint from results.
778
779 print $res->hint( 'VERSION' );
780
781 Possible hints are: C<VERSION>, C<NODE>, C<HIT>, C<HINT#n>, C<DOCNUM>, C<WORDNUM>,
782 C<TIME>, C<LINK#n>, C<VIEW>.
783
784 =cut
785
786 sub hint {
787 my $self = shift;
788 my $key = shift || return;
789 return $self->{hints}->{$key};
790 }
791
792 =head2 hits
793
794 More perlish version of C<hint>. This one returns hash.
795
796 my %hints = $res->hints;
797
798 =cut
799
800 sub hints {
801 my $self = shift;
802 return $self->{hints};
803 }
804
805 =head2 hits
806
807 Syntaxtic sugar for total number of hits for this query
808
809 print $res->hits;
810
811 It's same as
812
813 print $res->hint('HIT');
814
815 but shorter.
816
817 =cut
818
819 sub hits {
820 my $self = shift;
821 return $self->{hints}->{'HIT'} || 0;
822 }
823
824 package Search::Estraier::Node;
825
826 use Carp qw/carp croak confess/;
827 use URI;
828 use MIME::Base64;
829 use IO::Socket::INET;
830 use URI::Escape qw/uri_escape/;
831
832 =head1 Search::Estraier::Node
833
834 =head2 new
835
836 my $node = new Search::HyperEstraier::Node;
837
838 or optionally with C<url> as parametar
839
840 my $node = new Search::HyperEstraier::Node( 'http://localhost:1978/node/test' );
841
842 or in more verbose form
843
844 my $node = new Search::HyperEstraier::Node(
845 url => 'http://localhost:1978/node/test',
846 debug => 1,
847 croak_on_error => 1
848 );
849
850 with following arguments:
851
852 =over 4
853
854 =item url
855
856 URL to node
857
858 =item debug
859
860 dumps a B<lot> of debugging output
861
862 =item croak_on_error
863
864 very helpful during development. It will croak on all errors instead of
865 silently returning C<-1> (which is convention of Hyper Estraier API in other
866 languages).
867
868 =back
869
870 =cut
871
872 sub new {
873 my $class = shift;
874 my $self = {
875 pxport => -1,
876 timeout => 0, # this used to be -1
877 dnum => -1,
878 wnum => -1,
879 size => -1.0,
880 wwidth => 480,
881 hwidth => 96,
882 awidth => 96,
883 status => -1,
884 };
885 bless($self, $class);
886
887 if ($#_ == 0) {
888 $self->{url} = shift;
889 } else {
890 my $args = {@_};
891
892 %$self = ( %$self, @_ );
893
894 warn "## Node debug on\n" if ($self->{debug});
895 }
896
897 $self ? return $self : return undef;
898 }
899
900
901 =head2 set_url
902
903 Specify URL to node server
904
905 $node->set_url('http://localhost:1978');
906
907 =cut
908
909 sub set_url {
910 my $self = shift;
911 $self->{url} = shift;
912 }
913
914
915 =head2 set_proxy
916
917 Specify proxy server to connect to node server
918
919 $node->set_proxy('proxy.example.com', 8080);
920
921 =cut
922
923 sub set_proxy {
924 my $self = shift;
925 my ($host,$port) = @_;
926 croak "proxy port must be number, not '$port'" unless ($port =~ m/^\d+$/);
927 $self->{pxhost} = $host;
928 $self->{pxport} = $port;
929 }
930
931
932 =head2 set_timeout
933
934 Specify timeout of connection in seconds
935
936 $node->set_timeout( 15 );
937
938 =cut
939
940 sub set_timeout {
941 my $self = shift;
942 my $sec = shift;
943 croak "timeout must be number, not '$sec'" unless ($sec =~ m/^\d+$/);
944 $self->{timeout} = $sec;
945 }
946
947
948 =head2 set_auth
949
950 Specify name and password for authentication to node server.
951
952 $node->set_auth('clint','eastwood');
953
954 =cut
955
956 sub set_auth {
957 my $self = shift;
958 my ($login,$passwd) = @_;
959 my $basic_auth = encode_base64( "$login:$passwd" );
960 chomp($basic_auth);
961 $self->{auth} = $basic_auth;
962 }
963
964
965 =head2 status
966
967 Return status code of last request.
968
969 print $node->status;
970
971 C<-1> means connection failure.
972
973 =cut
974
975 sub status {
976 my $self = shift;
977 return $self->{status};
978 }
979
980
981 =head2 put_doc
982
983 Add a document
984
985 $node->put_doc( $document_draft ) or die "can't add document";
986
987 Return true on success or false on failture.
988
989 =cut
990
991 sub put_doc {
992 my $self = shift;
993 my $doc = shift || return;
994 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
995 $self->shuttle_url( $self->{url} . '/put_doc',
996 'text/x-estraier-draft',
997 $doc->dump_draft,
998 undef
999 ) == 200;
1000 }
1001
1002
1003 =head2 out_doc
1004
1005 Remove a document
1006
1007 $node->out_doc( document_id ) or "can't remove document";
1008
1009 Return true on success or false on failture.
1010
1011 =cut
1012
1013 sub out_doc {
1014 my $self = shift;
1015 my $id = shift || return;
1016 return unless ($self->{url});
1017 croak "id must be number, not '$id'" unless ($id =~ m/^\d+$/);
1018 $self->shuttle_url( $self->{url} . '/out_doc',
1019 'application/x-www-form-urlencoded',
1020 "id=$id",
1021 undef
1022 ) == 200;
1023 }
1024
1025
1026 =head2 out_doc_by_uri
1027
1028 Remove a registrated document using it's uri
1029
1030 $node->out_doc_by_uri( 'file:///document/uri/42' ) or "can't remove document";
1031
1032 Return true on success or false on failture.
1033
1034 =cut
1035
1036 sub out_doc_by_uri {
1037 my $self = shift;
1038 my $uri = shift || return;
1039 return unless ($self->{url});
1040 $self->shuttle_url( $self->{url} . '/out_doc',
1041 'application/x-www-form-urlencoded',
1042 "uri=" . uri_escape($uri),
1043 undef
1044 ) == 200;
1045 }
1046
1047
1048 =head2 edit_doc
1049
1050 Edit attributes of a document
1051
1052 $node->edit_doc( $document_draft ) or die "can't edit document";
1053
1054 Return true on success or false on failture.
1055
1056 =cut
1057
1058 sub edit_doc {
1059 my $self = shift;
1060 my $doc = shift || return;
1061 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
1062 $self->shuttle_url( $self->{url} . '/edit_doc',
1063 'text/x-estraier-draft',
1064 $doc->dump_draft,
1065 undef
1066 ) == 200;
1067 }
1068
1069
1070 =head2 get_doc
1071
1072 Retreive document
1073
1074 my $doc = $node->get_doc( document_id ) or die "can't get document";
1075
1076 Return true on success or false on failture.
1077
1078 =cut
1079
1080 sub get_doc {
1081 my $self = shift;
1082 my $id = shift || return;
1083 return $self->_fetch_doc( id => $id );
1084 }
1085
1086
1087 =head2 get_doc_by_uri
1088
1089 Retreive document
1090
1091 my $doc = $node->get_doc_by_uri( 'file:///document/uri/42' ) or die "can't get document";
1092
1093 Return true on success or false on failture.
1094
1095 =cut
1096
1097 sub get_doc_by_uri {
1098 my $self = shift;
1099 my $uri = shift || return;
1100 return $self->_fetch_doc( uri => $uri );
1101 }
1102
1103
1104 =head2 get_doc_attr
1105
1106 Retrieve the value of an atribute from object
1107
1108 my $val = $node->get_doc_attr( document_id, 'attribute_name' ) or
1109 die "can't get document attribute";
1110
1111 =cut
1112
1113 sub get_doc_attr {
1114 my $self = shift;
1115 my ($id,$name) = @_;
1116 return unless ($id && $name);
1117 return $self->_fetch_doc( id => $id, attr => $name );
1118 }
1119
1120
1121 =head2 get_doc_attr_by_uri
1122
1123 Retrieve the value of an atribute from object
1124
1125 my $val = $node->get_doc_attr_by_uri( document_id, 'attribute_name' ) or
1126 die "can't get document attribute";
1127
1128 =cut
1129
1130 sub get_doc_attr_by_uri {
1131 my $self = shift;
1132 my ($uri,$name) = @_;
1133 return unless ($uri && $name);
1134 return $self->_fetch_doc( uri => $uri, attr => $name );
1135 }
1136
1137
1138 =head2 etch_doc
1139
1140 Exctract document keywords
1141
1142 my $keywords = $node->etch_doc( document_id ) or die "can't etch document";
1143
1144 =cut
1145
1146 sub etch_doc {
1147 my $self = shift;
1148 my $id = shift || return;
1149 return $self->_fetch_doc( id => $id, etch => 1 );
1150 }
1151
1152 =head2 etch_doc_by_uri
1153
1154 Retreive document
1155
1156 my $keywords = $node->etch_doc_by_uri( 'file:///document/uri/42' ) or die "can't etch document";
1157
1158 Return true on success or false on failture.
1159
1160 =cut
1161
1162 sub etch_doc_by_uri {
1163 my $self = shift;
1164 my $uri = shift || return;
1165 return $self->_fetch_doc( uri => $uri, etch => 1 );
1166 }
1167
1168
1169 =head2 uri_to_id
1170
1171 Get ID of document specified by URI
1172
1173 my $id = $node->uri_to_id( 'file:///document/uri/42' );
1174
1175 =cut
1176
1177 sub uri_to_id {
1178 my $self = shift;
1179 my $uri = shift || return;
1180 return $self->_fetch_doc( uri => $uri, path => '/uri_to_id', chomp_resbody => 1 );
1181 }
1182
1183
1184 =head2 _fetch_doc
1185
1186 Private function used for implementing of C<get_doc>, C<get_doc_by_uri>,
1187 C<etch_doc>, C<etch_doc_by_uri>.
1188
1189 # this will decode received draft into Search::Estraier::Document object
1190 my $doc = $node->_fetch_doc( id => 42 );
1191 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42' );
1192
1193 # to extract keywords, add etch
1194 my $doc = $node->_fetch_doc( id => 42, etch => 1 );
1195 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', etch => 1 );
1196
1197 # to get document attrubute add attr
1198 my $doc = $node->_fetch_doc( id => 42, attr => '@mdate' );
1199 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', attr => '@mdate' );
1200
1201 # more general form which allows implementation of
1202 # uri_to_id
1203 my $id = $node->_fetch_doc(
1204 uri => 'file:///document/uri/42',
1205 path => '/uri_to_id',
1206 chomp_resbody => 1
1207 );
1208
1209 =cut
1210
1211 sub _fetch_doc {
1212 my $self = shift;
1213 my $a = {@_};
1214 return unless ( ($a->{id} || $a->{uri}) && $self->{url} );
1215
1216 my ($arg, $resbody);
1217
1218 my $path = $a->{path} || '/get_doc';
1219 $path = '/etch_doc' if ($a->{etch});
1220
1221 if ($a->{id}) {
1222 croak "id must be numberm not '$a->{id}'" unless ($a->{id} =~ m/^\d+$/);
1223 $arg = 'id=' . $a->{id};
1224 } elsif ($a->{uri}) {
1225 $arg = 'uri=' . uri_escape($a->{uri});
1226 } else {
1227 confess "unhandled argument. Need id or uri.";
1228 }
1229
1230 if ($a->{attr}) {
1231 $path = '/get_doc_attr';
1232 $arg .= '&attr=' . uri_escape($a->{attr});
1233 $a->{chomp_resbody} = 1;
1234 }
1235
1236 my $rv = $self->shuttle_url( $self->{url} . $path,
1237 'application/x-www-form-urlencoded',
1238 $arg,
1239 \$resbody,
1240 );
1241
1242 return if ($rv != 200);
1243
1244 if ($a->{etch}) {
1245 $self->{kwords} = {};
1246 return +{} unless ($resbody);
1247 foreach my $l (split(/\n/, $resbody)) {
1248 my ($k,$v) = split(/\t/, $l, 2);
1249 $self->{kwords}->{$k} = $v if ($v);
1250 }
1251 return $self->{kwords};
1252 } elsif ($a->{chomp_resbody}) {
1253 return unless (defined($resbody));
1254 chomp($resbody);
1255 return $resbody;
1256 } else {
1257 return new Search::Estraier::Document($resbody);
1258 }
1259 }
1260
1261
1262 =head2 name
1263
1264 my $node_name = $node->name;
1265
1266 =cut
1267
1268 sub name {
1269 my $self = shift;
1270 $self->_set_info unless ($self->{name});
1271 return $self->{name};
1272 }
1273
1274
1275 =head2 label
1276
1277 my $node_label = $node->label;
1278
1279 =cut
1280
1281 sub label {
1282 my $self = shift;
1283 $self->_set_info unless ($self->{label});
1284 return $self->{label};
1285 }
1286
1287
1288 =head2 doc_num
1289
1290 my $documents_in_node = $node->doc_num;
1291
1292 =cut
1293
1294 sub doc_num {
1295 my $self = shift;
1296 $self->_set_info if ($self->{dnum} < 0);
1297 return $self->{dnum};
1298 }
1299
1300
1301 =head2 word_num
1302
1303 my $words_in_node = $node->word_num;
1304
1305 =cut
1306
1307 sub word_num {
1308 my $self = shift;
1309 $self->_set_info if ($self->{wnum} < 0);
1310 return $self->{wnum};
1311 }
1312
1313
1314 =head2 size
1315
1316 my $node_size = $node->size;
1317
1318 =cut
1319
1320 sub size {
1321 my $self = shift;
1322 $self->_set_info if ($self->{size} < 0);
1323 return $self->{size};
1324 }
1325
1326
1327 =head2 search
1328
1329 Search documents which match condition
1330
1331 my $nres = $node->search( $cond, $depth );
1332
1333 C<$cond> is C<Search::Estraier::Condition> object, while <$depth> specifies
1334 depth for meta search.
1335
1336 Function results C<Search::Estraier::NodeResult> object.
1337
1338 =cut
1339
1340 sub search {
1341 my $self = shift;
1342 my ($cond, $depth) = @_;
1343 return unless ($cond && defined($depth) && $self->{url});
1344 croak "cond mush be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1345 croak "depth needs number, not '$depth'" unless ($depth =~ m/^\d+$/);
1346
1347 my $resbody;
1348
1349 my $rv = $self->shuttle_url( $self->{url} . '/search',
1350 'application/x-www-form-urlencoded',
1351 $self->cond_to_query( $cond, $depth ),
1352 \$resbody,
1353 );
1354 return if ($rv != 200);
1355
1356 my (@docs, $hints);
1357
1358 my @lines = split(/\n/, $resbody);
1359 return unless (@lines);
1360
1361 my $border = $lines[0];
1362 my $isend = 0;
1363 my $lnum = 1;
1364
1365 while ( $lnum <= $#lines ) {
1366 my $line = $lines[$lnum];
1367 $lnum++;
1368
1369 #warn "## $line\n";
1370 if ($line && $line =~ m/^\Q$border\E(:END)*$/) {
1371 $isend = $1;
1372 last;
1373 }
1374
1375 if ($line =~ /\t/) {
1376 my ($k,$v) = split(/\t/, $line, 2);
1377 $hints->{$k} = $v;
1378 }
1379 }
1380
1381 my $snum = $lnum;
1382
1383 while( ! $isend && $lnum <= $#lines ) {
1384 my $line = $lines[$lnum];
1385 #warn "# $lnum: $line\n";
1386 $lnum++;
1387
1388 if ($line && $line =~ m/^\Q$border\E/) {
1389 if ($lnum > $snum) {
1390 my $rdattrs;
1391 my $rdvector;
1392 my $rdsnippet;
1393
1394 my $rlnum = $snum;
1395 while ($rlnum < $lnum - 1 ) {
1396 #my $rdline = $self->_s($lines[$rlnum]);
1397 my $rdline = $lines[$rlnum];
1398 $rlnum++;
1399 last unless ($rdline);
1400 if ($rdline =~ /^%/) {
1401 $rdvector = $1 if ($rdline =~ /^%VECTOR\t(.+)$/);
1402 } elsif($rdline =~ /=/) {
1403 $rdattrs->{$1} = $2 if ($rdline =~ /^(.+)=(.+)$/);
1404 } else {
1405 confess "invalid format of response";
1406 }
1407 }
1408 while($rlnum < $lnum - 1) {
1409 my $rdline = $lines[$rlnum];
1410 $rlnum++;
1411 $rdsnippet .= "$rdline\n";
1412 }
1413 #warn Dumper($rdvector, $rdattrs, $rdsnippet);
1414 if (my $rduri = $rdattrs->{'@uri'}) {
1415 push @docs, new Search::Estraier::ResultDocument(
1416 uri => $rduri,
1417 attrs => $rdattrs,
1418 snippet => $rdsnippet,
1419 keywords => $rdvector,
1420 );
1421 }
1422 }
1423 $snum = $lnum;
1424 #warn "### $line\n";
1425 $isend = 1 if ($line =~ /:END$/);
1426 }
1427
1428 }
1429
1430 if (! $isend) {
1431 warn "received result doesn't have :END\n$resbody";
1432 return;
1433 }
1434
1435 #warn Dumper(\@docs, $hints);
1436
1437 return new Search::Estraier::NodeResult( docs => \@docs, hints => $hints );
1438 }
1439
1440
1441 =head2 cond_to_query
1442
1443 Return URI encoded string generated from Search::Estraier::Condition
1444
1445 my $args = $node->cond_to_query( $cond, $depth );
1446
1447 =cut
1448
1449 sub cond_to_query {
1450 my $self = shift;
1451
1452 my $cond = shift || return;
1453 croak "condition must be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1454 my $depth = shift;
1455
1456 my @args;
1457
1458 if (my $phrase = $cond->phrase) {
1459 push @args, 'phrase=' . uri_escape($phrase);
1460 }
1461
1462 if (my @attrs = $cond->attrs) {
1463 for my $i ( 0 .. $#attrs ) {
1464 push @args,'attr' . ($i+1) . '=' . uri_escape( $attrs[$i] ) if ($attrs[$i]);
1465 }
1466 }
1467
1468 if (my $order = $cond->order) {
1469 push @args, 'order=' . uri_escape($order);
1470 }
1471
1472 if (my $max = $cond->max) {
1473 push @args, 'max=' . $max;
1474 } else {
1475 push @args, 'max=' . (1 << 30);
1476 }
1477
1478 if (my $options = $cond->options) {
1479 push @args, 'options=' . $options;
1480 }
1481
1482 push @args, 'depth=' . $depth if ($depth);
1483 push @args, 'wwidth=' . $self->{wwidth};
1484 push @args, 'hwidth=' . $self->{hwidth};
1485 push @args, 'awidth=' . $self->{awidth};
1486
1487 return join('&', @args);
1488 }
1489
1490
1491 =head2 shuttle_url
1492
1493 This is method which uses C<LWP::UserAgent> to communicate with Hyper Estraier node
1494 master.
1495
1496 my $rv = shuttle_url( $url, $content_type, $req_body, \$resbody );
1497
1498 C<$resheads> and C<$resbody> booleans controll if response headers and/or response
1499 body will be saved within object.
1500
1501 =cut
1502
1503 use LWP::UserAgent;
1504
1505 sub shuttle_url {
1506 my $self = shift;
1507
1508 my ($url, $content_type, $reqbody, $resbody) = @_;
1509
1510 $self->{status} = -1;
1511
1512 warn "## $url\n" if ($self->{debug});
1513
1514 $url = new URI($url);
1515 if (
1516 !$url || !$url->scheme || !$url->scheme eq 'http' ||
1517 !$url->host || !$url->port || $url->port < 1
1518 ) {
1519 carp "can't parse $url\n";
1520 return -1;
1521 }
1522
1523 my $ua = LWP::UserAgent->new;
1524 $ua->agent( "Search-Estraier/$Search::Estraier::VERSION" );
1525
1526 my $req;
1527 if ($reqbody) {
1528 $req = HTTP::Request->new(POST => $url);
1529 } else {
1530 $req = HTTP::Request->new(GET => $url);
1531 }
1532
1533 $req->headers->header( 'Host' => $url->host . ":" . $url->port );
1534 $req->headers->header( 'Connection', 'close' );
1535 $req->headers->header( 'Authorization', 'Basic ' . $self->{auth} ) if ($self->{auth});
1536 $req->content_type( $content_type );
1537
1538 warn $req->headers->as_string,"\n" if ($self->{debug});
1539
1540 if ($reqbody) {
1541 warn "$reqbody\n" if ($self->{debug});
1542 $req->content( $reqbody );
1543 }
1544
1545 my $res = $ua->request($req) || croak "can't make request to $url: $!";
1546
1547 warn "## response status: ",$res->status_line,"\n" if ($self->{debug});
1548
1549 ($self->{status}, $self->{status_message}) = split(/\s+/, $res->status_line, 2);
1550
1551 if (! $res->is_success) {
1552 if ($self->{croak_on_error}) {
1553 croak("can't get $url: ",$res->status_line);
1554 } else {
1555 return -1;
1556 }
1557 }
1558
1559 $$resbody .= $res->content;
1560
1561 warn "## response body:\n$$resbody\n" if ($resbody && $self->{debug});
1562
1563 return $self->{status};
1564 }
1565
1566
1567 =head2 set_snippet_width
1568
1569 Set width of snippets in results
1570
1571 $node->set_snippet_width( $wwidth, $hwidth, $awidth );
1572
1573 C<$wwidth> specifies whole width of snippet. It's C<480> by default. If it's C<0> snippet
1574 is not sent with results. If it is negative, whole document text is sent instead of snippet.
1575
1576 C<$hwidth> specified width of strings from beginning of string. Default
1577 value is C<96>. Negative or zero value keep previous value.
1578
1579 C<$awidth> specifies width of strings around each highlighted word. It's C<96> by default.
1580 If negative of zero value is provided previous value is kept unchanged.
1581
1582 =cut
1583
1584 sub set_snippet_width {
1585 my $self = shift;
1586
1587 my ($wwidth, $hwidth, $awidth) = @_;
1588 $self->{wwidth} = $wwidth;
1589 $self->{hwidth} = $hwidth if ($hwidth >= 0);
1590 $self->{awidth} = $awidth if ($awidth >= 0);
1591 }
1592
1593
1594 =head2 set_user
1595
1596 Manage users of node
1597
1598 $node->set_user( 'name', $mode );
1599
1600 C<$mode> can be one of:
1601
1602 =over 4
1603
1604 =item 0
1605
1606 delete account
1607
1608 =item 1
1609
1610 set administrative right for user
1611
1612 =item 2
1613
1614 set user account as guest
1615
1616 =back
1617
1618 Return true on success, otherwise false.
1619
1620 =cut
1621
1622 sub set_user {
1623 my $self = shift;
1624 my ($name, $mode) = @_;
1625
1626 return unless ($self->{url});
1627 croak "mode must be number, not '$mode'" unless ($mode =~ m/^\d+$/);
1628
1629 $self->shuttle_url( $self->{url} . '/_set_user',
1630 'text/plain',
1631 'name=' . uri_escape($name) . '&mode=' . $mode,
1632 undef
1633 ) == 200;
1634 }
1635
1636
1637 =head2 set_link
1638
1639 Manage node links
1640
1641 $node->set_link('http://localhost:1978/node/another', 'another node label', $credit);
1642
1643 If C<$credit> is negative, link is removed.
1644
1645 =cut
1646
1647 sub set_link {
1648 my $self = shift;
1649 my ($url, $label, $credit) = @_;
1650
1651 return unless ($self->{url});
1652 croak "mode credit be number, not '$credit'" unless ($credit =~ m/^\d+$/);
1653
1654 my $reqbody = 'url=' . uri_escape($url) . '&label=' . uri_escape($label);
1655 $reqbody .= '&credit=' . $credit if ($credit > 0);
1656
1657 $self->shuttle_url( $self->{url} . '/_set_link',
1658 'application/x-www-form-urlencoded',
1659 $reqbody,
1660 undef
1661 ) == 200;
1662 }
1663
1664
1665 =head1 PRIVATE METHODS
1666
1667 You could call those directly, but you don't have to. I hope.
1668
1669 =head2 _set_info
1670
1671 Set information for node
1672
1673 $node->_set_info;
1674
1675 =cut
1676
1677 sub _set_info {
1678 my $self = shift;
1679
1680 $self->{status} = -1;
1681 return unless ($self->{url});
1682
1683 my $resbody;
1684 my $rv = $self->shuttle_url( $self->{url} . '/inform',
1685 'text/plain',
1686 undef,
1687 \$resbody,
1688 );
1689
1690 return if ($rv != 200 || !$resbody);
1691
1692 # it seems that response can have multiple line endings
1693 $resbody =~ s/[\r\n]+$//;
1694
1695 ( $self->{name}, $self->{label}, $self->{dnum}, $self->{wnum}, $self->{size} ) =
1696 split(/\t/, $resbody, 5);
1697
1698 }
1699
1700 ###
1701
1702 =head1 EXPORT
1703
1704 Nothing.
1705
1706 =head1 SEE ALSO
1707
1708 L<http://hyperestraier.sourceforge.net/>
1709
1710 Hyper Estraier Ruby interface on which this module is based.
1711
1712 =head1 AUTHOR
1713
1714 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
1715
1716
1717 =head1 COPYRIGHT AND LICENSE
1718
1719 Copyright (C) 2005-2006 by Dobrica Pavlinusic
1720
1721 This library is free software; you can redistribute it and/or modify
1722 it under the GPL v2 or later.
1723
1724 =cut
1725
1726 1;

  ViewVC Help
Powered by ViewVC 1.1.26