/[Search-Estraier]/trunk/Estraier.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/Estraier.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 108 - (show annotations)
Sun Feb 19 17:13:57 2006 UTC (18 years, 1 month ago) by dpavlin
File size: 31582 byte(s)
fix typo
1 package Search::Estraier;
2
3 use 5.008;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.04_2';
8
9 =head1 NAME
10
11 Search::Estraier - pure perl module to use Hyper Estraier search engine
12
13 =head1 SYNOPSIS
14
15 =head2 Simple indexer
16
17 use Search::Estraier;
18
19 # create and configure node
20 my $node = new Search::Estraier::Node(
21 url => 'http://localhost:1978/node/test',
22 user => 'admin',
23 passwd => 'admin'
24 );
25
26 # create document
27 my $doc = new Search::Estraier::Document;
28
29 # add attributes
30 $doc->add_attr('@uri', "http://estraier.gov/example.txt");
31 $doc->add_attr('@title', "Over the Rainbow");
32
33 # add body text to document
34 $doc->add_text("Somewhere over the rainbow. Way up high.");
35 $doc->add_text("There's a land that I heard of once in a lullaby.");
36
37 die "error: ", $node->status,"\n" unless (eval { $node->put_doc($doc) });
38
39 =head2 Simple searcher
40
41 use Search::Estraier;
42
43 # create and configure node
44 my $node = new Search::Estraier::Node(
45 url => 'http://localhost:1978/node/test',
46 user => 'admin',
47 passwd => 'admin',
48 croak_on_error => 1,
49 );
50
51 # create condition
52 my $cond = new Search::Estraier::Condition;
53
54 # set search phrase
55 $cond->set_phrase("rainbow AND lullaby");
56
57 my $nres = $node->search($cond, 0);
58
59 if (defined($nres)) {
60 print "Got ", $nres->hits, " results\n";
61
62 # for each document in results
63 for my $i ( 0 ... $nres->doc_num - 1 ) {
64 # get result document
65 my $rdoc = $nres->get_doc($i);
66 # display attribte
67 print "URI: ", $rdoc->attr('@uri'),"\n";
68 print "Title: ", $rdoc->attr('@title'),"\n";
69 print $rdoc->snippet,"\n";
70 }
71 } else {
72 die "error: ", $node->status,"\n";
73 }
74
75 =head1 DESCRIPTION
76
77 This module is implementation of node API of Hyper Estraier. Since it's
78 perl-only module with dependencies only on standard perl modules, it will
79 run on all platforms on which perl runs. It doesn't require compilation
80 or Hyper Estraier development files on target machine.
81
82 It is implemented as multiple packages which closly resamble Ruby
83 implementation. It also includes methods to manage nodes.
84
85 There are few examples in C<scripts> directory of this distribution.
86
87 =cut
88
89 =head1 Inheritable common methods
90
91 This methods should really move somewhere else.
92
93 =head2 _s
94
95 Remove multiple whitespaces from string, as well as whitespaces at beginning or end
96
97 my $text = $self->_s(" this is a text ");
98 $text = 'this is a text';
99
100 =cut
101
102 sub _s {
103 my $text = $_[1];
104 return unless defined($text);
105 $text =~ s/\s\s+/ /gs;
106 $text =~ s/^\s+//;
107 $text =~ s/\s+$//;
108 return $text;
109 }
110
111 package Search::Estraier::Document;
112
113 use Carp qw/croak confess/;
114
115 use Search::Estraier;
116 our @ISA = qw/Search::Estraier/;
117
118 =head1 Search::Estraier::Document
119
120 This class implements Document which is collection of attributes
121 (key=value), vectors (also key value) display text and hidden text.
122
123
124 =head2 new
125
126 Create new document, empty or from draft.
127
128 my $doc = new Search::HyperEstraier::Document;
129 my $doc2 = new Search::HyperEstraier::Document( $draft );
130
131 =cut
132
133 sub new {
134 my $class = shift;
135 my $self = {};
136 bless($self, $class);
137
138 $self->{id} = -1;
139
140 my $draft = shift;
141
142 if ($draft) {
143 my $in_text = 0;
144 foreach my $line (split(/\n/, $draft)) {
145
146 if ($in_text) {
147 if ($line =~ /^\t/) {
148 push @{ $self->{htexts} }, substr($line, 1);
149 } else {
150 push @{ $self->{dtexts} }, $line;
151 }
152 next;
153 }
154
155 if ($line =~ m/^%VECTOR\t(.+)$/) {
156 my @fields = split(/\t/, $1);
157 for my $i ( 0 .. ($#fields - 1) ) {
158 $self->{kwords}->{ $fields[ $i ] } = $fields[ $i + 1 ];
159 $i++;
160 }
161 next;
162 } elsif ($line =~ m/^%/) {
163 # What is this? comment?
164 #warn "$line\n";
165 next;
166 } elsif ($line =~ m/^$/) {
167 $in_text = 1;
168 next;
169 } elsif ($line =~ m/^(.+)=(.*)$/) {
170 $self->{attrs}->{ $1 } = $2;
171 next;
172 }
173
174 warn "draft ignored: '$line'\n";
175 }
176 }
177
178 $self ? return $self : return undef;
179 }
180
181
182 =head2 add_attr
183
184 Add an attribute.
185
186 $doc->add_attr( name => 'value' );
187
188 Delete attribute using
189
190 $doc->add_attr( name => undef );
191
192 =cut
193
194 sub add_attr {
195 my $self = shift;
196 my $attrs = {@_};
197
198 while (my ($name, $value) = each %{ $attrs }) {
199 if (! defined($value)) {
200 delete( $self->{attrs}->{ $self->_s($name) } );
201 } else {
202 $self->{attrs}->{ $self->_s($name) } = $self->_s($value);
203 }
204 }
205
206 return 1;
207 }
208
209
210 =head2 add_text
211
212 Add a sentence of text.
213
214 $doc->add_text('this is example text to display');
215
216 =cut
217
218 sub add_text {
219 my $self = shift;
220 my $text = shift;
221 return unless defined($text);
222
223 push @{ $self->{dtexts} }, $self->_s($text);
224 }
225
226
227 =head2 add_hidden_text
228
229 Add a hidden sentence.
230
231 $doc->add_hidden_text('this is example text just for search');
232
233 =cut
234
235 sub add_hidden_text {
236 my $self = shift;
237 my $text = shift;
238 return unless defined($text);
239
240 push @{ $self->{htexts} }, $self->_s($text);
241 }
242
243
244 =head2 id
245
246 Get the ID number of document. If the object has never been registred, C<-1> is returned.
247
248 print $doc->id;
249
250 =cut
251
252 sub id {
253 my $self = shift;
254 return $self->{id};
255 }
256
257
258 =head2 attr_names
259
260 Returns array with attribute names from document object.
261
262 my @attrs = $doc->attr_names;
263
264 =cut
265
266 sub attr_names {
267 my $self = shift;
268 return unless ($self->{attrs});
269 #croak "attr_names return array, not scalar" if (! wantarray);
270 return sort keys %{ $self->{attrs} };
271 }
272
273
274 =head2 attr
275
276 Returns value of an attribute.
277
278 my $value = $doc->attr( 'attribute' );
279
280 =cut
281
282 sub attr {
283 my $self = shift;
284 my $name = shift;
285 return unless (defined($name) && $self->{attrs});
286 return $self->{attrs}->{ $name };
287 }
288
289
290 =head2 texts
291
292 Returns array with text sentences.
293
294 my @texts = $doc->texts;
295
296 =cut
297
298 sub texts {
299 my $self = shift;
300 #confess "texts return array, not scalar" if (! wantarray);
301 return @{ $self->{dtexts} } if ($self->{dtexts});
302 }
303
304
305 =head2 cat_texts
306
307 Return whole text as single scalar.
308
309 my $text = $doc->cat_texts;
310
311 =cut
312
313 sub cat_texts {
314 my $self = shift;
315 return join(' ',@{ $self->{dtexts} }) if ($self->{dtexts});
316 }
317
318
319 =head2 dump_draft
320
321 Dump draft data from document object.
322
323 print $doc->dump_draft;
324
325 =cut
326
327 sub dump_draft {
328 my $self = shift;
329 my $draft;
330
331 foreach my $attr_name (sort keys %{ $self->{attrs} }) {
332 next unless defined(my $v = $self->{attrs}->{$attr_name});
333 $draft .= $attr_name . '=' . $v . "\n";
334 }
335
336 if ($self->{kwords}) {
337 $draft .= '%%VECTOR';
338 while (my ($key, $value) = each %{ $self->{kwords} }) {
339 $draft .= "\t$key\t$value";
340 }
341 $draft .= "\n";
342 }
343
344 $draft .= "\n";
345
346 $draft .= join("\n", @{ $self->{dtexts} }) . "\n" if ($self->{dtexts});
347 $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n" if ($self->{htexts});
348
349 return $draft;
350 }
351
352
353 =head2 delete
354
355 Empty document object
356
357 $doc->delete;
358
359 This function is addition to original Ruby API, and since it was included in C wrappers it's here as a
360 convinience. Document objects which go out of scope will be destroyed
361 automatically.
362
363 =cut
364
365 sub delete {
366 my $self = shift;
367
368 foreach my $data (qw/attrs dtexts stexts kwords/) {
369 delete($self->{$data});
370 }
371
372 $self->{id} = -1;
373
374 return 1;
375 }
376
377
378
379 package Search::Estraier::Condition;
380
381 use Carp qw/carp confess croak/;
382
383 use Search::Estraier;
384 our @ISA = qw/Search::Estraier/;
385
386 =head1 Search::Estraier::Condition
387
388 =head2 new
389
390 my $cond = new Search::HyperEstraier::Condition;
391
392 =cut
393
394 sub new {
395 my $class = shift;
396 my $self = {};
397 bless($self, $class);
398
399 $self->{max} = -1;
400 $self->{options} = 0;
401
402 $self ? return $self : return undef;
403 }
404
405
406 =head2 set_phrase
407
408 $cond->set_phrase('search phrase');
409
410 =cut
411
412 sub set_phrase {
413 my $self = shift;
414 $self->{phrase} = $self->_s( shift );
415 }
416
417
418 =head2 add_attr
419
420 $cond->add_attr('@URI STRINC /~dpavlin/');
421
422 =cut
423
424 sub add_attr {
425 my $self = shift;
426 my $attr = shift || return;
427 push @{ $self->{attrs} }, $self->_s( $attr );
428 }
429
430
431 =head2 set_order
432
433 $cond->set_order('@mdate NUMD');
434
435 =cut
436
437 sub set_order {
438 my $self = shift;
439 $self->{order} = shift;
440 }
441
442
443 =head2 set_max
444
445 $cond->set_max(42);
446
447 =cut
448
449 sub set_max {
450 my $self = shift;
451 my $max = shift;
452 croak "set_max needs number, not '$max'" unless ($max =~ m/^\d+$/);
453 $self->{max} = $max;
454 }
455
456
457 =head2 set_options
458
459 $cond->set_options( 'SURE' );
460
461 $cond->set_options( qw/AGITO NOIDF SIMPLE/ );
462
463 Possible options are:
464
465 =over 8
466
467 =item SURE
468
469 check every N-gram
470
471 =item USUAL
472
473 check every second N-gram
474
475 =item FAST
476
477 check every third N-gram
478
479 =item AGITO
480
481 check every fourth N-gram
482
483 =item NOIDF
484
485 don't perform TF-IDF tuning
486
487 =item SIMPLE
488
489 use simplified query phrase
490
491 =back
492
493 Skipping N-grams will speed up search, but reduce accuracy. Every call to C<set_options> will reset previous
494 options;
495
496 This option changed in version C<0.04> of this module. It's backwards compatibile.
497
498 =cut
499
500 my $options = {
501 SURE => 1 << 0,
502 USUAL => 1 << 1,
503 FAST => 1 << 2,
504 AGITO => 1 << 3,
505 NOIDF => 1 << 4,
506 SIMPLE => 1 << 10,
507 };
508
509 sub set_options {
510 my $self = shift;
511 my $opt = 0;
512 foreach my $option (@_) {
513 my $mask;
514 unless ($mask = $options->{$option}) {
515 if ($option eq '1') {
516 next;
517 } else {
518 croak "unknown option $option";
519 }
520 }
521 $opt += $mask;
522 }
523 $self->{options} = $opt;
524 }
525
526
527 =head2 phrase
528
529 Return search phrase.
530
531 print $cond->phrase;
532
533 =cut
534
535 sub phrase {
536 my $self = shift;
537 return $self->{phrase};
538 }
539
540
541 =head2 order
542
543 Return search result order.
544
545 print $cond->order;
546
547 =cut
548
549 sub order {
550 my $self = shift;
551 return $self->{order};
552 }
553
554
555 =head2 attrs
556
557 Return search result attrs.
558
559 my @cond_attrs = $cond->attrs;
560
561 =cut
562
563 sub attrs {
564 my $self = shift;
565 #croak "attrs return array, not scalar" if (! wantarray);
566 return @{ $self->{attrs} } if ($self->{attrs});
567 }
568
569
570 =head2 max
571
572 Return maximum number of results.
573
574 print $cond->max;
575
576 C<-1> is returned for unitialized value, C<0> is unlimited.
577
578 =cut
579
580 sub max {
581 my $self = shift;
582 return $self->{max};
583 }
584
585
586 =head2 options
587
588 Return options for this condition.
589
590 print $cond->options;
591
592 Options are returned in numerical form.
593
594 =cut
595
596 sub options {
597 my $self = shift;
598 return $self->{options};
599 }
600
601
602 package Search::Estraier::ResultDocument;
603
604 use Carp qw/croak/;
605
606 #use Search::Estraier;
607 #our @ISA = qw/Search::Estraier/;
608
609 =head1 Search::Estraier::ResultDocument
610
611 =head2 new
612
613 my $rdoc = new Search::HyperEstraier::ResultDocument(
614 uri => 'http://localhost/document/uri/42',
615 attrs => {
616 foo => 1,
617 bar => 2,
618 },
619 snippet => 'this is a text of snippet'
620 keywords => 'this\tare\tkeywords'
621 );
622
623 =cut
624
625 sub new {
626 my $class = shift;
627 my $self = {@_};
628 bless($self, $class);
629
630 croak "missing uri for ResultDocument" unless defined($self->{uri});
631
632 $self ? return $self : return undef;
633 }
634
635
636 =head2 uri
637
638 Return URI of result document
639
640 print $rdoc->uri;
641
642 =cut
643
644 sub uri {
645 my $self = shift;
646 return $self->{uri};
647 }
648
649
650 =head2 attr_names
651
652 Returns array with attribute names from result document object.
653
654 my @attrs = $rdoc->attr_names;
655
656 =cut
657
658 sub attr_names {
659 my $self = shift;
660 croak "attr_names return array, not scalar" if (! wantarray);
661 return sort keys %{ $self->{attrs} };
662 }
663
664
665 =head2 attr
666
667 Returns value of an attribute.
668
669 my $value = $rdoc->attr( 'attribute' );
670
671 =cut
672
673 sub attr {
674 my $self = shift;
675 my $name = shift || return;
676 return $self->{attrs}->{ $name };
677 }
678
679
680 =head2 snippet
681
682 Return snippet from result document
683
684 print $rdoc->snippet;
685
686 =cut
687
688 sub snippet {
689 my $self = shift;
690 return $self->{snippet};
691 }
692
693
694 =head2 keywords
695
696 Return keywords from result document
697
698 print $rdoc->keywords;
699
700 =cut
701
702 sub keywords {
703 my $self = shift;
704 return $self->{keywords};
705 }
706
707
708 package Search::Estraier::NodeResult;
709
710 use Carp qw/croak/;
711
712 #use Search::Estraier;
713 #our @ISA = qw/Search::Estraier/;
714
715 =head1 Search::Estraier::NodeResult
716
717 =head2 new
718
719 my $res = new Search::HyperEstraier::NodeResult(
720 docs => @array_of_rdocs,
721 hits => %hash_with_hints,
722 );
723
724 =cut
725
726 sub new {
727 my $class = shift;
728 my $self = {@_};
729 bless($self, $class);
730
731 foreach my $f (qw/docs hints/) {
732 croak "missing $f for ResultDocument" unless defined($self->{$f});
733 }
734
735 $self ? return $self : return undef;
736 }
737
738
739 =head2 doc_num
740
741 Return number of documents
742
743 print $res->doc_num;
744
745 This will return real number of documents (limited by C<max>).
746 If you want to get total number of hits, see C<hits>.
747
748 =cut
749
750 sub doc_num {
751 my $self = shift;
752 return $#{$self->{docs}} + 1;
753 }
754
755
756 =head2 get_doc
757
758 Return single document
759
760 my $doc = $res->get_doc( 42 );
761
762 Returns undef if document doesn't exist.
763
764 =cut
765
766 sub get_doc {
767 my $self = shift;
768 my $num = shift;
769 croak "expect number as argument, not '$num'" unless ($num =~ m/^\d+$/);
770 return undef if ($num < 0 || $num > $self->{docs});
771 return $self->{docs}->[$num];
772 }
773
774
775 =head2 hint
776
777 Return specific hint from results.
778
779 print $res->hint( 'VERSION' );
780
781 Possible hints are: C<VERSION>, C<NODE>, C<HIT>, C<HINT#n>, C<DOCNUM>, C<WORDNUM>,
782 C<TIME>, C<LINK#n>, C<VIEW>.
783
784 =cut
785
786 sub hint {
787 my $self = shift;
788 my $key = shift || return;
789 return $self->{hints}->{$key};
790 }
791
792 =head2 hints
793
794 More perlish version of C<hint>. This one returns hash.
795
796 my %hints = $res->hints;
797
798 =cut
799
800 sub hints {
801 my $self = shift;
802 return $self->{hints};
803 }
804
805 =head2 hits
806
807 Syntaxtic sugar for total number of hits for this query
808
809 print $res->hits;
810
811 It's same as
812
813 print $res->hint('HIT');
814
815 but shorter.
816
817 =cut
818
819 sub hits {
820 my $self = shift;
821 return $self->{hints}->{'HIT'} || 0;
822 }
823
824 package Search::Estraier::Node;
825
826 use Carp qw/carp croak confess/;
827 use URI;
828 use MIME::Base64;
829 use IO::Socket::INET;
830 use URI::Escape qw/uri_escape/;
831
832 =head1 Search::Estraier::Node
833
834 =head2 new
835
836 my $node = new Search::HyperEstraier::Node;
837
838 or optionally with C<url> as parametar
839
840 my $node = new Search::HyperEstraier::Node( 'http://localhost:1978/node/test' );
841
842 or in more verbose form
843
844 my $node = new Search::HyperEstraier::Node(
845 url => 'http://localhost:1978/node/test',
846 debug => 1,
847 croak_on_error => 1
848 );
849
850 with following arguments:
851
852 =over 4
853
854 =item url
855
856 URL to node
857
858 =item debug
859
860 dumps a B<lot> of debugging output
861
862 =item croak_on_error
863
864 very helpful during development. It will croak on all errors instead of
865 silently returning C<-1> (which is convention of Hyper Estraier API in other
866 languages).
867
868 =back
869
870 =cut
871
872 sub new {
873 my $class = shift;
874 my $self = {
875 pxport => -1,
876 timeout => 0, # this used to be -1
877 dnum => -1,
878 wnum => -1,
879 size => -1.0,
880 wwidth => 480,
881 hwidth => 96,
882 awidth => 96,
883 status => -1,
884 };
885 bless($self, $class);
886
887 if ($#_ == 0) {
888 $self->{url} = shift;
889 } else {
890 my $args = {@_};
891
892 %$self = ( %$self, @_ );
893
894 warn "## Node debug on\n" if ($self->{debug});
895 }
896
897 $self ? return $self : return undef;
898 }
899
900
901 =head2 set_url
902
903 Specify URL to node server
904
905 $node->set_url('http://localhost:1978');
906
907 =cut
908
909 sub set_url {
910 my $self = shift;
911 $self->{url} = shift;
912 }
913
914
915 =head2 set_proxy
916
917 Specify proxy server to connect to node server
918
919 $node->set_proxy('proxy.example.com', 8080);
920
921 =cut
922
923 sub set_proxy {
924 my $self = shift;
925 my ($host,$port) = @_;
926 croak "proxy port must be number, not '$port'" unless ($port =~ m/^\d+$/);
927 $self->{pxhost} = $host;
928 $self->{pxport} = $port;
929 }
930
931
932 =head2 set_timeout
933
934 Specify timeout of connection in seconds
935
936 $node->set_timeout( 15 );
937
938 =cut
939
940 sub set_timeout {
941 my $self = shift;
942 my $sec = shift;
943 croak "timeout must be number, not '$sec'" unless ($sec =~ m/^\d+$/);
944 $self->{timeout} = $sec;
945 }
946
947
948 =head2 set_auth
949
950 Specify name and password for authentication to node server.
951
952 $node->set_auth('clint','eastwood');
953
954 =cut
955
956 sub set_auth {
957 my $self = shift;
958 my ($login,$passwd) = @_;
959 my $basic_auth = encode_base64( "$login:$passwd" );
960 chomp($basic_auth);
961 $self->{auth} = $basic_auth;
962 }
963
964
965 =head2 status
966
967 Return status code of last request.
968
969 print $node->status;
970
971 C<-1> means connection failure.
972
973 =cut
974
975 sub status {
976 my $self = shift;
977 return $self->{status};
978 }
979
980
981 =head2 put_doc
982
983 Add a document
984
985 $node->put_doc( $document_draft ) or die "can't add document";
986
987 Return true on success or false on failture.
988
989 =cut
990
991 sub put_doc {
992 my $self = shift;
993 my $doc = shift || return;
994 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
995 $self->shuttle_url( $self->{url} . '/put_doc',
996 'text/x-estraier-draft',
997 $doc->dump_draft,
998 undef
999 ) == 200;
1000 }
1001
1002
1003 =head2 out_doc
1004
1005 Remove a document
1006
1007 $node->out_doc( document_id ) or "can't remove document";
1008
1009 Return true on success or false on failture.
1010
1011 =cut
1012
1013 sub out_doc {
1014 my $self = shift;
1015 my $id = shift || return;
1016 return unless ($self->{url});
1017 croak "id must be number, not '$id'" unless ($id =~ m/^\d+$/);
1018 $self->shuttle_url( $self->{url} . '/out_doc',
1019 'application/x-www-form-urlencoded',
1020 "id=$id",
1021 undef
1022 ) == 200;
1023 }
1024
1025
1026 =head2 out_doc_by_uri
1027
1028 Remove a registrated document using it's uri
1029
1030 $node->out_doc_by_uri( 'file:///document/uri/42' ) or "can't remove document";
1031
1032 Return true on success or false on failture.
1033
1034 =cut
1035
1036 sub out_doc_by_uri {
1037 my $self = shift;
1038 my $uri = shift || return;
1039 return unless ($self->{url});
1040 $self->shuttle_url( $self->{url} . '/out_doc',
1041 'application/x-www-form-urlencoded',
1042 "uri=" . uri_escape($uri),
1043 undef
1044 ) == 200;
1045 }
1046
1047
1048 =head2 edit_doc
1049
1050 Edit attributes of a document
1051
1052 $node->edit_doc( $document_draft ) or die "can't edit document";
1053
1054 Return true on success or false on failture.
1055
1056 =cut
1057
1058 sub edit_doc {
1059 my $self = shift;
1060 my $doc = shift || return;
1061 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
1062 $self->shuttle_url( $self->{url} . '/edit_doc',
1063 'text/x-estraier-draft',
1064 $doc->dump_draft,
1065 undef
1066 ) == 200;
1067 }
1068
1069
1070 =head2 get_doc
1071
1072 Retreive document
1073
1074 my $doc = $node->get_doc( document_id ) or die "can't get document";
1075
1076 Return true on success or false on failture.
1077
1078 =cut
1079
1080 sub get_doc {
1081 my $self = shift;
1082 my $id = shift || return;
1083 return $self->_fetch_doc( id => $id );
1084 }
1085
1086
1087 =head2 get_doc_by_uri
1088
1089 Retreive document
1090
1091 my $doc = $node->get_doc_by_uri( 'file:///document/uri/42' ) or die "can't get document";
1092
1093 Return true on success or false on failture.
1094
1095 =cut
1096
1097 sub get_doc_by_uri {
1098 my $self = shift;
1099 my $uri = shift || return;
1100 return $self->_fetch_doc( uri => $uri );
1101 }
1102
1103
1104 =head2 get_doc_attr
1105
1106 Retrieve the value of an atribute from object
1107
1108 my $val = $node->get_doc_attr( document_id, 'attribute_name' ) or
1109 die "can't get document attribute";
1110
1111 =cut
1112
1113 sub get_doc_attr {
1114 my $self = shift;
1115 my ($id,$name) = @_;
1116 return unless ($id && $name);
1117 return $self->_fetch_doc( id => $id, attr => $name );
1118 }
1119
1120
1121 =head2 get_doc_attr_by_uri
1122
1123 Retrieve the value of an atribute from object
1124
1125 my $val = $node->get_doc_attr_by_uri( document_id, 'attribute_name' ) or
1126 die "can't get document attribute";
1127
1128 =cut
1129
1130 sub get_doc_attr_by_uri {
1131 my $self = shift;
1132 my ($uri,$name) = @_;
1133 return unless ($uri && $name);
1134 return $self->_fetch_doc( uri => $uri, attr => $name );
1135 }
1136
1137
1138 =head2 etch_doc
1139
1140 Exctract document keywords
1141
1142 my $keywords = $node->etch_doc( document_id ) or die "can't etch document";
1143
1144 =cut
1145
1146 sub etch_doc {
1147 my $self = shift;
1148 my $id = shift || return;
1149 return $self->_fetch_doc( id => $id, etch => 1 );
1150 }
1151
1152 =head2 etch_doc_by_uri
1153
1154 Retreive document
1155
1156 my $keywords = $node->etch_doc_by_uri( 'file:///document/uri/42' ) or die "can't etch document";
1157
1158 Return true on success or false on failture.
1159
1160 =cut
1161
1162 sub etch_doc_by_uri {
1163 my $self = shift;
1164 my $uri = shift || return;
1165 return $self->_fetch_doc( uri => $uri, etch => 1 );
1166 }
1167
1168
1169 =head2 uri_to_id
1170
1171 Get ID of document specified by URI
1172
1173 my $id = $node->uri_to_id( 'file:///document/uri/42' );
1174
1175 This method won't croak, even if using C<croak_on_error>.
1176
1177 =cut
1178
1179 sub uri_to_id {
1180 my $self = shift;
1181 my $uri = shift || return;
1182 return $self->_fetch_doc( uri => $uri, path => '/uri_to_id', chomp_resbody => 1, croak_on_error => 0 );
1183 }
1184
1185
1186 =head2 _fetch_doc
1187
1188 Private function used for implementing of C<get_doc>, C<get_doc_by_uri>,
1189 C<etch_doc>, C<etch_doc_by_uri>.
1190
1191 # this will decode received draft into Search::Estraier::Document object
1192 my $doc = $node->_fetch_doc( id => 42 );
1193 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42' );
1194
1195 # to extract keywords, add etch
1196 my $doc = $node->_fetch_doc( id => 42, etch => 1 );
1197 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', etch => 1 );
1198
1199 # to get document attrubute add attr
1200 my $doc = $node->_fetch_doc( id => 42, attr => '@mdate' );
1201 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', attr => '@mdate' );
1202
1203 # more general form which allows implementation of
1204 # uri_to_id
1205 my $id = $node->_fetch_doc(
1206 uri => 'file:///document/uri/42',
1207 path => '/uri_to_id',
1208 chomp_resbody => 1
1209 );
1210
1211 =cut
1212
1213 sub _fetch_doc {
1214 my $self = shift;
1215 my $a = {@_};
1216 return unless ( ($a->{id} || $a->{uri}) && $self->{url} );
1217
1218 my ($arg, $resbody);
1219
1220 my $path = $a->{path} || '/get_doc';
1221 $path = '/etch_doc' if ($a->{etch});
1222
1223 if ($a->{id}) {
1224 croak "id must be numberm not '$a->{id}'" unless ($a->{id} =~ m/^\d+$/);
1225 $arg = 'id=' . $a->{id};
1226 } elsif ($a->{uri}) {
1227 $arg = 'uri=' . uri_escape($a->{uri});
1228 } else {
1229 confess "unhandled argument. Need id or uri.";
1230 }
1231
1232 if ($a->{attr}) {
1233 $path = '/get_doc_attr';
1234 $arg .= '&attr=' . uri_escape($a->{attr});
1235 $a->{chomp_resbody} = 1;
1236 }
1237
1238 my $rv = $self->shuttle_url( $self->{url} . $path,
1239 'application/x-www-form-urlencoded',
1240 $arg,
1241 \$resbody,
1242 $a->{croak_on_error},
1243 );
1244
1245 return if ($rv != 200);
1246
1247 if ($a->{etch}) {
1248 $self->{kwords} = {};
1249 return +{} unless ($resbody);
1250 foreach my $l (split(/\n/, $resbody)) {
1251 my ($k,$v) = split(/\t/, $l, 2);
1252 $self->{kwords}->{$k} = $v if ($v);
1253 }
1254 return $self->{kwords};
1255 } elsif ($a->{chomp_resbody}) {
1256 return unless (defined($resbody));
1257 chomp($resbody);
1258 return $resbody;
1259 } else {
1260 return new Search::Estraier::Document($resbody);
1261 }
1262 }
1263
1264
1265 =head2 name
1266
1267 my $node_name = $node->name;
1268
1269 =cut
1270
1271 sub name {
1272 my $self = shift;
1273 $self->_set_info unless ($self->{name});
1274 return $self->{name};
1275 }
1276
1277
1278 =head2 label
1279
1280 my $node_label = $node->label;
1281
1282 =cut
1283
1284 sub label {
1285 my $self = shift;
1286 $self->_set_info unless ($self->{label});
1287 return $self->{label};
1288 }
1289
1290
1291 =head2 doc_num
1292
1293 my $documents_in_node = $node->doc_num;
1294
1295 =cut
1296
1297 sub doc_num {
1298 my $self = shift;
1299 $self->_set_info if ($self->{dnum} < 0);
1300 return $self->{dnum};
1301 }
1302
1303
1304 =head2 word_num
1305
1306 my $words_in_node = $node->word_num;
1307
1308 =cut
1309
1310 sub word_num {
1311 my $self = shift;
1312 $self->_set_info if ($self->{wnum} < 0);
1313 return $self->{wnum};
1314 }
1315
1316
1317 =head2 size
1318
1319 my $node_size = $node->size;
1320
1321 =cut
1322
1323 sub size {
1324 my $self = shift;
1325 $self->_set_info if ($self->{size} < 0);
1326 return $self->{size};
1327 }
1328
1329
1330 =head2 search
1331
1332 Search documents which match condition
1333
1334 my $nres = $node->search( $cond, $depth );
1335
1336 C<$cond> is C<Search::Estraier::Condition> object, while <$depth> specifies
1337 depth for meta search.
1338
1339 Function results C<Search::Estraier::NodeResult> object.
1340
1341 =cut
1342
1343 sub search {
1344 my $self = shift;
1345 my ($cond, $depth) = @_;
1346 return unless ($cond && defined($depth) && $self->{url});
1347 croak "cond mush be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1348 croak "depth needs number, not '$depth'" unless ($depth =~ m/^\d+$/);
1349
1350 my $resbody;
1351
1352 my $rv = $self->shuttle_url( $self->{url} . '/search',
1353 'application/x-www-form-urlencoded',
1354 $self->cond_to_query( $cond, $depth ),
1355 \$resbody,
1356 );
1357 return if ($rv != 200);
1358
1359 my (@docs, $hints);
1360
1361 my @lines = split(/\n/, $resbody);
1362 return unless (@lines);
1363
1364 my $border = $lines[0];
1365 my $isend = 0;
1366 my $lnum = 1;
1367
1368 while ( $lnum <= $#lines ) {
1369 my $line = $lines[$lnum];
1370 $lnum++;
1371
1372 #warn "## $line\n";
1373 if ($line && $line =~ m/^\Q$border\E(:END)*$/) {
1374 $isend = $1;
1375 last;
1376 }
1377
1378 if ($line =~ /\t/) {
1379 my ($k,$v) = split(/\t/, $line, 2);
1380 $hints->{$k} = $v;
1381 }
1382 }
1383
1384 my $snum = $lnum;
1385
1386 while( ! $isend && $lnum <= $#lines ) {
1387 my $line = $lines[$lnum];
1388 #warn "# $lnum: $line\n";
1389 $lnum++;
1390
1391 if ($line && $line =~ m/^\Q$border\E/) {
1392 if ($lnum > $snum) {
1393 my $rdattrs;
1394 my $rdvector;
1395 my $rdsnippet;
1396
1397 my $rlnum = $snum;
1398 while ($rlnum < $lnum - 1 ) {
1399 #my $rdline = $self->_s($lines[$rlnum]);
1400 my $rdline = $lines[$rlnum];
1401 $rlnum++;
1402 last unless ($rdline);
1403 if ($rdline =~ /^%/) {
1404 $rdvector = $1 if ($rdline =~ /^%VECTOR\t(.+)$/);
1405 } elsif($rdline =~ /=/) {
1406 $rdattrs->{$1} = $2 if ($rdline =~ /^(.+)=(.+)$/);
1407 } else {
1408 confess "invalid format of response";
1409 }
1410 }
1411 while($rlnum < $lnum - 1) {
1412 my $rdline = $lines[$rlnum];
1413 $rlnum++;
1414 $rdsnippet .= "$rdline\n";
1415 }
1416 #warn Dumper($rdvector, $rdattrs, $rdsnippet);
1417 if (my $rduri = $rdattrs->{'@uri'}) {
1418 push @docs, new Search::Estraier::ResultDocument(
1419 uri => $rduri,
1420 attrs => $rdattrs,
1421 snippet => $rdsnippet,
1422 keywords => $rdvector,
1423 );
1424 }
1425 }
1426 $snum = $lnum;
1427 #warn "### $line\n";
1428 $isend = 1 if ($line =~ /:END$/);
1429 }
1430
1431 }
1432
1433 if (! $isend) {
1434 warn "received result doesn't have :END\n$resbody";
1435 return;
1436 }
1437
1438 #warn Dumper(\@docs, $hints);
1439
1440 return new Search::Estraier::NodeResult( docs => \@docs, hints => $hints );
1441 }
1442
1443
1444 =head2 cond_to_query
1445
1446 Return URI encoded string generated from Search::Estraier::Condition
1447
1448 my $args = $node->cond_to_query( $cond, $depth );
1449
1450 =cut
1451
1452 sub cond_to_query {
1453 my $self = shift;
1454
1455 my $cond = shift || return;
1456 croak "condition must be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1457 my $depth = shift;
1458
1459 my @args;
1460
1461 if (my $phrase = $cond->phrase) {
1462 push @args, 'phrase=' . uri_escape($phrase);
1463 }
1464
1465 if (my @attrs = $cond->attrs) {
1466 for my $i ( 0 .. $#attrs ) {
1467 push @args,'attr' . ($i+1) . '=' . uri_escape( $attrs[$i] ) if ($attrs[$i]);
1468 }
1469 }
1470
1471 if (my $order = $cond->order) {
1472 push @args, 'order=' . uri_escape($order);
1473 }
1474
1475 if (my $max = $cond->max) {
1476 push @args, 'max=' . $max;
1477 } else {
1478 push @args, 'max=' . (1 << 30);
1479 }
1480
1481 if (my $options = $cond->options) {
1482 push @args, 'options=' . $options;
1483 }
1484
1485 push @args, 'depth=' . $depth if ($depth);
1486 push @args, 'wwidth=' . $self->{wwidth};
1487 push @args, 'hwidth=' . $self->{hwidth};
1488 push @args, 'awidth=' . $self->{awidth};
1489
1490 return join('&', @args);
1491 }
1492
1493
1494 =head2 shuttle_url
1495
1496 This is method which uses C<LWP::UserAgent> to communicate with Hyper Estraier node
1497 master.
1498
1499 my $rv = shuttle_url( $url, $content_type, $req_body, \$resbody );
1500
1501 C<$resheads> and C<$resbody> booleans controll if response headers and/or response
1502 body will be saved within object.
1503
1504 =cut
1505
1506 use LWP::UserAgent;
1507
1508 sub shuttle_url {
1509 my $self = shift;
1510
1511 my ($url, $content_type, $reqbody, $resbody, $croak_on_error) = @_;
1512
1513 $croak_on_error = $self->{croak_on_error} unless defined($croak_on_error);
1514
1515 $self->{status} = -1;
1516
1517 warn "## $url\n" if ($self->{debug});
1518
1519 $url = new URI($url);
1520 if (
1521 !$url || !$url->scheme || !$url->scheme eq 'http' ||
1522 !$url->host || !$url->port || $url->port < 1
1523 ) {
1524 carp "can't parse $url\n";
1525 return -1;
1526 }
1527
1528 my $ua = LWP::UserAgent->new;
1529 $ua->agent( "Search-Estraier/$Search::Estraier::VERSION" );
1530
1531 my $req;
1532 if ($reqbody) {
1533 $req = HTTP::Request->new(POST => $url);
1534 } else {
1535 $req = HTTP::Request->new(GET => $url);
1536 }
1537
1538 $req->headers->header( 'Host' => $url->host . ":" . $url->port );
1539 $req->headers->header( 'Connection', 'close' );
1540 $req->headers->header( 'Authorization', 'Basic ' . $self->{auth} ) if ($self->{auth});
1541 $req->content_type( $content_type );
1542
1543 warn $req->headers->as_string,"\n" if ($self->{debug});
1544
1545 if ($reqbody) {
1546 warn "$reqbody\n" if ($self->{debug});
1547 $req->content( $reqbody );
1548 }
1549
1550 my $res = $ua->request($req) || croak "can't make request to $url: $!";
1551
1552 warn "## response status: ",$res->status_line,"\n" if ($self->{debug});
1553
1554 ($self->{status}, $self->{status_message}) = split(/\s+/, $res->status_line, 2);
1555
1556 if (! $res->is_success) {
1557 if ($croak_on_error) {
1558 croak("can't get $url: ",$res->status_line);
1559 } else {
1560 return -1;
1561 }
1562 }
1563
1564 $$resbody .= $res->content;
1565
1566 warn "## response body:\n$$resbody\n" if ($resbody && $self->{debug});
1567
1568 return $self->{status};
1569 }
1570
1571
1572 =head2 set_snippet_width
1573
1574 Set width of snippets in results
1575
1576 $node->set_snippet_width( $wwidth, $hwidth, $awidth );
1577
1578 C<$wwidth> specifies whole width of snippet. It's C<480> by default. If it's C<0> snippet
1579 is not sent with results. If it is negative, whole document text is sent instead of snippet.
1580
1581 C<$hwidth> specified width of strings from beginning of string. Default
1582 value is C<96>. Negative or zero value keep previous value.
1583
1584 C<$awidth> specifies width of strings around each highlighted word. It's C<96> by default.
1585 If negative of zero value is provided previous value is kept unchanged.
1586
1587 =cut
1588
1589 sub set_snippet_width {
1590 my $self = shift;
1591
1592 my ($wwidth, $hwidth, $awidth) = @_;
1593 $self->{wwidth} = $wwidth;
1594 $self->{hwidth} = $hwidth if ($hwidth >= 0);
1595 $self->{awidth} = $awidth if ($awidth >= 0);
1596 }
1597
1598
1599 =head2 set_user
1600
1601 Manage users of node
1602
1603 $node->set_user( 'name', $mode );
1604
1605 C<$mode> can be one of:
1606
1607 =over 4
1608
1609 =item 0
1610
1611 delete account
1612
1613 =item 1
1614
1615 set administrative right for user
1616
1617 =item 2
1618
1619 set user account as guest
1620
1621 =back
1622
1623 Return true on success, otherwise false.
1624
1625 =cut
1626
1627 sub set_user {
1628 my $self = shift;
1629 my ($name, $mode) = @_;
1630
1631 return unless ($self->{url});
1632 croak "mode must be number, not '$mode'" unless ($mode =~ m/^\d+$/);
1633
1634 $self->shuttle_url( $self->{url} . '/_set_user',
1635 'text/plain',
1636 'name=' . uri_escape($name) . '&mode=' . $mode,
1637 undef
1638 ) == 200;
1639 }
1640
1641
1642 =head2 set_link
1643
1644 Manage node links
1645
1646 $node->set_link('http://localhost:1978/node/another', 'another node label', $credit);
1647
1648 If C<$credit> is negative, link is removed.
1649
1650 =cut
1651
1652 sub set_link {
1653 my $self = shift;
1654 my ($url, $label, $credit) = @_;
1655
1656 return unless ($self->{url});
1657 croak "mode credit be number, not '$credit'" unless ($credit =~ m/^\d+$/);
1658
1659 my $reqbody = 'url=' . uri_escape($url) . '&label=' . uri_escape($label);
1660 $reqbody .= '&credit=' . $credit if ($credit > 0);
1661
1662 if ($self->shuttle_url( $self->{url} . '/_set_link',
1663 'application/x-www-form-urlencoded',
1664 $reqbody,
1665 undef
1666 ) == 200) {
1667 # refresh node info after adding link
1668 $self->_set_info;
1669 return 1;
1670 }
1671 }
1672
1673 =head2 admins
1674
1675 my @admins = @{ $node->admins };
1676
1677 Return array of users with admin rights on node
1678
1679 =cut
1680
1681 sub admins {
1682 my $self = shift;
1683 $self->_set_info unless ($self->{name});
1684 return $self->{admins};
1685 }
1686
1687 =head2 guests
1688
1689 my @guests = @{ $node->guests };
1690
1691 Return array of users with guest rights on node
1692
1693 =cut
1694
1695 sub guests {
1696 my $self = shift;
1697 $self->_set_info unless ($self->{name});
1698 return $self->{guests};
1699 }
1700
1701 =head2 links
1702
1703 my $links = @{ $node->links };
1704
1705 Return array of links for this node
1706
1707 =cut
1708
1709 sub links {
1710 my $self = shift;
1711 $self->_set_info unless ($self->{name});
1712 return $self->{links};
1713 }
1714
1715
1716 =head1 PRIVATE METHODS
1717
1718 You could call those directly, but you don't have to. I hope.
1719
1720 =head2 _set_info
1721
1722 Set information for node
1723
1724 $node->_set_info;
1725
1726 =cut
1727
1728 sub _set_info {
1729 my $self = shift;
1730
1731 $self->{status} = -1;
1732 return unless ($self->{url});
1733
1734 my $resbody;
1735 my $rv = $self->shuttle_url( $self->{url} . '/inform',
1736 'text/plain',
1737 undef,
1738 \$resbody,
1739 );
1740
1741 return if ($rv != 200 || !$resbody);
1742
1743 my @lines = split(/[\r\n]/,$resbody);
1744
1745 ( $self->{name}, $self->{label}, $self->{dnum}, $self->{wnum}, $self->{size} ) =
1746 split(/\t/, shift @lines, 5);
1747
1748 return $resbody unless (@lines);
1749
1750 shift @lines;
1751
1752 while(my $admin = shift @lines) {
1753 push @{$self->{admins}}, $admin;
1754 }
1755
1756 while(my $guest = shift @lines) {
1757 push @{$self->{guests}}, $guest;
1758 }
1759
1760 while(my $link = shift @lines) {
1761 push @{$self->{links}}, $link;
1762 }
1763
1764 return $resbody;
1765
1766 }
1767
1768 ###
1769
1770 =head1 EXPORT
1771
1772 Nothing.
1773
1774 =head1 SEE ALSO
1775
1776 L<http://hyperestraier.sourceforge.net/>
1777
1778 Hyper Estraier Ruby interface on which this module is based.
1779
1780 =head1 AUTHOR
1781
1782 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
1783
1784
1785 =head1 COPYRIGHT AND LICENSE
1786
1787 Copyright (C) 2005-2006 by Dobrica Pavlinusic
1788
1789 This library is free software; you can redistribute it and/or modify
1790 it under the GPL v2 or later.
1791
1792 =cut
1793
1794 1;

  ViewVC Help
Powered by ViewVC 1.1.26