/[Search-Estraier]/trunk/lib/Search/Estraier.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/Search/Estraier.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 100 - (show annotations)
Sat Jan 28 19:41:59 2006 UTC (18 years, 3 months ago) by dpavlin
Original Path: trunk/Estraier.pm
File size: 30421 byte(s)
added $res->hits to get number of hits from estmaster hints
1 package Search::Estraier;
2
3 use 5.008;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.04_1';
8
9 =head1 NAME
10
11 Search::Estraier - pure perl module to use Hyper Estraier search engine
12
13 =head1 SYNOPSIS
14
15 =head2 Simple indexer
16
17 use Search::Estraier;
18
19 # create and configure node
20 my $node = new Search::Estraier::Node;
21 $node->set_url("http://localhost:1978/node/test");
22 $node->set_auth("admin","admin");
23
24 # create document
25 my $doc = new Search::Estraier::Document;
26
27 # add attributes
28 $doc->add_attr('@uri', "http://estraier.gov/example.txt");
29 $doc->add_attr('@title', "Over the Rainbow");
30
31 # add body text to document
32 $doc->add_text("Somewhere over the rainbow. Way up high.");
33 $doc->add_text("There's a land that I heard of once in a lullaby.");
34
35 die "error: ", $node->status,"\n" unless ($node->put_doc($doc));
36
37 =head2 Simple searcher
38
39 use Search::Estraier;
40
41 # create and configure node
42 my $node = new Search::Estraier::Node;
43 $node->set_url("http://localhost:1978/node/test");
44 $node->set_auth("admin","admin");
45
46 # create condition
47 my $cond = new Search::Estraier::Condition;
48
49 # set search phrase
50 $cond->set_phrase("rainbow AND lullaby");
51
52 my $nres = $node->search($cond, 0);
53 print "Got ", $nres->hits, " results\n";
54
55 if (defined($nres)) {
56 # for each document in results
57 for my $i ( 0 ... $nres->doc_num - 1 ) {
58 # get result document
59 my $rdoc = $nres->get_doc($i);
60 # display attribte
61 print "URI: ", $rdoc->attr('@uri'),"\n";
62 print "Title: ", $rdoc->attr('@title'),"\n";
63 print $rdoc->snippet,"\n";
64 }
65 } else {
66 die "error: ", $node->status,"\n";
67 }
68
69 =head1 DESCRIPTION
70
71 This module is implementation of node API of Hyper Estraier. Since it's
72 perl-only module with dependencies only on standard perl modules, it will
73 run on all platforms on which perl runs. It doesn't require compilation
74 or Hyper Estraier development files on target machine.
75
76 It is implemented as multiple packages which closly resamble Ruby
77 implementation. It also includes methods to manage nodes.
78
79 There are few examples in C<scripts> directory of this distribution.
80
81 =cut
82
83 =head1 Inheritable common methods
84
85 This methods should really move somewhere else.
86
87 =head2 _s
88
89 Remove multiple whitespaces from string, as well as whitespaces at beginning or end
90
91 my $text = $self->_s(" this is a text ");
92 $text = 'this is a text';
93
94 =cut
95
96 sub _s {
97 my $text = $_[1];
98 return unless defined($text);
99 $text =~ s/\s\s+/ /gs;
100 $text =~ s/^\s+//;
101 $text =~ s/\s+$//;
102 return $text;
103 }
104
105 package Search::Estraier::Document;
106
107 use Carp qw/croak confess/;
108
109 use Search::Estraier;
110 our @ISA = qw/Search::Estraier/;
111
112 =head1 Search::Estraier::Document
113
114 This class implements Document which is collection of attributes
115 (key=value), vectors (also key value) display text and hidden text.
116
117
118 =head2 new
119
120 Create new document, empty or from draft.
121
122 my $doc = new Search::HyperEstraier::Document;
123 my $doc2 = new Search::HyperEstraier::Document( $draft );
124
125 =cut
126
127 sub new {
128 my $class = shift;
129 my $self = {};
130 bless($self, $class);
131
132 $self->{id} = -1;
133
134 my $draft = shift;
135
136 if ($draft) {
137 my $in_text = 0;
138 foreach my $line (split(/\n/, $draft)) {
139
140 if ($in_text) {
141 if ($line =~ /^\t/) {
142 push @{ $self->{htexts} }, substr($line, 1);
143 } else {
144 push @{ $self->{dtexts} }, $line;
145 }
146 next;
147 }
148
149 if ($line =~ m/^%VECTOR\t(.+)$/) {
150 my @fields = split(/\t/, $1);
151 for my $i ( 0 .. ($#fields - 1) ) {
152 $self->{kwords}->{ $fields[ $i ] } = $fields[ $i + 1 ];
153 $i++;
154 }
155 next;
156 } elsif ($line =~ m/^%/) {
157 # What is this? comment?
158 #warn "$line\n";
159 next;
160 } elsif ($line =~ m/^$/) {
161 $in_text = 1;
162 next;
163 } elsif ($line =~ m/^(.+)=(.*)$/) {
164 $self->{attrs}->{ $1 } = $2;
165 next;
166 }
167
168 warn "draft ignored: '$line'\n";
169 }
170 }
171
172 $self ? return $self : return undef;
173 }
174
175
176 =head2 add_attr
177
178 Add an attribute.
179
180 $doc->add_attr( name => 'value' );
181
182 Delete attribute using
183
184 $doc->add_attr( name => undef );
185
186 =cut
187
188 sub add_attr {
189 my $self = shift;
190 my $attrs = {@_};
191
192 while (my ($name, $value) = each %{ $attrs }) {
193 if (! defined($value)) {
194 delete( $self->{attrs}->{ $self->_s($name) } );
195 } else {
196 $self->{attrs}->{ $self->_s($name) } = $self->_s($value);
197 }
198 }
199
200 return 1;
201 }
202
203
204 =head2 add_text
205
206 Add a sentence of text.
207
208 $doc->add_text('this is example text to display');
209
210 =cut
211
212 sub add_text {
213 my $self = shift;
214 my $text = shift;
215 return unless defined($text);
216
217 push @{ $self->{dtexts} }, $self->_s($text);
218 }
219
220
221 =head2 add_hidden_text
222
223 Add a hidden sentence.
224
225 $doc->add_hidden_text('this is example text just for search');
226
227 =cut
228
229 sub add_hidden_text {
230 my $self = shift;
231 my $text = shift;
232 return unless defined($text);
233
234 push @{ $self->{htexts} }, $self->_s($text);
235 }
236
237
238 =head2 id
239
240 Get the ID number of document. If the object has never been registred, C<-1> is returned.
241
242 print $doc->id;
243
244 =cut
245
246 sub id {
247 my $self = shift;
248 return $self->{id};
249 }
250
251
252 =head2 attr_names
253
254 Returns array with attribute names from document object.
255
256 my @attrs = $doc->attr_names;
257
258 =cut
259
260 sub attr_names {
261 my $self = shift;
262 return unless ($self->{attrs});
263 #croak "attr_names return array, not scalar" if (! wantarray);
264 return sort keys %{ $self->{attrs} };
265 }
266
267
268 =head2 attr
269
270 Returns value of an attribute.
271
272 my $value = $doc->attr( 'attribute' );
273
274 =cut
275
276 sub attr {
277 my $self = shift;
278 my $name = shift;
279 return unless (defined($name) && $self->{attrs});
280 return $self->{attrs}->{ $name };
281 }
282
283
284 =head2 texts
285
286 Returns array with text sentences.
287
288 my @texts = $doc->texts;
289
290 =cut
291
292 sub texts {
293 my $self = shift;
294 #confess "texts return array, not scalar" if (! wantarray);
295 return @{ $self->{dtexts} } if ($self->{dtexts});
296 }
297
298
299 =head2 cat_texts
300
301 Return whole text as single scalar.
302
303 my $text = $doc->cat_texts;
304
305 =cut
306
307 sub cat_texts {
308 my $self = shift;
309 return join(' ',@{ $self->{dtexts} }) if ($self->{dtexts});
310 }
311
312
313 =head2 dump_draft
314
315 Dump draft data from document object.
316
317 print $doc->dump_draft;
318
319 =cut
320
321 sub dump_draft {
322 my $self = shift;
323 my $draft;
324
325 foreach my $attr_name (sort keys %{ $self->{attrs} }) {
326 next unless defined(my $v = $self->{attrs}->{$attr_name});
327 $draft .= $attr_name . '=' . $v . "\n";
328 }
329
330 if ($self->{kwords}) {
331 $draft .= '%%VECTOR';
332 while (my ($key, $value) = each %{ $self->{kwords} }) {
333 $draft .= "\t$key\t$value";
334 }
335 $draft .= "\n";
336 }
337
338 $draft .= "\n";
339
340 $draft .= join("\n", @{ $self->{dtexts} }) . "\n" if ($self->{dtexts});
341 $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n" if ($self->{htexts});
342
343 return $draft;
344 }
345
346
347 =head2 delete
348
349 Empty document object
350
351 $doc->delete;
352
353 This function is addition to original Ruby API, and since it was included in C wrappers it's here as a
354 convinience. Document objects which go out of scope will be destroyed
355 automatically.
356
357 =cut
358
359 sub delete {
360 my $self = shift;
361
362 foreach my $data (qw/attrs dtexts stexts kwords/) {
363 delete($self->{$data});
364 }
365
366 $self->{id} = -1;
367
368 return 1;
369 }
370
371
372
373 package Search::Estraier::Condition;
374
375 use Carp qw/carp confess croak/;
376
377 use Search::Estraier;
378 our @ISA = qw/Search::Estraier/;
379
380 =head1 Search::Estraier::Condition
381
382 =head2 new
383
384 my $cond = new Search::HyperEstraier::Condition;
385
386 =cut
387
388 sub new {
389 my $class = shift;
390 my $self = {};
391 bless($self, $class);
392
393 $self->{max} = -1;
394 $self->{options} = 0;
395
396 $self ? return $self : return undef;
397 }
398
399
400 =head2 set_phrase
401
402 $cond->set_phrase('search phrase');
403
404 =cut
405
406 sub set_phrase {
407 my $self = shift;
408 $self->{phrase} = $self->_s( shift );
409 }
410
411
412 =head2 add_attr
413
414 $cond->add_attr('@URI STRINC /~dpavlin/');
415
416 =cut
417
418 sub add_attr {
419 my $self = shift;
420 my $attr = shift || return;
421 push @{ $self->{attrs} }, $self->_s( $attr );
422 }
423
424
425 =head2 set_order
426
427 $cond->set_order('@mdate NUMD');
428
429 =cut
430
431 sub set_order {
432 my $self = shift;
433 $self->{order} = shift;
434 }
435
436
437 =head2 set_max
438
439 $cond->set_max(42);
440
441 =cut
442
443 sub set_max {
444 my $self = shift;
445 my $max = shift;
446 croak "set_max needs number, not '$max'" unless ($max =~ m/^\d+$/);
447 $self->{max} = $max;
448 }
449
450
451 =head2 set_options
452
453 $cond->set_options( 'SURE' );
454
455 $cond->set_options( qw/AGITO NOIDF SIMPLE/ );
456
457 Possible options are:
458
459 =over 8
460
461 =item SURE
462
463 check every N-gram
464
465 =item USUAL
466
467 check every second N-gram
468
469 =item FAST
470
471 check every third N-gram
472
473 =item AGITO
474
475 check every fourth N-gram
476
477 =item NOIDF
478
479 don't perform TF-IDF tuning
480
481 =item SIMPLE
482
483 use simplified query phrase
484
485 =back
486
487 Skipping N-grams will speed up search, but reduce accuracy. Every call to C<set_options> will reset previous
488 options;
489
490 This option changed in version C<0.04> of this module. It's backwards compatibile.
491
492 =cut
493
494 my $options = {
495 SURE => 1 << 0,
496 USUAL => 1 << 1,
497 FAST => 1 << 2,
498 AGITO => 1 << 3,
499 NOIDF => 1 << 4,
500 SIMPLE => 1 << 10,
501 };
502
503 sub set_options {
504 my $self = shift;
505 my $opt = 0;
506 foreach my $option (@_) {
507 my $mask;
508 unless ($mask = $options->{$option}) {
509 if ($option eq '1') {
510 next;
511 } else {
512 croak "unknown option $option";
513 }
514 }
515 $opt += $mask;
516 }
517 $self->{options} = $opt;
518 }
519
520
521 =head2 phrase
522
523 Return search phrase.
524
525 print $cond->phrase;
526
527 =cut
528
529 sub phrase {
530 my $self = shift;
531 return $self->{phrase};
532 }
533
534
535 =head2 order
536
537 Return search result order.
538
539 print $cond->order;
540
541 =cut
542
543 sub order {
544 my $self = shift;
545 return $self->{order};
546 }
547
548
549 =head2 attrs
550
551 Return search result attrs.
552
553 my @cond_attrs = $cond->attrs;
554
555 =cut
556
557 sub attrs {
558 my $self = shift;
559 #croak "attrs return array, not scalar" if (! wantarray);
560 return @{ $self->{attrs} } if ($self->{attrs});
561 }
562
563
564 =head2 max
565
566 Return maximum number of results.
567
568 print $cond->max;
569
570 C<-1> is returned for unitialized value, C<0> is unlimited.
571
572 =cut
573
574 sub max {
575 my $self = shift;
576 return $self->{max};
577 }
578
579
580 =head2 options
581
582 Return options for this condition.
583
584 print $cond->options;
585
586 Options are returned in numerical form.
587
588 =cut
589
590 sub options {
591 my $self = shift;
592 return $self->{options};
593 }
594
595
596 package Search::Estraier::ResultDocument;
597
598 use Carp qw/croak/;
599
600 #use Search::Estraier;
601 #our @ISA = qw/Search::Estraier/;
602
603 =head1 Search::Estraier::ResultDocument
604
605 =head2 new
606
607 my $rdoc = new Search::HyperEstraier::ResultDocument(
608 uri => 'http://localhost/document/uri/42',
609 attrs => {
610 foo => 1,
611 bar => 2,
612 },
613 snippet => 'this is a text of snippet'
614 keywords => 'this\tare\tkeywords'
615 );
616
617 =cut
618
619 sub new {
620 my $class = shift;
621 my $self = {@_};
622 bless($self, $class);
623
624 croak "missing uri for ResultDocument" unless defined($self->{uri});
625
626 $self ? return $self : return undef;
627 }
628
629
630 =head2 uri
631
632 Return URI of result document
633
634 print $rdoc->uri;
635
636 =cut
637
638 sub uri {
639 my $self = shift;
640 return $self->{uri};
641 }
642
643
644 =head2 attr_names
645
646 Returns array with attribute names from result document object.
647
648 my @attrs = $rdoc->attr_names;
649
650 =cut
651
652 sub attr_names {
653 my $self = shift;
654 croak "attr_names return array, not scalar" if (! wantarray);
655 return sort keys %{ $self->{attrs} };
656 }
657
658
659 =head2 attr
660
661 Returns value of an attribute.
662
663 my $value = $rdoc->attr( 'attribute' );
664
665 =cut
666
667 sub attr {
668 my $self = shift;
669 my $name = shift || return;
670 return $self->{attrs}->{ $name };
671 }
672
673
674 =head2 snippet
675
676 Return snippet from result document
677
678 print $rdoc->snippet;
679
680 =cut
681
682 sub snippet {
683 my $self = shift;
684 return $self->{snippet};
685 }
686
687
688 =head2 keywords
689
690 Return keywords from result document
691
692 print $rdoc->keywords;
693
694 =cut
695
696 sub keywords {
697 my $self = shift;
698 return $self->{keywords};
699 }
700
701
702 package Search::Estraier::NodeResult;
703
704 use Carp qw/croak/;
705
706 #use Search::Estraier;
707 #our @ISA = qw/Search::Estraier/;
708
709 =head1 Search::Estraier::NodeResult
710
711 =head2 new
712
713 my $res = new Search::HyperEstraier::NodeResult(
714 docs => @array_of_rdocs,
715 hits => %hash_with_hints,
716 );
717
718 =cut
719
720 sub new {
721 my $class = shift;
722 my $self = {@_};
723 bless($self, $class);
724
725 foreach my $f (qw/docs hints/) {
726 croak "missing $f for ResultDocument" unless defined($self->{$f});
727 }
728
729 $self ? return $self : return undef;
730 }
731
732
733 =head2 doc_num
734
735 Return number of documents
736
737 print $res->doc_num;
738
739 This will return real number of documents (limited by C<max>).
740 If you want to get total number of hits, see C<hits>.
741
742 =cut
743
744 sub doc_num {
745 my $self = shift;
746 return $#{$self->{docs}} + 1;
747 }
748
749
750 =head2 get_doc
751
752 Return single document
753
754 my $doc = $res->get_doc( 42 );
755
756 Returns undef if document doesn't exist.
757
758 =cut
759
760 sub get_doc {
761 my $self = shift;
762 my $num = shift;
763 croak "expect number as argument, not '$num'" unless ($num =~ m/^\d+$/);
764 return undef if ($num < 0 || $num > $self->{docs});
765 return $self->{docs}->[$num];
766 }
767
768
769 =head2 hint
770
771 Return specific hint from results.
772
773 print $res->hint( 'VERSION' );
774
775 Possible hints are: C<VERSION>, C<NODE>, C<HIT>, C<HINT#n>, C<DOCNUM>, C<WORDNUM>,
776 C<TIME>, C<LINK#n>, C<VIEW>.
777
778 =cut
779
780 sub hint {
781 my $self = shift;
782 my $key = shift || return;
783 return $self->{hints}->{$key};
784 }
785
786 =head2 hits
787
788 More perlish version of C<hint>. This one returns hash.
789
790 my %hints = $res->hints;
791
792 =cut
793
794 sub hints {
795 my $self = shift;
796 return $self->{hints};
797 }
798
799 =head2 hits
800
801 Syntaxtic sugar for total number of hits for this query
802
803 print $res->hits;
804
805 It's same as
806
807 print $res->hint('HIT');
808
809 but shorter.
810
811 =cut
812
813 sub hits {
814 my $self = shift;
815 return $self->{hints}->{'HIT'} || 0;
816 }
817
818 package Search::Estraier::Node;
819
820 use Carp qw/carp croak confess/;
821 use URI;
822 use MIME::Base64;
823 use IO::Socket::INET;
824 use URI::Escape qw/uri_escape/;
825
826 =head1 Search::Estraier::Node
827
828 =head2 new
829
830 my $node = new Search::HyperEstraier::Node;
831
832 or optionally with C<url> as parametar
833
834 my $node = new Search::HyperEstraier::Node( 'http://localhost:1978/node/test' );
835
836 or in more verbose form
837
838 my $node = new Search::HyperEstraier::Node(
839 url => 'http://localhost:1978/node/test',
840 debug => 1,
841 croak_on_error => 1
842 );
843
844 with following arguments:
845
846 =over 4
847
848 =item url
849
850 URL to node
851
852 =item debug
853
854 dumps a B<lot> of debugging output
855
856 =item croak_on_error
857
858 very helpful during development. It will croak on all errors instead of
859 silently returning C<-1> (which is convention of Hyper Estraier API in other
860 languages).
861
862 =back
863
864 =cut
865
866 sub new {
867 my $class = shift;
868 my $self = {
869 pxport => -1,
870 timeout => 0, # this used to be -1
871 dnum => -1,
872 wnum => -1,
873 size => -1.0,
874 wwidth => 480,
875 hwidth => 96,
876 awidth => 96,
877 status => -1,
878 };
879 bless($self, $class);
880
881 if ($#_ == 0) {
882 $self->{url} = shift;
883 } else {
884 my $args = {@_};
885
886 %$self = ( %$self, @_ );
887
888 warn "## Node debug on\n" if ($self->{debug});
889 }
890
891 $self ? return $self : return undef;
892 }
893
894
895 =head2 set_url
896
897 Specify URL to node server
898
899 $node->set_url('http://localhost:1978');
900
901 =cut
902
903 sub set_url {
904 my $self = shift;
905 $self->{url} = shift;
906 }
907
908
909 =head2 set_proxy
910
911 Specify proxy server to connect to node server
912
913 $node->set_proxy('proxy.example.com', 8080);
914
915 =cut
916
917 sub set_proxy {
918 my $self = shift;
919 my ($host,$port) = @_;
920 croak "proxy port must be number, not '$port'" unless ($port =~ m/^\d+$/);
921 $self->{pxhost} = $host;
922 $self->{pxport} = $port;
923 }
924
925
926 =head2 set_timeout
927
928 Specify timeout of connection in seconds
929
930 $node->set_timeout( 15 );
931
932 =cut
933
934 sub set_timeout {
935 my $self = shift;
936 my $sec = shift;
937 croak "timeout must be number, not '$sec'" unless ($sec =~ m/^\d+$/);
938 $self->{timeout} = $sec;
939 }
940
941
942 =head2 set_auth
943
944 Specify name and password for authentication to node server.
945
946 $node->set_auth('clint','eastwood');
947
948 =cut
949
950 sub set_auth {
951 my $self = shift;
952 my ($login,$passwd) = @_;
953 my $basic_auth = encode_base64( "$login:$passwd" );
954 chomp($basic_auth);
955 $self->{auth} = $basic_auth;
956 }
957
958
959 =head2 status
960
961 Return status code of last request.
962
963 print $node->status;
964
965 C<-1> means connection failure.
966
967 =cut
968
969 sub status {
970 my $self = shift;
971 return $self->{status};
972 }
973
974
975 =head2 put_doc
976
977 Add a document
978
979 $node->put_doc( $document_draft ) or die "can't add document";
980
981 Return true on success or false on failture.
982
983 =cut
984
985 sub put_doc {
986 my $self = shift;
987 my $doc = shift || return;
988 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
989 $self->shuttle_url( $self->{url} . '/put_doc',
990 'text/x-estraier-draft',
991 $doc->dump_draft,
992 undef
993 ) == 200;
994 }
995
996
997 =head2 out_doc
998
999 Remove a document
1000
1001 $node->out_doc( document_id ) or "can't remove document";
1002
1003 Return true on success or false on failture.
1004
1005 =cut
1006
1007 sub out_doc {
1008 my $self = shift;
1009 my $id = shift || return;
1010 return unless ($self->{url});
1011 croak "id must be number, not '$id'" unless ($id =~ m/^\d+$/);
1012 $self->shuttle_url( $self->{url} . '/out_doc',
1013 'application/x-www-form-urlencoded',
1014 "id=$id",
1015 undef
1016 ) == 200;
1017 }
1018
1019
1020 =head2 out_doc_by_uri
1021
1022 Remove a registrated document using it's uri
1023
1024 $node->out_doc_by_uri( 'file:///document/uri/42' ) or "can't remove document";
1025
1026 Return true on success or false on failture.
1027
1028 =cut
1029
1030 sub out_doc_by_uri {
1031 my $self = shift;
1032 my $uri = shift || return;
1033 return unless ($self->{url});
1034 $self->shuttle_url( $self->{url} . '/out_doc',
1035 'application/x-www-form-urlencoded',
1036 "uri=" . uri_escape($uri),
1037 undef
1038 ) == 200;
1039 }
1040
1041
1042 =head2 edit_doc
1043
1044 Edit attributes of a document
1045
1046 $node->edit_doc( $document_draft ) or die "can't edit document";
1047
1048 Return true on success or false on failture.
1049
1050 =cut
1051
1052 sub edit_doc {
1053 my $self = shift;
1054 my $doc = shift || return;
1055 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
1056 $self->shuttle_url( $self->{url} . '/edit_doc',
1057 'text/x-estraier-draft',
1058 $doc->dump_draft,
1059 undef
1060 ) == 200;
1061 }
1062
1063
1064 =head2 get_doc
1065
1066 Retreive document
1067
1068 my $doc = $node->get_doc( document_id ) or die "can't get document";
1069
1070 Return true on success or false on failture.
1071
1072 =cut
1073
1074 sub get_doc {
1075 my $self = shift;
1076 my $id = shift || return;
1077 return $self->_fetch_doc( id => $id );
1078 }
1079
1080
1081 =head2 get_doc_by_uri
1082
1083 Retreive document
1084
1085 my $doc = $node->get_doc_by_uri( 'file:///document/uri/42' ) or die "can't get document";
1086
1087 Return true on success or false on failture.
1088
1089 =cut
1090
1091 sub get_doc_by_uri {
1092 my $self = shift;
1093 my $uri = shift || return;
1094 return $self->_fetch_doc( uri => $uri );
1095 }
1096
1097
1098 =head2 get_doc_attr
1099
1100 Retrieve the value of an atribute from object
1101
1102 my $val = $node->get_doc_attr( document_id, 'attribute_name' ) or
1103 die "can't get document attribute";
1104
1105 =cut
1106
1107 sub get_doc_attr {
1108 my $self = shift;
1109 my ($id,$name) = @_;
1110 return unless ($id && $name);
1111 return $self->_fetch_doc( id => $id, attr => $name );
1112 }
1113
1114
1115 =head2 get_doc_attr_by_uri
1116
1117 Retrieve the value of an atribute from object
1118
1119 my $val = $node->get_doc_attr_by_uri( document_id, 'attribute_name' ) or
1120 die "can't get document attribute";
1121
1122 =cut
1123
1124 sub get_doc_attr_by_uri {
1125 my $self = shift;
1126 my ($uri,$name) = @_;
1127 return unless ($uri && $name);
1128 return $self->_fetch_doc( uri => $uri, attr => $name );
1129 }
1130
1131
1132 =head2 etch_doc
1133
1134 Exctract document keywords
1135
1136 my $keywords = $node->etch_doc( document_id ) or die "can't etch document";
1137
1138 =cut
1139
1140 sub etch_doc {
1141 my $self = shift;
1142 my $id = shift || return;
1143 return $self->_fetch_doc( id => $id, etch => 1 );
1144 }
1145
1146 =head2 etch_doc_by_uri
1147
1148 Retreive document
1149
1150 my $keywords = $node->etch_doc_by_uri( 'file:///document/uri/42' ) or die "can't etch document";
1151
1152 Return true on success or false on failture.
1153
1154 =cut
1155
1156 sub etch_doc_by_uri {
1157 my $self = shift;
1158 my $uri = shift || return;
1159 return $self->_fetch_doc( uri => $uri, etch => 1 );
1160 }
1161
1162
1163 =head2 uri_to_id
1164
1165 Get ID of document specified by URI
1166
1167 my $id = $node->uri_to_id( 'file:///document/uri/42' );
1168
1169 =cut
1170
1171 sub uri_to_id {
1172 my $self = shift;
1173 my $uri = shift || return;
1174 return $self->_fetch_doc( uri => $uri, path => '/uri_to_id', chomp_resbody => 1 );
1175 }
1176
1177
1178 =head2 _fetch_doc
1179
1180 Private function used for implementing of C<get_doc>, C<get_doc_by_uri>,
1181 C<etch_doc>, C<etch_doc_by_uri>.
1182
1183 # this will decode received draft into Search::Estraier::Document object
1184 my $doc = $node->_fetch_doc( id => 42 );
1185 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42' );
1186
1187 # to extract keywords, add etch
1188 my $doc = $node->_fetch_doc( id => 42, etch => 1 );
1189 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', etch => 1 );
1190
1191 # to get document attrubute add attr
1192 my $doc = $node->_fetch_doc( id => 42, attr => '@mdate' );
1193 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', attr => '@mdate' );
1194
1195 # more general form which allows implementation of
1196 # uri_to_id
1197 my $id = $node->_fetch_doc(
1198 uri => 'file:///document/uri/42',
1199 path => '/uri_to_id',
1200 chomp_resbody => 1
1201 );
1202
1203 =cut
1204
1205 sub _fetch_doc {
1206 my $self = shift;
1207 my $a = {@_};
1208 return unless ( ($a->{id} || $a->{uri}) && $self->{url} );
1209
1210 my ($arg, $resbody);
1211
1212 my $path = $a->{path} || '/get_doc';
1213 $path = '/etch_doc' if ($a->{etch});
1214
1215 if ($a->{id}) {
1216 croak "id must be numberm not '$a->{id}'" unless ($a->{id} =~ m/^\d+$/);
1217 $arg = 'id=' . $a->{id};
1218 } elsif ($a->{uri}) {
1219 $arg = 'uri=' . uri_escape($a->{uri});
1220 } else {
1221 confess "unhandled argument. Need id or uri.";
1222 }
1223
1224 if ($a->{attr}) {
1225 $path = '/get_doc_attr';
1226 $arg .= '&attr=' . uri_escape($a->{attr});
1227 $a->{chomp_resbody} = 1;
1228 }
1229
1230 my $rv = $self->shuttle_url( $self->{url} . $path,
1231 'application/x-www-form-urlencoded',
1232 $arg,
1233 \$resbody,
1234 );
1235
1236 return if ($rv != 200);
1237
1238 if ($a->{etch}) {
1239 $self->{kwords} = {};
1240 return +{} unless ($resbody);
1241 foreach my $l (split(/\n/, $resbody)) {
1242 my ($k,$v) = split(/\t/, $l, 2);
1243 $self->{kwords}->{$k} = $v if ($v);
1244 }
1245 return $self->{kwords};
1246 } elsif ($a->{chomp_resbody}) {
1247 return unless (defined($resbody));
1248 chomp($resbody);
1249 return $resbody;
1250 } else {
1251 return new Search::Estraier::Document($resbody);
1252 }
1253 }
1254
1255
1256 =head2 name
1257
1258 my $node_name = $node->name;
1259
1260 =cut
1261
1262 sub name {
1263 my $self = shift;
1264 $self->_set_info unless ($self->{name});
1265 return $self->{name};
1266 }
1267
1268
1269 =head2 label
1270
1271 my $node_label = $node->label;
1272
1273 =cut
1274
1275 sub label {
1276 my $self = shift;
1277 $self->_set_info unless ($self->{label});
1278 return $self->{label};
1279 }
1280
1281
1282 =head2 doc_num
1283
1284 my $documents_in_node = $node->doc_num;
1285
1286 =cut
1287
1288 sub doc_num {
1289 my $self = shift;
1290 $self->_set_info if ($self->{dnum} < 0);
1291 return $self->{dnum};
1292 }
1293
1294
1295 =head2 word_num
1296
1297 my $words_in_node = $node->word_num;
1298
1299 =cut
1300
1301 sub word_num {
1302 my $self = shift;
1303 $self->_set_info if ($self->{wnum} < 0);
1304 return $self->{wnum};
1305 }
1306
1307
1308 =head2 size
1309
1310 my $node_size = $node->size;
1311
1312 =cut
1313
1314 sub size {
1315 my $self = shift;
1316 $self->_set_info if ($self->{size} < 0);
1317 return $self->{size};
1318 }
1319
1320
1321 =head2 search
1322
1323 Search documents which match condition
1324
1325 my $nres = $node->search( $cond, $depth );
1326
1327 C<$cond> is C<Search::Estraier::Condition> object, while <$depth> specifies
1328 depth for meta search.
1329
1330 Function results C<Search::Estraier::NodeResult> object.
1331
1332 =cut
1333
1334 sub search {
1335 my $self = shift;
1336 my ($cond, $depth) = @_;
1337 return unless ($cond && defined($depth) && $self->{url});
1338 croak "cond mush be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1339 croak "depth needs number, not '$depth'" unless ($depth =~ m/^\d+$/);
1340
1341 my $resbody;
1342
1343 my $rv = $self->shuttle_url( $self->{url} . '/search',
1344 'application/x-www-form-urlencoded',
1345 $self->cond_to_query( $cond, $depth ),
1346 \$resbody,
1347 );
1348 return if ($rv != 200);
1349
1350 my (@docs, $hints);
1351
1352 my @lines = split(/\n/, $resbody);
1353 return unless (@lines);
1354
1355 my $border = $lines[0];
1356 my $isend = 0;
1357 my $lnum = 1;
1358
1359 while ( $lnum <= $#lines ) {
1360 my $line = $lines[$lnum];
1361 $lnum++;
1362
1363 #warn "## $line\n";
1364 if ($line && $line =~ m/^\Q$border\E(:END)*$/) {
1365 $isend = $1;
1366 last;
1367 }
1368
1369 if ($line =~ /\t/) {
1370 my ($k,$v) = split(/\t/, $line, 2);
1371 $hints->{$k} = $v;
1372 }
1373 }
1374
1375 my $snum = $lnum;
1376
1377 while( ! $isend && $lnum <= $#lines ) {
1378 my $line = $lines[$lnum];
1379 #warn "# $lnum: $line\n";
1380 $lnum++;
1381
1382 if ($line && $line =~ m/^\Q$border\E/) {
1383 if ($lnum > $snum) {
1384 my $rdattrs;
1385 my $rdvector;
1386 my $rdsnippet;
1387
1388 my $rlnum = $snum;
1389 while ($rlnum < $lnum - 1 ) {
1390 #my $rdline = $self->_s($lines[$rlnum]);
1391 my $rdline = $lines[$rlnum];
1392 $rlnum++;
1393 last unless ($rdline);
1394 if ($rdline =~ /^%/) {
1395 $rdvector = $1 if ($rdline =~ /^%VECTOR\t(.+)$/);
1396 } elsif($rdline =~ /=/) {
1397 $rdattrs->{$1} = $2 if ($rdline =~ /^(.+)=(.+)$/);
1398 } else {
1399 confess "invalid format of response";
1400 }
1401 }
1402 while($rlnum < $lnum - 1) {
1403 my $rdline = $lines[$rlnum];
1404 $rlnum++;
1405 $rdsnippet .= "$rdline\n";
1406 }
1407 #warn Dumper($rdvector, $rdattrs, $rdsnippet);
1408 if (my $rduri = $rdattrs->{'@uri'}) {
1409 push @docs, new Search::Estraier::ResultDocument(
1410 uri => $rduri,
1411 attrs => $rdattrs,
1412 snippet => $rdsnippet,
1413 keywords => $rdvector,
1414 );
1415 }
1416 }
1417 $snum = $lnum;
1418 #warn "### $line\n";
1419 $isend = 1 if ($line =~ /:END$/);
1420 }
1421
1422 }
1423
1424 if (! $isend) {
1425 warn "received result doesn't have :END\n$resbody";
1426 return;
1427 }
1428
1429 #warn Dumper(\@docs, $hints);
1430
1431 return new Search::Estraier::NodeResult( docs => \@docs, hints => $hints );
1432 }
1433
1434
1435 =head2 cond_to_query
1436
1437 Return URI encoded string generated from Search::Estraier::Condition
1438
1439 my $args = $node->cond_to_query( $cond, $depth );
1440
1441 =cut
1442
1443 sub cond_to_query {
1444 my $self = shift;
1445
1446 my $cond = shift || return;
1447 croak "condition must be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1448 my $depth = shift;
1449
1450 my @args;
1451
1452 if (my $phrase = $cond->phrase) {
1453 push @args, 'phrase=' . uri_escape($phrase);
1454 }
1455
1456 if (my @attrs = $cond->attrs) {
1457 for my $i ( 0 .. $#attrs ) {
1458 push @args,'attr' . ($i+1) . '=' . uri_escape( $attrs[$i] ) if ($attrs[$i]);
1459 }
1460 }
1461
1462 if (my $order = $cond->order) {
1463 push @args, 'order=' . uri_escape($order);
1464 }
1465
1466 if (my $max = $cond->max) {
1467 push @args, 'max=' . $max;
1468 } else {
1469 push @args, 'max=' . (1 << 30);
1470 }
1471
1472 if (my $options = $cond->options) {
1473 push @args, 'options=' . $options;
1474 }
1475
1476 push @args, 'depth=' . $depth if ($depth);
1477 push @args, 'wwidth=' . $self->{wwidth};
1478 push @args, 'hwidth=' . $self->{hwidth};
1479 push @args, 'awidth=' . $self->{awidth};
1480
1481 return join('&', @args);
1482 }
1483
1484
1485 =head2 shuttle_url
1486
1487 This is method which uses C<LWP::UserAgent> to communicate with Hyper Estraier node
1488 master.
1489
1490 my $rv = shuttle_url( $url, $content_type, $req_body, \$resbody );
1491
1492 C<$resheads> and C<$resbody> booleans controll if response headers and/or response
1493 body will be saved within object.
1494
1495 =cut
1496
1497 use LWP::UserAgent;
1498
1499 sub shuttle_url {
1500 my $self = shift;
1501
1502 my ($url, $content_type, $reqbody, $resbody) = @_;
1503
1504 $self->{status} = -1;
1505
1506 warn "## $url\n" if ($self->{debug});
1507
1508 $url = new URI($url);
1509 if (
1510 !$url || !$url->scheme || !$url->scheme eq 'http' ||
1511 !$url->host || !$url->port || $url->port < 1
1512 ) {
1513 carp "can't parse $url\n";
1514 return -1;
1515 }
1516
1517 my $ua = LWP::UserAgent->new;
1518 $ua->agent( "Search-Estraier/$Search::Estraier::VERSION" );
1519
1520 my $req;
1521 if ($reqbody) {
1522 $req = HTTP::Request->new(POST => $url);
1523 } else {
1524 $req = HTTP::Request->new(GET => $url);
1525 }
1526
1527 $req->headers->header( 'Host' => $url->host . ":" . $url->port );
1528 $req->headers->header( 'Connection', 'close' );
1529 $req->headers->header( 'Authorization', 'Basic ' . $self->{auth} ) if ($self->{auth});
1530 $req->content_type( $content_type );
1531
1532 warn $req->headers->as_string,"\n" if ($self->{debug});
1533
1534 if ($reqbody) {
1535 warn "$reqbody\n" if ($self->{debug});
1536 $req->content( $reqbody );
1537 }
1538
1539 my $res = $ua->request($req) || croak "can't make request to $url: $!";
1540
1541 warn "## response status: ",$res->status_line,"\n" if ($self->{debug});
1542
1543 ($self->{status}, $self->{status_message}) = split(/\s+/, $res->status_line, 2);
1544
1545 if (! $res->is_success) {
1546 if ($self->{croak_on_error}) {
1547 croak("can't get $url: ",$res->status_line);
1548 } else {
1549 return -1;
1550 }
1551 }
1552
1553 $$resbody .= $res->content;
1554
1555 warn "## response body:\n$$resbody\n" if ($resbody && $self->{debug});
1556
1557 return $self->{status};
1558 }
1559
1560
1561 =head2 set_snippet_width
1562
1563 Set width of snippets in results
1564
1565 $node->set_snippet_width( $wwidth, $hwidth, $awidth );
1566
1567 C<$wwidth> specifies whole width of snippet. It's C<480> by default. If it's C<0> snippet
1568 is not sent with results. If it is negative, whole document text is sent instead of snippet.
1569
1570 C<$hwidth> specified width of strings from beginning of string. Default
1571 value is C<96>. Negative or zero value keep previous value.
1572
1573 C<$awidth> specifies width of strings around each highlighted word. It's C<96> by default.
1574 If negative of zero value is provided previous value is kept unchanged.
1575
1576 =cut
1577
1578 sub set_snippet_width {
1579 my $self = shift;
1580
1581 my ($wwidth, $hwidth, $awidth) = @_;
1582 $self->{wwidth} = $wwidth;
1583 $self->{hwidth} = $hwidth if ($hwidth >= 0);
1584 $self->{awidth} = $awidth if ($awidth >= 0);
1585 }
1586
1587
1588 =head2 set_user
1589
1590 Manage users of node
1591
1592 $node->set_user( 'name', $mode );
1593
1594 C<$mode> can be one of:
1595
1596 =over 4
1597
1598 =item 0
1599
1600 delete account
1601
1602 =item 1
1603
1604 set administrative right for user
1605
1606 =item 2
1607
1608 set user account as guest
1609
1610 =back
1611
1612 Return true on success, otherwise false.
1613
1614 =cut
1615
1616 sub set_user {
1617 my $self = shift;
1618 my ($name, $mode) = @_;
1619
1620 return unless ($self->{url});
1621 croak "mode must be number, not '$mode'" unless ($mode =~ m/^\d+$/);
1622
1623 $self->shuttle_url( $self->{url} . '/_set_user',
1624 'text/plain',
1625 'name=' . uri_escape($name) . '&mode=' . $mode,
1626 undef
1627 ) == 200;
1628 }
1629
1630
1631 =head2 set_link
1632
1633 Manage node links
1634
1635 $node->set_link('http://localhost:1978/node/another', 'another node label', $credit);
1636
1637 If C<$credit> is negative, link is removed.
1638
1639 =cut
1640
1641 sub set_link {
1642 my $self = shift;
1643 my ($url, $label, $credit) = @_;
1644
1645 return unless ($self->{url});
1646 croak "mode credit be number, not '$credit'" unless ($credit =~ m/^\d+$/);
1647
1648 my $reqbody = 'url=' . uri_escape($url) . '&label=' . uri_escape($label);
1649 $reqbody .= '&credit=' . $credit if ($credit > 0);
1650
1651 $self->shuttle_url( $self->{url} . '/_set_link',
1652 'application/x-www-form-urlencoded',
1653 $reqbody,
1654 undef
1655 ) == 200;
1656 }
1657
1658
1659 =head1 PRIVATE METHODS
1660
1661 You could call those directly, but you don't have to. I hope.
1662
1663 =head2 _set_info
1664
1665 Set information for node
1666
1667 $node->_set_info;
1668
1669 =cut
1670
1671 sub _set_info {
1672 my $self = shift;
1673
1674 $self->{status} = -1;
1675 return unless ($self->{url});
1676
1677 my $resbody;
1678 my $rv = $self->shuttle_url( $self->{url} . '/inform',
1679 'text/plain',
1680 undef,
1681 \$resbody,
1682 );
1683
1684 return if ($rv != 200 || !$resbody);
1685
1686 # it seems that response can have multiple line endings
1687 $resbody =~ s/[\r\n]+$//;
1688
1689 ( $self->{name}, $self->{label}, $self->{dnum}, $self->{wnum}, $self->{size} ) =
1690 split(/\t/, $resbody, 5);
1691
1692 }
1693
1694 ###
1695
1696 =head1 EXPORT
1697
1698 Nothing.
1699
1700 =head1 SEE ALSO
1701
1702 L<http://hyperestraier.sourceforge.net/>
1703
1704 Hyper Estraier Ruby interface on which this module is based.
1705
1706 =head1 AUTHOR
1707
1708 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
1709
1710
1711 =head1 COPYRIGHT AND LICENSE
1712
1713 Copyright (C) 2005-2006 by Dobrica Pavlinusic
1714
1715 This library is free software; you can redistribute it and/or modify
1716 it under the GPL v2 or later.
1717
1718 =cut
1719
1720 1;

  ViewVC Help
Powered by ViewVC 1.1.26