/[Search-Estraier]/trunk/Estraier.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/Estraier.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 74 - (show annotations)
Mon Jan 9 15:28:24 2006 UTC (18 years, 2 months ago) by dpavlin
File size: 28797 byte(s)
0.03 final
1 package Search::Estraier;
2
3 use 5.008;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.03';
8
9 =head1 NAME
10
11 Search::Estraier - pure perl module to use Hyper Estraier search engine
12
13 =head1 SYNOPSIS
14
15 =head2 Simple indexer
16
17 use Search::Estraier;
18
19 # create and configure node
20 my $node = new Search::Estraier::Node;
21 $node->set_url("http://localhost:1978/node/test");
22 $node->set_auth("admin","admin");
23
24 # create document
25 my $doc = new Search::Estraier::Document;
26
27 # add attributes
28 $doc->add_attr('@uri', "http://estraier.gov/example.txt");
29 $doc->add_attr('@title', "Over the Rainbow");
30
31 # add body text to document
32 $doc->add_text("Somewhere over the rainbow. Way up high.");
33 $doc->add_text("There's a land that I heard of once in a lullaby.");
34
35 die "error: ", $node->status,"\n" unless ($node->put_doc($doc));
36
37 =head2 Simple searcher
38
39 use Search::Estraier;
40
41 # create and configure node
42 my $node = new Search::Estraier::Node;
43 $node->set_url("http://localhost:1978/node/test");
44 $node->set_auth("admin","admin");
45
46 # create condition
47 my $cond = new Search::Estraier::Condition;
48
49 # set search phrase
50 $cond->set_phrase("rainbow AND lullaby");
51
52 my $nres = $node->search($cond, 0);
53 if (defined($nres)) {
54 # for each document in results
55 for my $i ( 0 ... $nres->doc_num - 1 ) {
56 # get result document
57 my $rdoc = $nres->get_doc($i);
58 # display attribte
59 print "URI: ", $rdoc->attr('@uri'),"\n";
60 print "Title: ", $rdoc->attr('@title'),"\n";
61 print $rdoc->snippet,"\n";
62 }
63 } else {
64 die "error: ", $node->status,"\n";
65 }
66
67 =head1 DESCRIPTION
68
69 This module is implementation of node API of Hyper Estraier. Since it's
70 perl-only module with dependencies only on standard perl modules, it will
71 run on all platforms on which perl runs. It doesn't require compilation
72 or Hyper Estraier development files on target machine.
73
74 It is implemented as multiple packages which closly resamble Ruby
75 implementation. It also includes methods to manage nodes.
76
77 There are few examples in C<scripts> directory of this distribution.
78
79 =cut
80
81 =head1 Inheritable common methods
82
83 This methods should really move somewhere else.
84
85 =head2 _s
86
87 Remove multiple whitespaces from string, as well as whitespaces at beginning or end
88
89 my $text = $self->_s(" this is a text ");
90 $text = 'this is a text';
91
92 =cut
93
94 sub _s {
95 my $text = $_[1] || return;
96 $text =~ s/\s\s+/ /gs;
97 $text =~ s/^\s+//;
98 $text =~ s/\s+$//;
99 return $text;
100 }
101
102 package Search::Estraier::Document;
103
104 use Carp qw/croak confess/;
105
106 use Search::Estraier;
107 our @ISA = qw/Search::Estraier/;
108
109 =head1 Search::Estraier::Document
110
111 This class implements Document which is collection of attributes
112 (key=value), vectors (also key value) display text and hidden text.
113
114
115 =head2 new
116
117 Create new document, empty or from draft.
118
119 my $doc = new Search::HyperEstraier::Document;
120 my $doc2 = new Search::HyperEstraier::Document( $draft );
121
122 =cut
123
124 sub new {
125 my $class = shift;
126 my $self = {};
127 bless($self, $class);
128
129 $self->{id} = -1;
130
131 my $draft = shift;
132
133 if ($draft) {
134 my $in_text = 0;
135 foreach my $line (split(/\n/, $draft)) {
136
137 if ($in_text) {
138 if ($line =~ /^\t/) {
139 push @{ $self->{htexts} }, substr($line, 1);
140 } else {
141 push @{ $self->{dtexts} }, $line;
142 }
143 next;
144 }
145
146 if ($line =~ m/^%VECTOR\t(.+)$/) {
147 my @fields = split(/\t/, $1);
148 for my $i ( 0 .. ($#fields - 1) ) {
149 $self->{kwords}->{ $fields[ $i ] } = $fields[ $i + 1 ];
150 $i++;
151 }
152 next;
153 } elsif ($line =~ m/^%/) {
154 # What is this? comment?
155 #warn "$line\n";
156 next;
157 } elsif ($line =~ m/^$/) {
158 $in_text = 1;
159 next;
160 } elsif ($line =~ m/^(.+)=(.+)$/) {
161 $self->{attrs}->{ $1 } = $2;
162 next;
163 }
164
165 warn "draft ignored: $line\n";
166 }
167 }
168
169 $self ? return $self : return undef;
170 }
171
172
173 =head2 add_attr
174
175 Add an attribute.
176
177 $doc->add_attr( name => 'value' );
178
179 Delete attribute using
180
181 $doc->add_attr( name => undef );
182
183 =cut
184
185 sub add_attr {
186 my $self = shift;
187 my $attrs = {@_};
188
189 while (my ($name, $value) = each %{ $attrs }) {
190 if (! defined($value)) {
191 delete( $self->{attrs}->{ $self->_s($name) } );
192 } else {
193 $self->{attrs}->{ $self->_s($name) } = $self->_s($value);
194 }
195 }
196
197 return 1;
198 }
199
200
201 =head2 add_text
202
203 Add a sentence of text.
204
205 $doc->add_text('this is example text to display');
206
207 =cut
208
209 sub add_text {
210 my $self = shift;
211 my $text = shift;
212 return unless defined($text);
213
214 push @{ $self->{dtexts} }, $self->_s($text);
215 }
216
217
218 =head2 add_hidden_text
219
220 Add a hidden sentence.
221
222 $doc->add_hidden_text('this is example text just for search');
223
224 =cut
225
226 sub add_hidden_text {
227 my $self = shift;
228 my $text = shift;
229 return unless defined($text);
230
231 push @{ $self->{htexts} }, $self->_s($text);
232 }
233
234
235 =head2 id
236
237 Get the ID number of document. If the object has never been registred, C<-1> is returned.
238
239 print $doc->id;
240
241 =cut
242
243 sub id {
244 my $self = shift;
245 return $self->{id};
246 }
247
248
249 =head2 attr_names
250
251 Returns array with attribute names from document object.
252
253 my @attrs = $doc->attr_names;
254
255 =cut
256
257 sub attr_names {
258 my $self = shift;
259 return unless ($self->{attrs});
260 #croak "attr_names return array, not scalar" if (! wantarray);
261 return sort keys %{ $self->{attrs} };
262 }
263
264
265 =head2 attr
266
267 Returns value of an attribute.
268
269 my $value = $doc->attr( 'attribute' );
270
271 =cut
272
273 sub attr {
274 my $self = shift;
275 my $name = shift;
276 return unless (defined($name) && $self->{attrs});
277 return $self->{attrs}->{ $name };
278 }
279
280
281 =head2 texts
282
283 Returns array with text sentences.
284
285 my @texts = $doc->texts;
286
287 =cut
288
289 sub texts {
290 my $self = shift;
291 #confess "texts return array, not scalar" if (! wantarray);
292 return @{ $self->{dtexts} } if ($self->{dtexts});
293 }
294
295
296 =head2 cat_texts
297
298 Return whole text as single scalar.
299
300 my $text = $doc->cat_texts;
301
302 =cut
303
304 sub cat_texts {
305 my $self = shift;
306 return join(' ',@{ $self->{dtexts} }) if ($self->{dtexts});
307 }
308
309
310 =head2 dump_draft
311
312 Dump draft data from document object.
313
314 print $doc->dump_draft;
315
316 =cut
317
318 sub dump_draft {
319 my $self = shift;
320 my $draft;
321
322 foreach my $attr_name (sort keys %{ $self->{attrs} }) {
323 $draft .= $attr_name . '=' . $self->{attrs}->{$attr_name} . "\n";
324 }
325
326 if ($self->{kwords}) {
327 $draft .= '%%VECTOR';
328 while (my ($key, $value) = each %{ $self->{kwords} }) {
329 $draft .= "\t$key\t$value";
330 }
331 $draft .= "\n";
332 }
333
334 $draft .= "\n";
335
336 $draft .= join("\n", @{ $self->{dtexts} }) . "\n" if ($self->{dtexts});
337 $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n" if ($self->{htexts});
338
339 return $draft;
340 }
341
342
343 =head2 delete
344
345 Empty document object
346
347 $doc->delete;
348
349 This function is addition to original Ruby API, and since it was included in C wrappers it's here as a
350 convinience. Document objects which go out of scope will be destroyed
351 automatically.
352
353 =cut
354
355 sub delete {
356 my $self = shift;
357
358 foreach my $data (qw/attrs dtexts stexts kwords/) {
359 delete($self->{$data});
360 }
361
362 $self->{id} = -1;
363
364 return 1;
365 }
366
367
368
369 package Search::Estraier::Condition;
370
371 use Carp qw/confess croak/;
372
373 use Search::Estraier;
374 our @ISA = qw/Search::Estraier/;
375
376 =head1 Search::Estraier::Condition
377
378 =head2 new
379
380 my $cond = new Search::HyperEstraier::Condition;
381
382 =cut
383
384 sub new {
385 my $class = shift;
386 my $self = {};
387 bless($self, $class);
388
389 $self->{max} = -1;
390 $self->{options} = 0;
391
392 $self ? return $self : return undef;
393 }
394
395
396 =head2 set_phrase
397
398 $cond->set_phrase('search phrase');
399
400 =cut
401
402 sub set_phrase {
403 my $self = shift;
404 $self->{phrase} = $self->_s( shift );
405 }
406
407
408 =head2 add_attr
409
410 $cond->add_attr('@URI STRINC /~dpavlin/');
411
412 =cut
413
414 sub add_attr {
415 my $self = shift;
416 my $attr = shift || return;
417 push @{ $self->{attrs} }, $self->_s( $attr );
418 }
419
420
421 =head2 set_order
422
423 $cond->set_order('@mdate NUMD');
424
425 =cut
426
427 sub set_order {
428 my $self = shift;
429 $self->{order} = shift;
430 }
431
432
433 =head2 set_max
434
435 $cond->set_max(42);
436
437 =cut
438
439 sub set_max {
440 my $self = shift;
441 my $max = shift;
442 croak "set_max needs number, not '$max'" unless ($max =~ m/^\d+$/);
443 $self->{max} = $max;
444 }
445
446
447 =head2 set_options
448
449 $cond->set_options( SURE => 1 );
450
451 =cut
452
453 my $options = {
454 # check N-gram keys skipping by three
455 SURE => 1 << 0,
456 # check N-gram keys skipping by two
457 USUAL => 1 << 1,
458 # without TF-IDF tuning
459 FAST => 1 << 2,
460 # with the simplified phrase
461 AGITO => 1 << 3,
462 # check every N-gram key
463 NOIDF => 1 << 4,
464 # check N-gram keys skipping by one
465 SIMPLE => 1 << 10,
466 };
467
468 sub set_options {
469 my $self = shift;
470 my $option = shift;
471 confess "unknown option" unless ($options->{$option});
472 $self->{options} ||= $options->{$option};
473 }
474
475
476 =head2 phrase
477
478 Return search phrase.
479
480 print $cond->phrase;
481
482 =cut
483
484 sub phrase {
485 my $self = shift;
486 return $self->{phrase};
487 }
488
489
490 =head2 order
491
492 Return search result order.
493
494 print $cond->order;
495
496 =cut
497
498 sub order {
499 my $self = shift;
500 return $self->{order};
501 }
502
503
504 =head2 attrs
505
506 Return search result attrs.
507
508 my @cond_attrs = $cond->attrs;
509
510 =cut
511
512 sub attrs {
513 my $self = shift;
514 #croak "attrs return array, not scalar" if (! wantarray);
515 return @{ $self->{attrs} } if ($self->{attrs});
516 }
517
518
519 =head2 max
520
521 Return maximum number of results.
522
523 print $cond->max;
524
525 C<-1> is returned for unitialized value, C<0> is unlimited.
526
527 =cut
528
529 sub max {
530 my $self = shift;
531 return $self->{max};
532 }
533
534
535 =head2 options
536
537 Return options for this condition.
538
539 print $cond->options;
540
541 Options are returned in numerical form.
542
543 =cut
544
545 sub options {
546 my $self = shift;
547 return $self->{options};
548 }
549
550
551 package Search::Estraier::ResultDocument;
552
553 use Carp qw/croak/;
554
555 #use Search::Estraier;
556 #our @ISA = qw/Search::Estraier/;
557
558 =head1 Search::Estraier::ResultDocument
559
560 =head2 new
561
562 my $rdoc = new Search::HyperEstraier::ResultDocument(
563 uri => 'http://localhost/document/uri/42',
564 attrs => {
565 foo => 1,
566 bar => 2,
567 },
568 snippet => 'this is a text of snippet'
569 keywords => 'this\tare\tkeywords'
570 );
571
572 =cut
573
574 sub new {
575 my $class = shift;
576 my $self = {@_};
577 bless($self, $class);
578
579 croak "missing uri for ResultDocument" unless defined($self->{uri});
580
581 $self ? return $self : return undef;
582 }
583
584
585 =head2 uri
586
587 Return URI of result document
588
589 print $rdoc->uri;
590
591 =cut
592
593 sub uri {
594 my $self = shift;
595 return $self->{uri};
596 }
597
598
599 =head2 attr_names
600
601 Returns array with attribute names from result document object.
602
603 my @attrs = $rdoc->attr_names;
604
605 =cut
606
607 sub attr_names {
608 my $self = shift;
609 croak "attr_names return array, not scalar" if (! wantarray);
610 return sort keys %{ $self->{attrs} };
611 }
612
613
614 =head2 attr
615
616 Returns value of an attribute.
617
618 my $value = $rdoc->attr( 'attribute' );
619
620 =cut
621
622 sub attr {
623 my $self = shift;
624 my $name = shift || return;
625 return $self->{attrs}->{ $name };
626 }
627
628
629 =head2 snippet
630
631 Return snippet from result document
632
633 print $rdoc->snippet;
634
635 =cut
636
637 sub snippet {
638 my $self = shift;
639 return $self->{snippet};
640 }
641
642
643 =head2 keywords
644
645 Return keywords from result document
646
647 print $rdoc->keywords;
648
649 =cut
650
651 sub keywords {
652 my $self = shift;
653 return $self->{keywords};
654 }
655
656
657 package Search::Estraier::NodeResult;
658
659 use Carp qw/croak/;
660
661 #use Search::Estraier;
662 #our @ISA = qw/Search::Estraier/;
663
664 =head1 Search::Estraier::NodeResult
665
666 =head2 new
667
668 my $res = new Search::HyperEstraier::NodeResult(
669 docs => @array_of_rdocs,
670 hits => %hash_with_hints,
671 );
672
673 =cut
674
675 sub new {
676 my $class = shift;
677 my $self = {@_};
678 bless($self, $class);
679
680 foreach my $f (qw/docs hints/) {
681 croak "missing $f for ResultDocument" unless defined($self->{$f});
682 }
683
684 $self ? return $self : return undef;
685 }
686
687
688 =head2 doc_num
689
690 Return number of documents
691
692 print $res->doc_num;
693
694 =cut
695
696 sub doc_num {
697 my $self = shift;
698 return $#{$self->{docs}} + 1;
699 }
700
701
702 =head2 get_doc
703
704 Return single document
705
706 my $doc = $res->get_doc( 42 );
707
708 Returns undef if document doesn't exist.
709
710 =cut
711
712 sub get_doc {
713 my $self = shift;
714 my $num = shift;
715 croak "expect number as argument, not '$num'" unless ($num =~ m/^\d+$/);
716 return undef if ($num < 0 || $num > $self->{docs});
717 return $self->{docs}->[$num];
718 }
719
720
721 =head2 hint
722
723 Return specific hint from results.
724
725 print $rec->hint( 'VERSION' );
726
727 Possible hints are: C<VERSION>, C<NODE>, C<HIT>, C<HINT#n>, C<DOCNUM>, C<WORDNUM>,
728 C<TIME>, C<LINK#n>, C<VIEW>.
729
730 =cut
731
732 sub hint {
733 my $self = shift;
734 my $key = shift || return;
735 return $self->{hints}->{$key};
736 }
737
738
739 package Search::Estraier::Node;
740
741 use Carp qw/carp croak confess/;
742 use URI;
743 use MIME::Base64;
744 use IO::Socket::INET;
745 use URI::Escape qw/uri_escape/;
746
747 =head1 Search::Estraier::Node
748
749 =head2 new
750
751 my $node = new Search::HyperEstraier::Node;
752
753 or optionally with C<url> as parametar
754
755 my $node = new Search::HyperEstraier::Node( 'http://localhost:1978/node/test' );
756
757 =cut
758
759 sub new {
760 my $class = shift;
761 my $self = {
762 pxport => -1,
763 timeout => 0, # this used to be -1
764 dnum => -1,
765 wnum => -1,
766 size => -1.0,
767 wwidth => 480,
768 hwidth => 96,
769 awidth => 96,
770 status => -1,
771 };
772 bless($self, $class);
773
774 if ($#_ == 0) {
775 $self->{url} = shift;
776 } else {
777 my $args = {@_};
778
779 $self->{debug} = $args->{debug};
780 warn "## Node debug on\n" if ($self->{debug});
781 }
782
783 $self ? return $self : return undef;
784 }
785
786
787 =head2 set_url
788
789 Specify URL to node server
790
791 $node->set_url('http://localhost:1978');
792
793 =cut
794
795 sub set_url {
796 my $self = shift;
797 $self->{url} = shift;
798 }
799
800
801 =head2 set_proxy
802
803 Specify proxy server to connect to node server
804
805 $node->set_proxy('proxy.example.com', 8080);
806
807 =cut
808
809 sub set_proxy {
810 my $self = shift;
811 my ($host,$port) = @_;
812 croak "proxy port must be number, not '$port'" unless ($port =~ m/^\d+$/);
813 $self->{pxhost} = $host;
814 $self->{pxport} = $port;
815 }
816
817
818 =head2 set_timeout
819
820 Specify timeout of connection in seconds
821
822 $node->set_timeout( 15 );
823
824 =cut
825
826 sub set_timeout {
827 my $self = shift;
828 my $sec = shift;
829 croak "timeout must be number, not '$sec'" unless ($sec =~ m/^\d+$/);
830 $self->{timeout} = $sec;
831 }
832
833
834 =head2 set_auth
835
836 Specify name and password for authentication to node server.
837
838 $node->set_auth('clint','eastwood');
839
840 =cut
841
842 sub set_auth {
843 my $self = shift;
844 my ($login,$passwd) = @_;
845 my $basic_auth = encode_base64( "$login:$passwd" );
846 chomp($basic_auth);
847 $self->{auth} = $basic_auth;
848 }
849
850
851 =head2 status
852
853 Return status code of last request.
854
855 print $node->status;
856
857 C<-1> means connection failure.
858
859 =cut
860
861 sub status {
862 my $self = shift;
863 return $self->{status};
864 }
865
866
867 =head2 put_doc
868
869 Add a document
870
871 $node->put_doc( $document_draft ) or die "can't add document";
872
873 Return true on success or false on failture.
874
875 =cut
876
877 sub put_doc {
878 my $self = shift;
879 my $doc = shift || return;
880 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
881 $self->shuttle_url( $self->{url} . '/put_doc',
882 'text/x-estraier-draft',
883 $doc->dump_draft,
884 undef
885 ) == 200;
886 }
887
888
889 =head2 out_doc
890
891 Remove a document
892
893 $node->out_doc( document_id ) or "can't remove document";
894
895 Return true on success or false on failture.
896
897 =cut
898
899 sub out_doc {
900 my $self = shift;
901 my $id = shift || return;
902 return unless ($self->{url});
903 croak "id must be number, not '$id'" unless ($id =~ m/^\d+$/);
904 $self->shuttle_url( $self->{url} . '/out_doc',
905 'application/x-www-form-urlencoded',
906 "id=$id",
907 undef
908 ) == 200;
909 }
910
911
912 =head2 out_doc_by_uri
913
914 Remove a registrated document using it's uri
915
916 $node->out_doc_by_uri( 'file:///document/uri/42' ) or "can't remove document";
917
918 Return true on success or false on failture.
919
920 =cut
921
922 sub out_doc_by_uri {
923 my $self = shift;
924 my $uri = shift || return;
925 return unless ($self->{url});
926 $self->shuttle_url( $self->{url} . '/out_doc',
927 'application/x-www-form-urlencoded',
928 "uri=" . uri_escape($uri),
929 undef
930 ) == 200;
931 }
932
933
934 =head2 edit_doc
935
936 Edit attributes of a document
937
938 $node->edit_doc( $document_draft ) or die "can't edit document";
939
940 Return true on success or false on failture.
941
942 =cut
943
944 sub edit_doc {
945 my $self = shift;
946 my $doc = shift || return;
947 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
948 $self->shuttle_url( $self->{url} . '/edit_doc',
949 'text/x-estraier-draft',
950 $doc->dump_draft,
951 undef
952 ) == 200;
953 }
954
955
956 =head2 get_doc
957
958 Retreive document
959
960 my $doc = $node->get_doc( document_id ) or die "can't get document";
961
962 Return true on success or false on failture.
963
964 =cut
965
966 sub get_doc {
967 my $self = shift;
968 my $id = shift || return;
969 return $self->_fetch_doc( id => $id );
970 }
971
972
973 =head2 get_doc_by_uri
974
975 Retreive document
976
977 my $doc = $node->get_doc_by_uri( 'file:///document/uri/42' ) or die "can't get document";
978
979 Return true on success or false on failture.
980
981 =cut
982
983 sub get_doc_by_uri {
984 my $self = shift;
985 my $uri = shift || return;
986 return $self->_fetch_doc( uri => $uri );
987 }
988
989
990 =head2 get_doc_attr
991
992 Retrieve the value of an atribute from object
993
994 my $val = $node->get_doc_attr( document_id, 'attribute_name' ) or
995 die "can't get document attribute";
996
997 =cut
998
999 sub get_doc_attr {
1000 my $self = shift;
1001 my ($id,$name) = @_;
1002 return unless ($id && $name);
1003 return $self->_fetch_doc( id => $id, attr => $name );
1004 }
1005
1006
1007 =head2 get_doc_attr_by_uri
1008
1009 Retrieve the value of an atribute from object
1010
1011 my $val = $node->get_doc_attr_by_uri( document_id, 'attribute_name' ) or
1012 die "can't get document attribute";
1013
1014 =cut
1015
1016 sub get_doc_attr_by_uri {
1017 my $self = shift;
1018 my ($uri,$name) = @_;
1019 return unless ($uri && $name);
1020 return $self->_fetch_doc( uri => $uri, attr => $name );
1021 }
1022
1023
1024 =head2 etch_doc
1025
1026 Exctract document keywords
1027
1028 my $keywords = $node->etch_doc( document_id ) or die "can't etch document";
1029
1030 =cut
1031
1032 sub etch_doc {
1033 my $self = shift;
1034 my $id = shift || return;
1035 return $self->_fetch_doc( id => $id, etch => 1 );
1036 }
1037
1038 =head2 etch_doc_by_uri
1039
1040 Retreive document
1041
1042 my $keywords = $node->etch_doc_by_uri( 'file:///document/uri/42' ) or die "can't etch document";
1043
1044 Return true on success or false on failture.
1045
1046 =cut
1047
1048 sub etch_doc_by_uri {
1049 my $self = shift;
1050 my $uri = shift || return;
1051 return $self->_fetch_doc( uri => $uri, etch => 1 );
1052 }
1053
1054
1055 =head2 uri_to_id
1056
1057 Get ID of document specified by URI
1058
1059 my $id = $node->uri_to_id( 'file:///document/uri/42' );
1060
1061 =cut
1062
1063 sub uri_to_id {
1064 my $self = shift;
1065 my $uri = shift || return;
1066 return $self->_fetch_doc( uri => $uri, path => '/uri_to_id', chomp_resbody => 1 );
1067 }
1068
1069
1070 =head2 _fetch_doc
1071
1072 Private function used for implementing of C<get_doc>, C<get_doc_by_uri>,
1073 C<etch_doc>, C<etch_doc_by_uri>.
1074
1075 # this will decode received draft into Search::Estraier::Document object
1076 my $doc = $node->_fetch_doc( id => 42 );
1077 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42' );
1078
1079 # to extract keywords, add etch
1080 my $doc = $node->_fetch_doc( id => 42, etch => 1 );
1081 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', etch => 1 );
1082
1083 # to get document attrubute add attr
1084 my $doc = $node->_fetch_doc( id => 42, attr => '@mdate' );
1085 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', attr => '@mdate' );
1086
1087 # more general form which allows implementation of
1088 # uri_to_id
1089 my $id = $node->_fetch_doc(
1090 uri => 'file:///document/uri/42',
1091 path => '/uri_to_id',
1092 chomp_resbody => 1
1093 );
1094
1095 =cut
1096
1097 sub _fetch_doc {
1098 my $self = shift;
1099 my $a = {@_};
1100 return unless ( ($a->{id} || $a->{uri}) && $self->{url} );
1101
1102 my ($arg, $resbody);
1103
1104 my $path = $a->{path} || '/get_doc';
1105 $path = '/etch_doc' if ($a->{etch});
1106
1107 if ($a->{id}) {
1108 croak "id must be numberm not '$a->{id}'" unless ($a->{id} =~ m/^\d+$/);
1109 $arg = 'id=' . $a->{id};
1110 } elsif ($a->{uri}) {
1111 $arg = 'uri=' . uri_escape($a->{uri});
1112 } else {
1113 confess "unhandled argument. Need id or uri.";
1114 }
1115
1116 if ($a->{attr}) {
1117 $path = '/get_doc_attr';
1118 $arg .= '&attr=' . uri_escape($a->{attr});
1119 $a->{chomp_resbody} = 1;
1120 }
1121
1122 my $rv = $self->shuttle_url( $self->{url} . $path,
1123 'application/x-www-form-urlencoded',
1124 $arg,
1125 \$resbody,
1126 );
1127
1128 return if ($rv != 200);
1129
1130 if ($a->{etch}) {
1131 $self->{kwords} = {};
1132 return +{} unless ($resbody);
1133 foreach my $l (split(/\n/, $resbody)) {
1134 my ($k,$v) = split(/\t/, $l, 2);
1135 $self->{kwords}->{$k} = $v if ($v);
1136 }
1137 return $self->{kwords};
1138 } elsif ($a->{chomp_resbody}) {
1139 return unless (defined($resbody));
1140 chomp($resbody);
1141 return $resbody;
1142 } else {
1143 return new Search::Estraier::Document($resbody);
1144 }
1145 }
1146
1147
1148 =head2 name
1149
1150 my $node_name = $node->name;
1151
1152 =cut
1153
1154 sub name {
1155 my $self = shift;
1156 $self->_set_info unless ($self->{name});
1157 return $self->{name};
1158 }
1159
1160
1161 =head2 label
1162
1163 my $node_label = $node->label;
1164
1165 =cut
1166
1167 sub label {
1168 my $self = shift;
1169 $self->_set_info unless ($self->{label});
1170 return $self->{label};
1171 }
1172
1173
1174 =head2 doc_num
1175
1176 my $documents_in_node = $node->doc_num;
1177
1178 =cut
1179
1180 sub doc_num {
1181 my $self = shift;
1182 $self->_set_info if ($self->{dnum} < 0);
1183 return $self->{dnum};
1184 }
1185
1186
1187 =head2 word_num
1188
1189 my $words_in_node = $node->word_num;
1190
1191 =cut
1192
1193 sub word_num {
1194 my $self = shift;
1195 $self->_set_info if ($self->{wnum} < 0);
1196 return $self->{wnum};
1197 }
1198
1199
1200 =head2 size
1201
1202 my $node_size = $node->size;
1203
1204 =cut
1205
1206 sub size {
1207 my $self = shift;
1208 $self->_set_info if ($self->{size} < 0);
1209 return $self->{size};
1210 }
1211
1212
1213 =head2 search
1214
1215 Search documents which match condition
1216
1217 my $nres = $node->search( $cond, $depth );
1218
1219 C<$cond> is C<Search::Estraier::Condition> object, while <$depth> specifies
1220 depth for meta search.
1221
1222 Function results C<Search::Estraier::NodeResult> object.
1223
1224 =cut
1225
1226 sub search {
1227 my $self = shift;
1228 my ($cond, $depth) = @_;
1229 return unless ($cond && defined($depth) && $self->{url});
1230 croak "cond mush be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1231 croak "depth needs number, not '$depth'" unless ($depth =~ m/^\d+$/);
1232
1233 my $resbody;
1234
1235 my $rv = $self->shuttle_url( $self->{url} . '/search',
1236 'application/x-www-form-urlencoded',
1237 $self->cond_to_query( $cond, $depth ),
1238 \$resbody,
1239 );
1240 return if ($rv != 200);
1241
1242 my (@docs, $hints);
1243
1244 my @lines = split(/\n/, $resbody);
1245 return unless (@lines);
1246
1247 my $border = $lines[0];
1248 my $isend = 0;
1249 my $lnum = 1;
1250
1251 while ( $lnum <= $#lines ) {
1252 my $line = $lines[$lnum];
1253 $lnum++;
1254
1255 #warn "## $line\n";
1256 if ($line && $line =~ m/^\Q$border\E(:END)*$/) {
1257 $isend = $1;
1258 last;
1259 }
1260
1261 if ($line =~ /\t/) {
1262 my ($k,$v) = split(/\t/, $line, 2);
1263 $hints->{$k} = $v;
1264 }
1265 }
1266
1267 my $snum = $lnum;
1268
1269 while( ! $isend && $lnum <= $#lines ) {
1270 my $line = $lines[$lnum];
1271 #warn "# $lnum: $line\n";
1272 $lnum++;
1273
1274 if ($line && $line =~ m/^\Q$border\E/) {
1275 if ($lnum > $snum) {
1276 my $rdattrs;
1277 my $rdvector;
1278 my $rdsnippet;
1279
1280 my $rlnum = $snum;
1281 while ($rlnum < $lnum - 1 ) {
1282 #my $rdline = $self->_s($lines[$rlnum]);
1283 my $rdline = $lines[$rlnum];
1284 $rlnum++;
1285 last unless ($rdline);
1286 if ($rdline =~ /^%/) {
1287 $rdvector = $1 if ($rdline =~ /^%VECTOR\t(.+)$/);
1288 } elsif($rdline =~ /=/) {
1289 $rdattrs->{$1} = $2 if ($rdline =~ /^(.+)=(.+)$/);
1290 } else {
1291 confess "invalid format of response";
1292 }
1293 }
1294 while($rlnum < $lnum - 1) {
1295 my $rdline = $lines[$rlnum];
1296 $rlnum++;
1297 $rdsnippet .= "$rdline\n";
1298 }
1299 #warn Dumper($rdvector, $rdattrs, $rdsnippet);
1300 if (my $rduri = $rdattrs->{'@uri'}) {
1301 push @docs, new Search::Estraier::ResultDocument(
1302 uri => $rduri,
1303 attrs => $rdattrs,
1304 snippet => $rdsnippet,
1305 keywords => $rdvector,
1306 );
1307 }
1308 }
1309 $snum = $lnum;
1310 #warn "### $line\n";
1311 $isend = 1 if ($line =~ /:END$/);
1312 }
1313
1314 }
1315
1316 if (! $isend) {
1317 warn "received result doesn't have :END\n$resbody";
1318 return;
1319 }
1320
1321 #warn Dumper(\@docs, $hints);
1322
1323 return new Search::Estraier::NodeResult( docs => \@docs, hints => $hints );
1324 }
1325
1326
1327 =head2 cond_to_query
1328
1329 Return URI encoded string generated from Search::Estraier::Condition
1330
1331 my $args = $node->cond_to_query( $cond, $depth );
1332
1333 =cut
1334
1335 sub cond_to_query {
1336 my $self = shift;
1337
1338 my $cond = shift || return;
1339 croak "condition must be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1340 my $depth = shift;
1341
1342 my @args;
1343
1344 if (my $phrase = $cond->phrase) {
1345 push @args, 'phrase=' . uri_escape($phrase);
1346 }
1347
1348 if (my @attrs = $cond->attrs) {
1349 for my $i ( 0 .. $#attrs ) {
1350 push @args,'attr' . ($i+1) . '=' . uri_escape( $attrs[$i] ) if ($attrs[$i]);
1351 }
1352 }
1353
1354 if (my $order = $cond->order) {
1355 push @args, 'order=' . uri_escape($order);
1356 }
1357
1358 if (my $max = $cond->max) {
1359 push @args, 'max=' . $max;
1360 } else {
1361 push @args, 'max=' . (1 << 30);
1362 }
1363
1364 if (my $options = $cond->options) {
1365 push @args, 'options=' . $options;
1366 }
1367
1368 push @args, 'depth=' . $depth if ($depth);
1369 push @args, 'wwidth=' . $self->{wwidth};
1370 push @args, 'hwidth=' . $self->{hwidth};
1371 push @args, 'awidth=' . $self->{awidth};
1372
1373 return join('&', @args);
1374 }
1375
1376
1377 =head2 shuttle_url
1378
1379 This is method which uses C<LWP::UserAgent> to communicate with Hyper Estraier node
1380 master.
1381
1382 my $rv = shuttle_url( $url, $content_type, $req_body, \$resbody );
1383
1384 C<$resheads> and C<$resbody> booleans controll if response headers and/or response
1385 body will be saved within object.
1386
1387 =cut
1388
1389 use LWP::UserAgent;
1390
1391 sub shuttle_url {
1392 my $self = shift;
1393
1394 my ($url, $content_type, $reqbody, $resbody) = @_;
1395
1396 $self->{status} = -1;
1397
1398 warn "## $url\n" if ($self->{debug});
1399
1400 $url = new URI($url);
1401 if (
1402 !$url || !$url->scheme || !$url->scheme eq 'http' ||
1403 !$url->host || !$url->port || $url->port < 1
1404 ) {
1405 carp "can't parse $url\n";
1406 return -1;
1407 }
1408
1409 my $ua = LWP::UserAgent->new;
1410 $ua->agent( "Search-Estraier/$Search::Estraier::VERSION" );
1411
1412 my $req;
1413 if ($reqbody) {
1414 $req = HTTP::Request->new(POST => $url);
1415 } else {
1416 $req = HTTP::Request->new(GET => $url);
1417 }
1418
1419 $req->headers->header( 'Host' => $url->host . ":" . $url->port );
1420 $req->headers->header( 'Connection', 'close' );
1421 $req->headers->header( 'Authorization', 'Basic ' . $self->{auth} );
1422 $req->content_type( $content_type );
1423
1424 warn $req->headers->as_string,"\n" if ($self->{debug});
1425
1426 if ($reqbody) {
1427 warn "$reqbody\n" if ($self->{debug});
1428 $req->content( $reqbody );
1429 }
1430
1431 my $res = $ua->request($req) || croak "can't make request to $url: $!";
1432
1433 warn "## response status: ",$res->status_line,"\n" if ($self->{debug});
1434
1435 return -1 if (! $res->is_success);
1436
1437 ($self->{status}, $self->{status_message}) = split(/\s+/, $res->status_line, 2);
1438
1439 $$resbody .= $res->content;
1440
1441 warn "## response body:\n$$resbody\n" if ($resbody && $self->{debug});
1442
1443 return $self->{status};
1444 }
1445
1446
1447 =head2 set_snippet_width
1448
1449 Set width of snippets in results
1450
1451 $node->set_snippet_width( $wwidth, $hwidth, $awidth );
1452
1453 C<$wwidth> specifies whole width of snippet. It's C<480> by default. If it's C<0> snippet
1454 is not sent with results. If it is negative, whole document text is sent instead of snippet.
1455
1456 C<$hwidth> specified width of strings from beginning of string. Default
1457 value is C<96>. Negative or zero value keep previous value.
1458
1459 C<$awidth> specifies width of strings around each highlighted word. It's C<96> by default.
1460 If negative of zero value is provided previous value is kept unchanged.
1461
1462 =cut
1463
1464 sub set_snippet_width {
1465 my $self = shift;
1466
1467 my ($wwidth, $hwidth, $awidth) = @_;
1468 $self->{wwidth} = $wwidth;
1469 $self->{hwidth} = $hwidth if ($hwidth >= 0);
1470 $self->{awidth} = $awidth if ($awidth >= 0);
1471 }
1472
1473
1474 =head2 set_user
1475
1476 Manage users of node
1477
1478 $node->set_user( 'name', $mode );
1479
1480 C<$mode> can be one of:
1481
1482 =over 4
1483
1484 =item 0
1485
1486 delete account
1487
1488 =item 1
1489
1490 set administrative right for user
1491
1492 =item 2
1493
1494 set user account as guest
1495
1496 =back
1497
1498 Return true on success, otherwise false.
1499
1500 =cut
1501
1502 sub set_user {
1503 my $self = shift;
1504 my ($name, $mode) = @_;
1505
1506 return unless ($self->{url});
1507 croak "mode must be number, not '$mode'" unless ($mode =~ m/^\d+$/);
1508
1509 $self->shuttle_url( $self->{url} . '/_set_user',
1510 'text/plain',
1511 'name=' . uri_escape($name) . '&mode=' . $mode,
1512 undef
1513 ) == 200;
1514 }
1515
1516
1517 =head2 set_link
1518
1519 Manage node links
1520
1521 $node->set_link('http://localhost:1978/node/another', 'another node label', $credit);
1522
1523 If C<$credit> is negative, link is removed.
1524
1525 =cut
1526
1527 sub set_link {
1528 my $self = shift;
1529 my ($url, $label, $credit) = @_;
1530
1531 return unless ($self->{url});
1532 croak "mode credit be number, not '$credit'" unless ($credit =~ m/^\d+$/);
1533
1534 my $reqbody = 'url=' . uri_escape($url) . '&label=' . uri_escape($label);
1535 $reqbody .= '&credit=' . $credit if ($credit > 0);
1536
1537 $self->shuttle_url( $self->{url} . '/_set_link',
1538 'application/x-www-form-urlencoded',
1539 $reqbody,
1540 undef
1541 ) == 200;
1542 }
1543
1544
1545 =head1 PRIVATE METHODS
1546
1547 You could call those directly, but you don't have to. I hope.
1548
1549 =head2 _set_info
1550
1551 Set information for node
1552
1553 $node->_set_info;
1554
1555 =cut
1556
1557 sub _set_info {
1558 my $self = shift;
1559
1560 $self->{status} = -1;
1561 return unless ($self->{url});
1562
1563 my $resbody;
1564 my $rv = $self->shuttle_url( $self->{url} . '/inform',
1565 'text/plain',
1566 undef,
1567 \$resbody,
1568 );
1569
1570 return if ($rv != 200 || !$resbody);
1571
1572 # it seems that response can have multiple line endings
1573 $resbody =~ s/[\r\n]+$//;
1574
1575 ( $self->{name}, $self->{label}, $self->{dnum}, $self->{wnum}, $self->{size} ) =
1576 split(/\t/, $resbody, 5);
1577
1578 }
1579
1580 ###
1581
1582 =head1 EXPORT
1583
1584 Nothing.
1585
1586 =head1 SEE ALSO
1587
1588 L<http://hyperestraier.sourceforge.net/>
1589
1590 Hyper Estraier Ruby interface on which this module is based.
1591
1592 =head1 AUTHOR
1593
1594 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
1595
1596
1597 =head1 COPYRIGHT AND LICENSE
1598
1599 Copyright (C) 2005-2006 by Dobrica Pavlinusic
1600
1601 This library is free software; you can redistribute it and/or modify
1602 it under the GPL v2 or later.
1603
1604 =cut
1605
1606 1;

  ViewVC Help
Powered by ViewVC 1.1.26