/[Search-Estraier]/trunk/lib/Search/Estraier.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/Search/Estraier.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 81 - (show annotations)
Tue Jan 17 00:03:45 2006 UTC (18 years, 2 months ago) by dpavlin
Original Path: trunk/Estraier.pm
File size: 29373 byte(s)
allow null (undef in perl) values. Hyper Estraier seems to store them, so we
should also support them.
1 package Search::Estraier;
2
3 use 5.008;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.04_1';
8
9 =head1 NAME
10
11 Search::Estraier - pure perl module to use Hyper Estraier search engine
12
13 =head1 SYNOPSIS
14
15 =head2 Simple indexer
16
17 use Search::Estraier;
18
19 # create and configure node
20 my $node = new Search::Estraier::Node;
21 $node->set_url("http://localhost:1978/node/test");
22 $node->set_auth("admin","admin");
23
24 # create document
25 my $doc = new Search::Estraier::Document;
26
27 # add attributes
28 $doc->add_attr('@uri', "http://estraier.gov/example.txt");
29 $doc->add_attr('@title', "Over the Rainbow");
30
31 # add body text to document
32 $doc->add_text("Somewhere over the rainbow. Way up high.");
33 $doc->add_text("There's a land that I heard of once in a lullaby.");
34
35 die "error: ", $node->status,"\n" unless ($node->put_doc($doc));
36
37 =head2 Simple searcher
38
39 use Search::Estraier;
40
41 # create and configure node
42 my $node = new Search::Estraier::Node;
43 $node->set_url("http://localhost:1978/node/test");
44 $node->set_auth("admin","admin");
45
46 # create condition
47 my $cond = new Search::Estraier::Condition;
48
49 # set search phrase
50 $cond->set_phrase("rainbow AND lullaby");
51
52 my $nres = $node->search($cond, 0);
53 if (defined($nres)) {
54 # for each document in results
55 for my $i ( 0 ... $nres->doc_num - 1 ) {
56 # get result document
57 my $rdoc = $nres->get_doc($i);
58 # display attribte
59 print "URI: ", $rdoc->attr('@uri'),"\n";
60 print "Title: ", $rdoc->attr('@title'),"\n";
61 print $rdoc->snippet,"\n";
62 }
63 } else {
64 die "error: ", $node->status,"\n";
65 }
66
67 =head1 DESCRIPTION
68
69 This module is implementation of node API of Hyper Estraier. Since it's
70 perl-only module with dependencies only on standard perl modules, it will
71 run on all platforms on which perl runs. It doesn't require compilation
72 or Hyper Estraier development files on target machine.
73
74 It is implemented as multiple packages which closly resamble Ruby
75 implementation. It also includes methods to manage nodes.
76
77 There are few examples in C<scripts> directory of this distribution.
78
79 =cut
80
81 =head1 Inheritable common methods
82
83 This methods should really move somewhere else.
84
85 =head2 _s
86
87 Remove multiple whitespaces from string, as well as whitespaces at beginning or end
88
89 my $text = $self->_s(" this is a text ");
90 $text = 'this is a text';
91
92 =cut
93
94 sub _s {
95 my $text = $_[1] || return;
96 $text =~ s/\s\s+/ /gs;
97 $text =~ s/^\s+//;
98 $text =~ s/\s+$//;
99 return $text;
100 }
101
102 package Search::Estraier::Document;
103
104 use Carp qw/croak confess/;
105
106 use Search::Estraier;
107 our @ISA = qw/Search::Estraier/;
108
109 =head1 Search::Estraier::Document
110
111 This class implements Document which is collection of attributes
112 (key=value), vectors (also key value) display text and hidden text.
113
114
115 =head2 new
116
117 Create new document, empty or from draft.
118
119 my $doc = new Search::HyperEstraier::Document;
120 my $doc2 = new Search::HyperEstraier::Document( $draft );
121
122 =cut
123
124 sub new {
125 my $class = shift;
126 my $self = {};
127 bless($self, $class);
128
129 $self->{id} = -1;
130
131 my $draft = shift;
132
133 if ($draft) {
134 my $in_text = 0;
135 foreach my $line (split(/\n/, $draft)) {
136
137 if ($in_text) {
138 if ($line =~ /^\t/) {
139 push @{ $self->{htexts} }, substr($line, 1);
140 } else {
141 push @{ $self->{dtexts} }, $line;
142 }
143 next;
144 }
145
146 if ($line =~ m/^%VECTOR\t(.+)$/) {
147 my @fields = split(/\t/, $1);
148 for my $i ( 0 .. ($#fields - 1) ) {
149 $self->{kwords}->{ $fields[ $i ] } = $fields[ $i + 1 ];
150 $i++;
151 }
152 next;
153 } elsif ($line =~ m/^%/) {
154 # What is this? comment?
155 #warn "$line\n";
156 next;
157 } elsif ($line =~ m/^$/) {
158 $in_text = 1;
159 next;
160 } elsif ($line =~ m/^(.+)=(.*)$/) {
161 $self->{attrs}->{ $1 } = $2;
162 next;
163 }
164
165 warn "draft ignored: '$line'\n";
166 }
167 }
168
169 $self ? return $self : return undef;
170 }
171
172
173 =head2 add_attr
174
175 Add an attribute.
176
177 $doc->add_attr( name => 'value' );
178
179 Delete attribute using
180
181 $doc->add_attr( name => undef );
182
183 =cut
184
185 sub add_attr {
186 my $self = shift;
187 my $attrs = {@_};
188
189 while (my ($name, $value) = each %{ $attrs }) {
190 if (! defined($value)) {
191 delete( $self->{attrs}->{ $self->_s($name) } );
192 } else {
193 $self->{attrs}->{ $self->_s($name) } = $self->_s($value);
194 }
195 }
196
197 return 1;
198 }
199
200
201 =head2 add_text
202
203 Add a sentence of text.
204
205 $doc->add_text('this is example text to display');
206
207 =cut
208
209 sub add_text {
210 my $self = shift;
211 my $text = shift;
212 return unless defined($text);
213
214 push @{ $self->{dtexts} }, $self->_s($text);
215 }
216
217
218 =head2 add_hidden_text
219
220 Add a hidden sentence.
221
222 $doc->add_hidden_text('this is example text just for search');
223
224 =cut
225
226 sub add_hidden_text {
227 my $self = shift;
228 my $text = shift;
229 return unless defined($text);
230
231 push @{ $self->{htexts} }, $self->_s($text);
232 }
233
234
235 =head2 id
236
237 Get the ID number of document. If the object has never been registred, C<-1> is returned.
238
239 print $doc->id;
240
241 =cut
242
243 sub id {
244 my $self = shift;
245 return $self->{id};
246 }
247
248
249 =head2 attr_names
250
251 Returns array with attribute names from document object.
252
253 my @attrs = $doc->attr_names;
254
255 =cut
256
257 sub attr_names {
258 my $self = shift;
259 return unless ($self->{attrs});
260 #croak "attr_names return array, not scalar" if (! wantarray);
261 return sort keys %{ $self->{attrs} };
262 }
263
264
265 =head2 attr
266
267 Returns value of an attribute.
268
269 my $value = $doc->attr( 'attribute' );
270
271 =cut
272
273 sub attr {
274 my $self = shift;
275 my $name = shift;
276 return unless (defined($name) && $self->{attrs});
277 return $self->{attrs}->{ $name };
278 }
279
280
281 =head2 texts
282
283 Returns array with text sentences.
284
285 my @texts = $doc->texts;
286
287 =cut
288
289 sub texts {
290 my $self = shift;
291 #confess "texts return array, not scalar" if (! wantarray);
292 return @{ $self->{dtexts} } if ($self->{dtexts});
293 }
294
295
296 =head2 cat_texts
297
298 Return whole text as single scalar.
299
300 my $text = $doc->cat_texts;
301
302 =cut
303
304 sub cat_texts {
305 my $self = shift;
306 return join(' ',@{ $self->{dtexts} }) if ($self->{dtexts});
307 }
308
309
310 =head2 dump_draft
311
312 Dump draft data from document object.
313
314 print $doc->dump_draft;
315
316 =cut
317
318 sub dump_draft {
319 my $self = shift;
320 my $draft;
321
322 foreach my $attr_name (sort keys %{ $self->{attrs} }) {
323 $draft .= $attr_name . '=' . $self->{attrs}->{$attr_name} . "\n";
324 }
325
326 if ($self->{kwords}) {
327 $draft .= '%%VECTOR';
328 while (my ($key, $value) = each %{ $self->{kwords} }) {
329 $draft .= "\t$key\t$value";
330 }
331 $draft .= "\n";
332 }
333
334 $draft .= "\n";
335
336 $draft .= join("\n", @{ $self->{dtexts} }) . "\n" if ($self->{dtexts});
337 $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n" if ($self->{htexts});
338
339 return $draft;
340 }
341
342
343 =head2 delete
344
345 Empty document object
346
347 $doc->delete;
348
349 This function is addition to original Ruby API, and since it was included in C wrappers it's here as a
350 convinience. Document objects which go out of scope will be destroyed
351 automatically.
352
353 =cut
354
355 sub delete {
356 my $self = shift;
357
358 foreach my $data (qw/attrs dtexts stexts kwords/) {
359 delete($self->{$data});
360 }
361
362 $self->{id} = -1;
363
364 return 1;
365 }
366
367
368
369 package Search::Estraier::Condition;
370
371 use Carp qw/confess croak/;
372
373 use Search::Estraier;
374 our @ISA = qw/Search::Estraier/;
375
376 =head1 Search::Estraier::Condition
377
378 =head2 new
379
380 my $cond = new Search::HyperEstraier::Condition;
381
382 =cut
383
384 sub new {
385 my $class = shift;
386 my $self = {};
387 bless($self, $class);
388
389 $self->{max} = -1;
390 $self->{options} = 0;
391
392 $self ? return $self : return undef;
393 }
394
395
396 =head2 set_phrase
397
398 $cond->set_phrase('search phrase');
399
400 =cut
401
402 sub set_phrase {
403 my $self = shift;
404 $self->{phrase} = $self->_s( shift );
405 }
406
407
408 =head2 add_attr
409
410 $cond->add_attr('@URI STRINC /~dpavlin/');
411
412 =cut
413
414 sub add_attr {
415 my $self = shift;
416 my $attr = shift || return;
417 push @{ $self->{attrs} }, $self->_s( $attr );
418 }
419
420
421 =head2 set_order
422
423 $cond->set_order('@mdate NUMD');
424
425 =cut
426
427 sub set_order {
428 my $self = shift;
429 $self->{order} = shift;
430 }
431
432
433 =head2 set_max
434
435 $cond->set_max(42);
436
437 =cut
438
439 sub set_max {
440 my $self = shift;
441 my $max = shift;
442 croak "set_max needs number, not '$max'" unless ($max =~ m/^\d+$/);
443 $self->{max} = $max;
444 }
445
446
447 =head2 set_options
448
449 $cond->set_options( SURE => 1 );
450
451 =cut
452
453 my $options = {
454 # check N-gram keys skipping by three
455 SURE => 1 << 0,
456 # check N-gram keys skipping by two
457 USUAL => 1 << 1,
458 # without TF-IDF tuning
459 FAST => 1 << 2,
460 # with the simplified phrase
461 AGITO => 1 << 3,
462 # check every N-gram key
463 NOIDF => 1 << 4,
464 # check N-gram keys skipping by one
465 SIMPLE => 1 << 10,
466 };
467
468 sub set_options {
469 my $self = shift;
470 my $option = shift;
471 confess "unknown option" unless ($options->{$option});
472 $self->{options} ||= $options->{$option};
473 }
474
475
476 =head2 phrase
477
478 Return search phrase.
479
480 print $cond->phrase;
481
482 =cut
483
484 sub phrase {
485 my $self = shift;
486 return $self->{phrase};
487 }
488
489
490 =head2 order
491
492 Return search result order.
493
494 print $cond->order;
495
496 =cut
497
498 sub order {
499 my $self = shift;
500 return $self->{order};
501 }
502
503
504 =head2 attrs
505
506 Return search result attrs.
507
508 my @cond_attrs = $cond->attrs;
509
510 =cut
511
512 sub attrs {
513 my $self = shift;
514 #croak "attrs return array, not scalar" if (! wantarray);
515 return @{ $self->{attrs} } if ($self->{attrs});
516 }
517
518
519 =head2 max
520
521 Return maximum number of results.
522
523 print $cond->max;
524
525 C<-1> is returned for unitialized value, C<0> is unlimited.
526
527 =cut
528
529 sub max {
530 my $self = shift;
531 return $self->{max};
532 }
533
534
535 =head2 options
536
537 Return options for this condition.
538
539 print $cond->options;
540
541 Options are returned in numerical form.
542
543 =cut
544
545 sub options {
546 my $self = shift;
547 return $self->{options};
548 }
549
550
551 package Search::Estraier::ResultDocument;
552
553 use Carp qw/croak/;
554
555 #use Search::Estraier;
556 #our @ISA = qw/Search::Estraier/;
557
558 =head1 Search::Estraier::ResultDocument
559
560 =head2 new
561
562 my $rdoc = new Search::HyperEstraier::ResultDocument(
563 uri => 'http://localhost/document/uri/42',
564 attrs => {
565 foo => 1,
566 bar => 2,
567 },
568 snippet => 'this is a text of snippet'
569 keywords => 'this\tare\tkeywords'
570 );
571
572 =cut
573
574 sub new {
575 my $class = shift;
576 my $self = {@_};
577 bless($self, $class);
578
579 croak "missing uri for ResultDocument" unless defined($self->{uri});
580
581 $self ? return $self : return undef;
582 }
583
584
585 =head2 uri
586
587 Return URI of result document
588
589 print $rdoc->uri;
590
591 =cut
592
593 sub uri {
594 my $self = shift;
595 return $self->{uri};
596 }
597
598
599 =head2 attr_names
600
601 Returns array with attribute names from result document object.
602
603 my @attrs = $rdoc->attr_names;
604
605 =cut
606
607 sub attr_names {
608 my $self = shift;
609 croak "attr_names return array, not scalar" if (! wantarray);
610 return sort keys %{ $self->{attrs} };
611 }
612
613
614 =head2 attr
615
616 Returns value of an attribute.
617
618 my $value = $rdoc->attr( 'attribute' );
619
620 =cut
621
622 sub attr {
623 my $self = shift;
624 my $name = shift || return;
625 return $self->{attrs}->{ $name };
626 }
627
628
629 =head2 snippet
630
631 Return snippet from result document
632
633 print $rdoc->snippet;
634
635 =cut
636
637 sub snippet {
638 my $self = shift;
639 return $self->{snippet};
640 }
641
642
643 =head2 keywords
644
645 Return keywords from result document
646
647 print $rdoc->keywords;
648
649 =cut
650
651 sub keywords {
652 my $self = shift;
653 return $self->{keywords};
654 }
655
656
657 package Search::Estraier::NodeResult;
658
659 use Carp qw/croak/;
660
661 #use Search::Estraier;
662 #our @ISA = qw/Search::Estraier/;
663
664 =head1 Search::Estraier::NodeResult
665
666 =head2 new
667
668 my $res = new Search::HyperEstraier::NodeResult(
669 docs => @array_of_rdocs,
670 hits => %hash_with_hints,
671 );
672
673 =cut
674
675 sub new {
676 my $class = shift;
677 my $self = {@_};
678 bless($self, $class);
679
680 foreach my $f (qw/docs hints/) {
681 croak "missing $f for ResultDocument" unless defined($self->{$f});
682 }
683
684 $self ? return $self : return undef;
685 }
686
687
688 =head2 doc_num
689
690 Return number of documents
691
692 print $res->doc_num;
693
694 =cut
695
696 sub doc_num {
697 my $self = shift;
698 return $#{$self->{docs}} + 1;
699 }
700
701
702 =head2 get_doc
703
704 Return single document
705
706 my $doc = $res->get_doc( 42 );
707
708 Returns undef if document doesn't exist.
709
710 =cut
711
712 sub get_doc {
713 my $self = shift;
714 my $num = shift;
715 croak "expect number as argument, not '$num'" unless ($num =~ m/^\d+$/);
716 return undef if ($num < 0 || $num > $self->{docs});
717 return $self->{docs}->[$num];
718 }
719
720
721 =head2 hint
722
723 Return specific hint from results.
724
725 print $rec->hint( 'VERSION' );
726
727 Possible hints are: C<VERSION>, C<NODE>, C<HIT>, C<HINT#n>, C<DOCNUM>, C<WORDNUM>,
728 C<TIME>, C<LINK#n>, C<VIEW>.
729
730 =cut
731
732 sub hint {
733 my $self = shift;
734 my $key = shift || return;
735 return $self->{hints}->{$key};
736 }
737
738
739 package Search::Estraier::Node;
740
741 use Carp qw/carp croak confess/;
742 use URI;
743 use MIME::Base64;
744 use IO::Socket::INET;
745 use URI::Escape qw/uri_escape/;
746
747 =head1 Search::Estraier::Node
748
749 =head2 new
750
751 my $node = new Search::HyperEstraier::Node;
752
753 or optionally with C<url> as parametar
754
755 my $node = new Search::HyperEstraier::Node( 'http://localhost:1978/node/test' );
756
757 or in more verbose form
758
759 my $node = new Search::HyperEstraier::Node(
760 url => 'http://localhost:1978/node/test',
761 debug => 1,
762 croak_on_error => 1
763 );
764
765 with following arguments:
766
767 =over 4
768
769 =item url
770
771 URL to node
772
773 =item debug
774
775 dumps a B<lot> of debugging output
776
777 =item croak_on_error
778
779 very helpful during development. It will croak on all errors instead of
780 silently returning C<-1> (which is convention of Hyper Estraier API in other
781 languages).
782
783 =back
784
785 =cut
786
787 sub new {
788 my $class = shift;
789 my $self = {
790 pxport => -1,
791 timeout => 0, # this used to be -1
792 dnum => -1,
793 wnum => -1,
794 size => -1.0,
795 wwidth => 480,
796 hwidth => 96,
797 awidth => 96,
798 status => -1,
799 };
800 bless($self, $class);
801
802 if ($#_ == 0) {
803 $self->{url} = shift;
804 } else {
805 my $args = {@_};
806
807 %$self = ( %$self, @_ );
808
809 warn "## Node debug on\n" if ($self->{debug});
810 }
811
812 $self ? return $self : return undef;
813 }
814
815
816 =head2 set_url
817
818 Specify URL to node server
819
820 $node->set_url('http://localhost:1978');
821
822 =cut
823
824 sub set_url {
825 my $self = shift;
826 $self->{url} = shift;
827 }
828
829
830 =head2 set_proxy
831
832 Specify proxy server to connect to node server
833
834 $node->set_proxy('proxy.example.com', 8080);
835
836 =cut
837
838 sub set_proxy {
839 my $self = shift;
840 my ($host,$port) = @_;
841 croak "proxy port must be number, not '$port'" unless ($port =~ m/^\d+$/);
842 $self->{pxhost} = $host;
843 $self->{pxport} = $port;
844 }
845
846
847 =head2 set_timeout
848
849 Specify timeout of connection in seconds
850
851 $node->set_timeout( 15 );
852
853 =cut
854
855 sub set_timeout {
856 my $self = shift;
857 my $sec = shift;
858 croak "timeout must be number, not '$sec'" unless ($sec =~ m/^\d+$/);
859 $self->{timeout} = $sec;
860 }
861
862
863 =head2 set_auth
864
865 Specify name and password for authentication to node server.
866
867 $node->set_auth('clint','eastwood');
868
869 =cut
870
871 sub set_auth {
872 my $self = shift;
873 my ($login,$passwd) = @_;
874 my $basic_auth = encode_base64( "$login:$passwd" );
875 chomp($basic_auth);
876 $self->{auth} = $basic_auth;
877 }
878
879
880 =head2 status
881
882 Return status code of last request.
883
884 print $node->status;
885
886 C<-1> means connection failure.
887
888 =cut
889
890 sub status {
891 my $self = shift;
892 return $self->{status};
893 }
894
895
896 =head2 put_doc
897
898 Add a document
899
900 $node->put_doc( $document_draft ) or die "can't add document";
901
902 Return true on success or false on failture.
903
904 =cut
905
906 sub put_doc {
907 my $self = shift;
908 my $doc = shift || return;
909 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
910 $self->shuttle_url( $self->{url} . '/put_doc',
911 'text/x-estraier-draft',
912 $doc->dump_draft,
913 undef
914 ) == 200;
915 }
916
917
918 =head2 out_doc
919
920 Remove a document
921
922 $node->out_doc( document_id ) or "can't remove document";
923
924 Return true on success or false on failture.
925
926 =cut
927
928 sub out_doc {
929 my $self = shift;
930 my $id = shift || return;
931 return unless ($self->{url});
932 croak "id must be number, not '$id'" unless ($id =~ m/^\d+$/);
933 $self->shuttle_url( $self->{url} . '/out_doc',
934 'application/x-www-form-urlencoded',
935 "id=$id",
936 undef
937 ) == 200;
938 }
939
940
941 =head2 out_doc_by_uri
942
943 Remove a registrated document using it's uri
944
945 $node->out_doc_by_uri( 'file:///document/uri/42' ) or "can't remove document";
946
947 Return true on success or false on failture.
948
949 =cut
950
951 sub out_doc_by_uri {
952 my $self = shift;
953 my $uri = shift || return;
954 return unless ($self->{url});
955 $self->shuttle_url( $self->{url} . '/out_doc',
956 'application/x-www-form-urlencoded',
957 "uri=" . uri_escape($uri),
958 undef
959 ) == 200;
960 }
961
962
963 =head2 edit_doc
964
965 Edit attributes of a document
966
967 $node->edit_doc( $document_draft ) or die "can't edit document";
968
969 Return true on success or false on failture.
970
971 =cut
972
973 sub edit_doc {
974 my $self = shift;
975 my $doc = shift || return;
976 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
977 $self->shuttle_url( $self->{url} . '/edit_doc',
978 'text/x-estraier-draft',
979 $doc->dump_draft,
980 undef
981 ) == 200;
982 }
983
984
985 =head2 get_doc
986
987 Retreive document
988
989 my $doc = $node->get_doc( document_id ) or die "can't get document";
990
991 Return true on success or false on failture.
992
993 =cut
994
995 sub get_doc {
996 my $self = shift;
997 my $id = shift || return;
998 return $self->_fetch_doc( id => $id );
999 }
1000
1001
1002 =head2 get_doc_by_uri
1003
1004 Retreive document
1005
1006 my $doc = $node->get_doc_by_uri( 'file:///document/uri/42' ) or die "can't get document";
1007
1008 Return true on success or false on failture.
1009
1010 =cut
1011
1012 sub get_doc_by_uri {
1013 my $self = shift;
1014 my $uri = shift || return;
1015 return $self->_fetch_doc( uri => $uri );
1016 }
1017
1018
1019 =head2 get_doc_attr
1020
1021 Retrieve the value of an atribute from object
1022
1023 my $val = $node->get_doc_attr( document_id, 'attribute_name' ) or
1024 die "can't get document attribute";
1025
1026 =cut
1027
1028 sub get_doc_attr {
1029 my $self = shift;
1030 my ($id,$name) = @_;
1031 return unless ($id && $name);
1032 return $self->_fetch_doc( id => $id, attr => $name );
1033 }
1034
1035
1036 =head2 get_doc_attr_by_uri
1037
1038 Retrieve the value of an atribute from object
1039
1040 my $val = $node->get_doc_attr_by_uri( document_id, 'attribute_name' ) or
1041 die "can't get document attribute";
1042
1043 =cut
1044
1045 sub get_doc_attr_by_uri {
1046 my $self = shift;
1047 my ($uri,$name) = @_;
1048 return unless ($uri && $name);
1049 return $self->_fetch_doc( uri => $uri, attr => $name );
1050 }
1051
1052
1053 =head2 etch_doc
1054
1055 Exctract document keywords
1056
1057 my $keywords = $node->etch_doc( document_id ) or die "can't etch document";
1058
1059 =cut
1060
1061 sub etch_doc {
1062 my $self = shift;
1063 my $id = shift || return;
1064 return $self->_fetch_doc( id => $id, etch => 1 );
1065 }
1066
1067 =head2 etch_doc_by_uri
1068
1069 Retreive document
1070
1071 my $keywords = $node->etch_doc_by_uri( 'file:///document/uri/42' ) or die "can't etch document";
1072
1073 Return true on success or false on failture.
1074
1075 =cut
1076
1077 sub etch_doc_by_uri {
1078 my $self = shift;
1079 my $uri = shift || return;
1080 return $self->_fetch_doc( uri => $uri, etch => 1 );
1081 }
1082
1083
1084 =head2 uri_to_id
1085
1086 Get ID of document specified by URI
1087
1088 my $id = $node->uri_to_id( 'file:///document/uri/42' );
1089
1090 =cut
1091
1092 sub uri_to_id {
1093 my $self = shift;
1094 my $uri = shift || return;
1095 return $self->_fetch_doc( uri => $uri, path => '/uri_to_id', chomp_resbody => 1 );
1096 }
1097
1098
1099 =head2 _fetch_doc
1100
1101 Private function used for implementing of C<get_doc>, C<get_doc_by_uri>,
1102 C<etch_doc>, C<etch_doc_by_uri>.
1103
1104 # this will decode received draft into Search::Estraier::Document object
1105 my $doc = $node->_fetch_doc( id => 42 );
1106 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42' );
1107
1108 # to extract keywords, add etch
1109 my $doc = $node->_fetch_doc( id => 42, etch => 1 );
1110 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', etch => 1 );
1111
1112 # to get document attrubute add attr
1113 my $doc = $node->_fetch_doc( id => 42, attr => '@mdate' );
1114 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', attr => '@mdate' );
1115
1116 # more general form which allows implementation of
1117 # uri_to_id
1118 my $id = $node->_fetch_doc(
1119 uri => 'file:///document/uri/42',
1120 path => '/uri_to_id',
1121 chomp_resbody => 1
1122 );
1123
1124 =cut
1125
1126 sub _fetch_doc {
1127 my $self = shift;
1128 my $a = {@_};
1129 return unless ( ($a->{id} || $a->{uri}) && $self->{url} );
1130
1131 my ($arg, $resbody);
1132
1133 my $path = $a->{path} || '/get_doc';
1134 $path = '/etch_doc' if ($a->{etch});
1135
1136 if ($a->{id}) {
1137 croak "id must be numberm not '$a->{id}'" unless ($a->{id} =~ m/^\d+$/);
1138 $arg = 'id=' . $a->{id};
1139 } elsif ($a->{uri}) {
1140 $arg = 'uri=' . uri_escape($a->{uri});
1141 } else {
1142 confess "unhandled argument. Need id or uri.";
1143 }
1144
1145 if ($a->{attr}) {
1146 $path = '/get_doc_attr';
1147 $arg .= '&attr=' . uri_escape($a->{attr});
1148 $a->{chomp_resbody} = 1;
1149 }
1150
1151 my $rv = $self->shuttle_url( $self->{url} . $path,
1152 'application/x-www-form-urlencoded',
1153 $arg,
1154 \$resbody,
1155 );
1156
1157 return if ($rv != 200);
1158
1159 if ($a->{etch}) {
1160 $self->{kwords} = {};
1161 return +{} unless ($resbody);
1162 foreach my $l (split(/\n/, $resbody)) {
1163 my ($k,$v) = split(/\t/, $l, 2);
1164 $self->{kwords}->{$k} = $v if ($v);
1165 }
1166 return $self->{kwords};
1167 } elsif ($a->{chomp_resbody}) {
1168 return unless (defined($resbody));
1169 chomp($resbody);
1170 return $resbody;
1171 } else {
1172 return new Search::Estraier::Document($resbody);
1173 }
1174 }
1175
1176
1177 =head2 name
1178
1179 my $node_name = $node->name;
1180
1181 =cut
1182
1183 sub name {
1184 my $self = shift;
1185 $self->_set_info unless ($self->{name});
1186 return $self->{name};
1187 }
1188
1189
1190 =head2 label
1191
1192 my $node_label = $node->label;
1193
1194 =cut
1195
1196 sub label {
1197 my $self = shift;
1198 $self->_set_info unless ($self->{label});
1199 return $self->{label};
1200 }
1201
1202
1203 =head2 doc_num
1204
1205 my $documents_in_node = $node->doc_num;
1206
1207 =cut
1208
1209 sub doc_num {
1210 my $self = shift;
1211 $self->_set_info if ($self->{dnum} < 0);
1212 return $self->{dnum};
1213 }
1214
1215
1216 =head2 word_num
1217
1218 my $words_in_node = $node->word_num;
1219
1220 =cut
1221
1222 sub word_num {
1223 my $self = shift;
1224 $self->_set_info if ($self->{wnum} < 0);
1225 return $self->{wnum};
1226 }
1227
1228
1229 =head2 size
1230
1231 my $node_size = $node->size;
1232
1233 =cut
1234
1235 sub size {
1236 my $self = shift;
1237 $self->_set_info if ($self->{size} < 0);
1238 return $self->{size};
1239 }
1240
1241
1242 =head2 search
1243
1244 Search documents which match condition
1245
1246 my $nres = $node->search( $cond, $depth );
1247
1248 C<$cond> is C<Search::Estraier::Condition> object, while <$depth> specifies
1249 depth for meta search.
1250
1251 Function results C<Search::Estraier::NodeResult> object.
1252
1253 =cut
1254
1255 sub search {
1256 my $self = shift;
1257 my ($cond, $depth) = @_;
1258 return unless ($cond && defined($depth) && $self->{url});
1259 croak "cond mush be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1260 croak "depth needs number, not '$depth'" unless ($depth =~ m/^\d+$/);
1261
1262 my $resbody;
1263
1264 my $rv = $self->shuttle_url( $self->{url} . '/search',
1265 'application/x-www-form-urlencoded',
1266 $self->cond_to_query( $cond, $depth ),
1267 \$resbody,
1268 );
1269 return if ($rv != 200);
1270
1271 my (@docs, $hints);
1272
1273 my @lines = split(/\n/, $resbody);
1274 return unless (@lines);
1275
1276 my $border = $lines[0];
1277 my $isend = 0;
1278 my $lnum = 1;
1279
1280 while ( $lnum <= $#lines ) {
1281 my $line = $lines[$lnum];
1282 $lnum++;
1283
1284 #warn "## $line\n";
1285 if ($line && $line =~ m/^\Q$border\E(:END)*$/) {
1286 $isend = $1;
1287 last;
1288 }
1289
1290 if ($line =~ /\t/) {
1291 my ($k,$v) = split(/\t/, $line, 2);
1292 $hints->{$k} = $v;
1293 }
1294 }
1295
1296 my $snum = $lnum;
1297
1298 while( ! $isend && $lnum <= $#lines ) {
1299 my $line = $lines[$lnum];
1300 #warn "# $lnum: $line\n";
1301 $lnum++;
1302
1303 if ($line && $line =~ m/^\Q$border\E/) {
1304 if ($lnum > $snum) {
1305 my $rdattrs;
1306 my $rdvector;
1307 my $rdsnippet;
1308
1309 my $rlnum = $snum;
1310 while ($rlnum < $lnum - 1 ) {
1311 #my $rdline = $self->_s($lines[$rlnum]);
1312 my $rdline = $lines[$rlnum];
1313 $rlnum++;
1314 last unless ($rdline);
1315 if ($rdline =~ /^%/) {
1316 $rdvector = $1 if ($rdline =~ /^%VECTOR\t(.+)$/);
1317 } elsif($rdline =~ /=/) {
1318 $rdattrs->{$1} = $2 if ($rdline =~ /^(.+)=(.+)$/);
1319 } else {
1320 confess "invalid format of response";
1321 }
1322 }
1323 while($rlnum < $lnum - 1) {
1324 my $rdline = $lines[$rlnum];
1325 $rlnum++;
1326 $rdsnippet .= "$rdline\n";
1327 }
1328 #warn Dumper($rdvector, $rdattrs, $rdsnippet);
1329 if (my $rduri = $rdattrs->{'@uri'}) {
1330 push @docs, new Search::Estraier::ResultDocument(
1331 uri => $rduri,
1332 attrs => $rdattrs,
1333 snippet => $rdsnippet,
1334 keywords => $rdvector,
1335 );
1336 }
1337 }
1338 $snum = $lnum;
1339 #warn "### $line\n";
1340 $isend = 1 if ($line =~ /:END$/);
1341 }
1342
1343 }
1344
1345 if (! $isend) {
1346 warn "received result doesn't have :END\n$resbody";
1347 return;
1348 }
1349
1350 #warn Dumper(\@docs, $hints);
1351
1352 return new Search::Estraier::NodeResult( docs => \@docs, hints => $hints );
1353 }
1354
1355
1356 =head2 cond_to_query
1357
1358 Return URI encoded string generated from Search::Estraier::Condition
1359
1360 my $args = $node->cond_to_query( $cond, $depth );
1361
1362 =cut
1363
1364 sub cond_to_query {
1365 my $self = shift;
1366
1367 my $cond = shift || return;
1368 croak "condition must be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1369 my $depth = shift;
1370
1371 my @args;
1372
1373 if (my $phrase = $cond->phrase) {
1374 push @args, 'phrase=' . uri_escape($phrase);
1375 }
1376
1377 if (my @attrs = $cond->attrs) {
1378 for my $i ( 0 .. $#attrs ) {
1379 push @args,'attr' . ($i+1) . '=' . uri_escape( $attrs[$i] ) if ($attrs[$i]);
1380 }
1381 }
1382
1383 if (my $order = $cond->order) {
1384 push @args, 'order=' . uri_escape($order);
1385 }
1386
1387 if (my $max = $cond->max) {
1388 push @args, 'max=' . $max;
1389 } else {
1390 push @args, 'max=' . (1 << 30);
1391 }
1392
1393 if (my $options = $cond->options) {
1394 push @args, 'options=' . $options;
1395 }
1396
1397 push @args, 'depth=' . $depth if ($depth);
1398 push @args, 'wwidth=' . $self->{wwidth};
1399 push @args, 'hwidth=' . $self->{hwidth};
1400 push @args, 'awidth=' . $self->{awidth};
1401
1402 return join('&', @args);
1403 }
1404
1405
1406 =head2 shuttle_url
1407
1408 This is method which uses C<LWP::UserAgent> to communicate with Hyper Estraier node
1409 master.
1410
1411 my $rv = shuttle_url( $url, $content_type, $req_body, \$resbody );
1412
1413 C<$resheads> and C<$resbody> booleans controll if response headers and/or response
1414 body will be saved within object.
1415
1416 =cut
1417
1418 use LWP::UserAgent;
1419
1420 sub shuttle_url {
1421 my $self = shift;
1422
1423 my ($url, $content_type, $reqbody, $resbody) = @_;
1424
1425 $self->{status} = -1;
1426
1427 warn "## $url\n" if ($self->{debug});
1428
1429 $url = new URI($url);
1430 if (
1431 !$url || !$url->scheme || !$url->scheme eq 'http' ||
1432 !$url->host || !$url->port || $url->port < 1
1433 ) {
1434 carp "can't parse $url\n";
1435 return -1;
1436 }
1437
1438 my $ua = LWP::UserAgent->new;
1439 $ua->agent( "Search-Estraier/$Search::Estraier::VERSION" );
1440
1441 my $req;
1442 if ($reqbody) {
1443 $req = HTTP::Request->new(POST => $url);
1444 } else {
1445 $req = HTTP::Request->new(GET => $url);
1446 }
1447
1448 $req->headers->header( 'Host' => $url->host . ":" . $url->port );
1449 $req->headers->header( 'Connection', 'close' );
1450 $req->headers->header( 'Authorization', 'Basic ' . $self->{auth} ) if ($self->{auth});
1451 $req->content_type( $content_type );
1452
1453 warn $req->headers->as_string,"\n" if ($self->{debug});
1454
1455 if ($reqbody) {
1456 warn "$reqbody\n" if ($self->{debug});
1457 $req->content( $reqbody );
1458 }
1459
1460 my $res = $ua->request($req) || croak "can't make request to $url: $!";
1461
1462 warn "## response status: ",$res->status_line,"\n" if ($self->{debug});
1463
1464 ($self->{status}, $self->{status_message}) = split(/\s+/, $res->status_line, 2);
1465
1466 if (! $res->is_success) {
1467 if ($self->{croak_on_error}) {
1468 croak("can't get $url: ",$res->status_line);
1469 } else {
1470 return -1;
1471 }
1472 }
1473
1474 $$resbody .= $res->content;
1475
1476 warn "## response body:\n$$resbody\n" if ($resbody && $self->{debug});
1477
1478 return $self->{status};
1479 }
1480
1481
1482 =head2 set_snippet_width
1483
1484 Set width of snippets in results
1485
1486 $node->set_snippet_width( $wwidth, $hwidth, $awidth );
1487
1488 C<$wwidth> specifies whole width of snippet. It's C<480> by default. If it's C<0> snippet
1489 is not sent with results. If it is negative, whole document text is sent instead of snippet.
1490
1491 C<$hwidth> specified width of strings from beginning of string. Default
1492 value is C<96>. Negative or zero value keep previous value.
1493
1494 C<$awidth> specifies width of strings around each highlighted word. It's C<96> by default.
1495 If negative of zero value is provided previous value is kept unchanged.
1496
1497 =cut
1498
1499 sub set_snippet_width {
1500 my $self = shift;
1501
1502 my ($wwidth, $hwidth, $awidth) = @_;
1503 $self->{wwidth} = $wwidth;
1504 $self->{hwidth} = $hwidth if ($hwidth >= 0);
1505 $self->{awidth} = $awidth if ($awidth >= 0);
1506 }
1507
1508
1509 =head2 set_user
1510
1511 Manage users of node
1512
1513 $node->set_user( 'name', $mode );
1514
1515 C<$mode> can be one of:
1516
1517 =over 4
1518
1519 =item 0
1520
1521 delete account
1522
1523 =item 1
1524
1525 set administrative right for user
1526
1527 =item 2
1528
1529 set user account as guest
1530
1531 =back
1532
1533 Return true on success, otherwise false.
1534
1535 =cut
1536
1537 sub set_user {
1538 my $self = shift;
1539 my ($name, $mode) = @_;
1540
1541 return unless ($self->{url});
1542 croak "mode must be number, not '$mode'" unless ($mode =~ m/^\d+$/);
1543
1544 $self->shuttle_url( $self->{url} . '/_set_user',
1545 'text/plain',
1546 'name=' . uri_escape($name) . '&mode=' . $mode,
1547 undef
1548 ) == 200;
1549 }
1550
1551
1552 =head2 set_link
1553
1554 Manage node links
1555
1556 $node->set_link('http://localhost:1978/node/another', 'another node label', $credit);
1557
1558 If C<$credit> is negative, link is removed.
1559
1560 =cut
1561
1562 sub set_link {
1563 my $self = shift;
1564 my ($url, $label, $credit) = @_;
1565
1566 return unless ($self->{url});
1567 croak "mode credit be number, not '$credit'" unless ($credit =~ m/^\d+$/);
1568
1569 my $reqbody = 'url=' . uri_escape($url) . '&label=' . uri_escape($label);
1570 $reqbody .= '&credit=' . $credit if ($credit > 0);
1571
1572 $self->shuttle_url( $self->{url} . '/_set_link',
1573 'application/x-www-form-urlencoded',
1574 $reqbody,
1575 undef
1576 ) == 200;
1577 }
1578
1579
1580 =head1 PRIVATE METHODS
1581
1582 You could call those directly, but you don't have to. I hope.
1583
1584 =head2 _set_info
1585
1586 Set information for node
1587
1588 $node->_set_info;
1589
1590 =cut
1591
1592 sub _set_info {
1593 my $self = shift;
1594
1595 $self->{status} = -1;
1596 return unless ($self->{url});
1597
1598 my $resbody;
1599 my $rv = $self->shuttle_url( $self->{url} . '/inform',
1600 'text/plain',
1601 undef,
1602 \$resbody,
1603 );
1604
1605 return if ($rv != 200 || !$resbody);
1606
1607 # it seems that response can have multiple line endings
1608 $resbody =~ s/[\r\n]+$//;
1609
1610 ( $self->{name}, $self->{label}, $self->{dnum}, $self->{wnum}, $self->{size} ) =
1611 split(/\t/, $resbody, 5);
1612
1613 }
1614
1615 ###
1616
1617 =head1 EXPORT
1618
1619 Nothing.
1620
1621 =head1 SEE ALSO
1622
1623 L<http://hyperestraier.sourceforge.net/>
1624
1625 Hyper Estraier Ruby interface on which this module is based.
1626
1627 =head1 AUTHOR
1628
1629 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
1630
1631
1632 =head1 COPYRIGHT AND LICENSE
1633
1634 Copyright (C) 2005-2006 by Dobrica Pavlinusic
1635
1636 This library is free software; you can redistribute it and/or modify
1637 it under the GPL v2 or later.
1638
1639 =cut
1640
1641 1;

  ViewVC Help
Powered by ViewVC 1.1.26