/[Search-Estraier]/trunk/Estraier.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/Estraier.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 91 - (show annotations)
Thu Jan 26 01:53:58 2006 UTC (18 years, 2 months ago) by dpavlin
File size: 29535 byte(s)
added hints to return all hints from server
1 package Search::Estraier;
2
3 use 5.008;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.04_1';
8
9 =head1 NAME
10
11 Search::Estraier - pure perl module to use Hyper Estraier search engine
12
13 =head1 SYNOPSIS
14
15 =head2 Simple indexer
16
17 use Search::Estraier;
18
19 # create and configure node
20 my $node = new Search::Estraier::Node;
21 $node->set_url("http://localhost:1978/node/test");
22 $node->set_auth("admin","admin");
23
24 # create document
25 my $doc = new Search::Estraier::Document;
26
27 # add attributes
28 $doc->add_attr('@uri', "http://estraier.gov/example.txt");
29 $doc->add_attr('@title', "Over the Rainbow");
30
31 # add body text to document
32 $doc->add_text("Somewhere over the rainbow. Way up high.");
33 $doc->add_text("There's a land that I heard of once in a lullaby.");
34
35 die "error: ", $node->status,"\n" unless ($node->put_doc($doc));
36
37 =head2 Simple searcher
38
39 use Search::Estraier;
40
41 # create and configure node
42 my $node = new Search::Estraier::Node;
43 $node->set_url("http://localhost:1978/node/test");
44 $node->set_auth("admin","admin");
45
46 # create condition
47 my $cond = new Search::Estraier::Condition;
48
49 # set search phrase
50 $cond->set_phrase("rainbow AND lullaby");
51
52 my $nres = $node->search($cond, 0);
53 if (defined($nres)) {
54 # for each document in results
55 for my $i ( 0 ... $nres->doc_num - 1 ) {
56 # get result document
57 my $rdoc = $nres->get_doc($i);
58 # display attribte
59 print "URI: ", $rdoc->attr('@uri'),"\n";
60 print "Title: ", $rdoc->attr('@title'),"\n";
61 print $rdoc->snippet,"\n";
62 }
63 } else {
64 die "error: ", $node->status,"\n";
65 }
66
67 =head1 DESCRIPTION
68
69 This module is implementation of node API of Hyper Estraier. Since it's
70 perl-only module with dependencies only on standard perl modules, it will
71 run on all platforms on which perl runs. It doesn't require compilation
72 or Hyper Estraier development files on target machine.
73
74 It is implemented as multiple packages which closly resamble Ruby
75 implementation. It also includes methods to manage nodes.
76
77 There are few examples in C<scripts> directory of this distribution.
78
79 =cut
80
81 =head1 Inheritable common methods
82
83 This methods should really move somewhere else.
84
85 =head2 _s
86
87 Remove multiple whitespaces from string, as well as whitespaces at beginning or end
88
89 my $text = $self->_s(" this is a text ");
90 $text = 'this is a text';
91
92 =cut
93
94 sub _s {
95 my $text = $_[1] || return;
96 $text =~ s/\s\s+/ /gs;
97 $text =~ s/^\s+//;
98 $text =~ s/\s+$//;
99 return $text;
100 }
101
102 package Search::Estraier::Document;
103
104 use Carp qw/croak confess/;
105
106 use Search::Estraier;
107 our @ISA = qw/Search::Estraier/;
108
109 =head1 Search::Estraier::Document
110
111 This class implements Document which is collection of attributes
112 (key=value), vectors (also key value) display text and hidden text.
113
114
115 =head2 new
116
117 Create new document, empty or from draft.
118
119 my $doc = new Search::HyperEstraier::Document;
120 my $doc2 = new Search::HyperEstraier::Document( $draft );
121
122 =cut
123
124 sub new {
125 my $class = shift;
126 my $self = {};
127 bless($self, $class);
128
129 $self->{id} = -1;
130
131 my $draft = shift;
132
133 if ($draft) {
134 my $in_text = 0;
135 foreach my $line (split(/\n/, $draft)) {
136
137 if ($in_text) {
138 if ($line =~ /^\t/) {
139 push @{ $self->{htexts} }, substr($line, 1);
140 } else {
141 push @{ $self->{dtexts} }, $line;
142 }
143 next;
144 }
145
146 if ($line =~ m/^%VECTOR\t(.+)$/) {
147 my @fields = split(/\t/, $1);
148 for my $i ( 0 .. ($#fields - 1) ) {
149 $self->{kwords}->{ $fields[ $i ] } = $fields[ $i + 1 ];
150 $i++;
151 }
152 next;
153 } elsif ($line =~ m/^%/) {
154 # What is this? comment?
155 #warn "$line\n";
156 next;
157 } elsif ($line =~ m/^$/) {
158 $in_text = 1;
159 next;
160 } elsif ($line =~ m/^(.+)=(.*)$/) {
161 $self->{attrs}->{ $1 } = $2;
162 next;
163 }
164
165 warn "draft ignored: '$line'\n";
166 }
167 }
168
169 $self ? return $self : return undef;
170 }
171
172
173 =head2 add_attr
174
175 Add an attribute.
176
177 $doc->add_attr( name => 'value' );
178
179 Delete attribute using
180
181 $doc->add_attr( name => undef );
182
183 =cut
184
185 sub add_attr {
186 my $self = shift;
187 my $attrs = {@_};
188
189 while (my ($name, $value) = each %{ $attrs }) {
190 if (! defined($value)) {
191 delete( $self->{attrs}->{ $self->_s($name) } );
192 } else {
193 $self->{attrs}->{ $self->_s($name) } = $self->_s($value);
194 }
195 }
196
197 return 1;
198 }
199
200
201 =head2 add_text
202
203 Add a sentence of text.
204
205 $doc->add_text('this is example text to display');
206
207 =cut
208
209 sub add_text {
210 my $self = shift;
211 my $text = shift;
212 return unless defined($text);
213
214 push @{ $self->{dtexts} }, $self->_s($text);
215 }
216
217
218 =head2 add_hidden_text
219
220 Add a hidden sentence.
221
222 $doc->add_hidden_text('this is example text just for search');
223
224 =cut
225
226 sub add_hidden_text {
227 my $self = shift;
228 my $text = shift;
229 return unless defined($text);
230
231 push @{ $self->{htexts} }, $self->_s($text);
232 }
233
234
235 =head2 id
236
237 Get the ID number of document. If the object has never been registred, C<-1> is returned.
238
239 print $doc->id;
240
241 =cut
242
243 sub id {
244 my $self = shift;
245 return $self->{id};
246 }
247
248
249 =head2 attr_names
250
251 Returns array with attribute names from document object.
252
253 my @attrs = $doc->attr_names;
254
255 =cut
256
257 sub attr_names {
258 my $self = shift;
259 return unless ($self->{attrs});
260 #croak "attr_names return array, not scalar" if (! wantarray);
261 return sort keys %{ $self->{attrs} };
262 }
263
264
265 =head2 attr
266
267 Returns value of an attribute.
268
269 my $value = $doc->attr( 'attribute' );
270
271 =cut
272
273 sub attr {
274 my $self = shift;
275 my $name = shift;
276 return unless (defined($name) && $self->{attrs});
277 return $self->{attrs}->{ $name };
278 }
279
280
281 =head2 texts
282
283 Returns array with text sentences.
284
285 my @texts = $doc->texts;
286
287 =cut
288
289 sub texts {
290 my $self = shift;
291 #confess "texts return array, not scalar" if (! wantarray);
292 return @{ $self->{dtexts} } if ($self->{dtexts});
293 }
294
295
296 =head2 cat_texts
297
298 Return whole text as single scalar.
299
300 my $text = $doc->cat_texts;
301
302 =cut
303
304 sub cat_texts {
305 my $self = shift;
306 return join(' ',@{ $self->{dtexts} }) if ($self->{dtexts});
307 }
308
309
310 =head2 dump_draft
311
312 Dump draft data from document object.
313
314 print $doc->dump_draft;
315
316 =cut
317
318 sub dump_draft {
319 my $self = shift;
320 my $draft;
321
322 foreach my $attr_name (sort keys %{ $self->{attrs} }) {
323 $draft .= $attr_name . '=' . $self->{attrs}->{$attr_name} . "\n";
324 }
325
326 if ($self->{kwords}) {
327 $draft .= '%%VECTOR';
328 while (my ($key, $value) = each %{ $self->{kwords} }) {
329 $draft .= "\t$key\t$value";
330 }
331 $draft .= "\n";
332 }
333
334 $draft .= "\n";
335
336 $draft .= join("\n", @{ $self->{dtexts} }) . "\n" if ($self->{dtexts});
337 $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n" if ($self->{htexts});
338
339 return $draft;
340 }
341
342
343 =head2 delete
344
345 Empty document object
346
347 $doc->delete;
348
349 This function is addition to original Ruby API, and since it was included in C wrappers it's here as a
350 convinience. Document objects which go out of scope will be destroyed
351 automatically.
352
353 =cut
354
355 sub delete {
356 my $self = shift;
357
358 foreach my $data (qw/attrs dtexts stexts kwords/) {
359 delete($self->{$data});
360 }
361
362 $self->{id} = -1;
363
364 return 1;
365 }
366
367
368
369 package Search::Estraier::Condition;
370
371 use Carp qw/confess croak/;
372
373 use Search::Estraier;
374 our @ISA = qw/Search::Estraier/;
375
376 =head1 Search::Estraier::Condition
377
378 =head2 new
379
380 my $cond = new Search::HyperEstraier::Condition;
381
382 =cut
383
384 sub new {
385 my $class = shift;
386 my $self = {};
387 bless($self, $class);
388
389 $self->{max} = -1;
390 $self->{options} = 0;
391
392 $self ? return $self : return undef;
393 }
394
395
396 =head2 set_phrase
397
398 $cond->set_phrase('search phrase');
399
400 =cut
401
402 sub set_phrase {
403 my $self = shift;
404 $self->{phrase} = $self->_s( shift );
405 }
406
407
408 =head2 add_attr
409
410 $cond->add_attr('@URI STRINC /~dpavlin/');
411
412 =cut
413
414 sub add_attr {
415 my $self = shift;
416 my $attr = shift || return;
417 push @{ $self->{attrs} }, $self->_s( $attr );
418 }
419
420
421 =head2 set_order
422
423 $cond->set_order('@mdate NUMD');
424
425 =cut
426
427 sub set_order {
428 my $self = shift;
429 $self->{order} = shift;
430 }
431
432
433 =head2 set_max
434
435 $cond->set_max(42);
436
437 =cut
438
439 sub set_max {
440 my $self = shift;
441 my $max = shift;
442 croak "set_max needs number, not '$max'" unless ($max =~ m/^\d+$/);
443 $self->{max} = $max;
444 }
445
446
447 =head2 set_options
448
449 $cond->set_options( SURE => 1 );
450
451 =cut
452
453 my $options = {
454 # check N-gram keys skipping by three
455 SURE => 1 << 0,
456 # check N-gram keys skipping by two
457 USUAL => 1 << 1,
458 # without TF-IDF tuning
459 FAST => 1 << 2,
460 # with the simplified phrase
461 AGITO => 1 << 3,
462 # check every N-gram key
463 NOIDF => 1 << 4,
464 # check N-gram keys skipping by one
465 SIMPLE => 1 << 10,
466 };
467
468 sub set_options {
469 my $self = shift;
470 my $option = shift;
471 confess "unknown option" unless ($options->{$option});
472 $self->{options} ||= $options->{$option};
473 }
474
475
476 =head2 phrase
477
478 Return search phrase.
479
480 print $cond->phrase;
481
482 =cut
483
484 sub phrase {
485 my $self = shift;
486 return $self->{phrase};
487 }
488
489
490 =head2 order
491
492 Return search result order.
493
494 print $cond->order;
495
496 =cut
497
498 sub order {
499 my $self = shift;
500 return $self->{order};
501 }
502
503
504 =head2 attrs
505
506 Return search result attrs.
507
508 my @cond_attrs = $cond->attrs;
509
510 =cut
511
512 sub attrs {
513 my $self = shift;
514 #croak "attrs return array, not scalar" if (! wantarray);
515 return @{ $self->{attrs} } if ($self->{attrs});
516 }
517
518
519 =head2 max
520
521 Return maximum number of results.
522
523 print $cond->max;
524
525 C<-1> is returned for unitialized value, C<0> is unlimited.
526
527 =cut
528
529 sub max {
530 my $self = shift;
531 return $self->{max};
532 }
533
534
535 =head2 options
536
537 Return options for this condition.
538
539 print $cond->options;
540
541 Options are returned in numerical form.
542
543 =cut
544
545 sub options {
546 my $self = shift;
547 return $self->{options};
548 }
549
550
551 package Search::Estraier::ResultDocument;
552
553 use Carp qw/croak/;
554
555 #use Search::Estraier;
556 #our @ISA = qw/Search::Estraier/;
557
558 =head1 Search::Estraier::ResultDocument
559
560 =head2 new
561
562 my $rdoc = new Search::HyperEstraier::ResultDocument(
563 uri => 'http://localhost/document/uri/42',
564 attrs => {
565 foo => 1,
566 bar => 2,
567 },
568 snippet => 'this is a text of snippet'
569 keywords => 'this\tare\tkeywords'
570 );
571
572 =cut
573
574 sub new {
575 my $class = shift;
576 my $self = {@_};
577 bless($self, $class);
578
579 croak "missing uri for ResultDocument" unless defined($self->{uri});
580
581 $self ? return $self : return undef;
582 }
583
584
585 =head2 uri
586
587 Return URI of result document
588
589 print $rdoc->uri;
590
591 =cut
592
593 sub uri {
594 my $self = shift;
595 return $self->{uri};
596 }
597
598
599 =head2 attr_names
600
601 Returns array with attribute names from result document object.
602
603 my @attrs = $rdoc->attr_names;
604
605 =cut
606
607 sub attr_names {
608 my $self = shift;
609 croak "attr_names return array, not scalar" if (! wantarray);
610 return sort keys %{ $self->{attrs} };
611 }
612
613
614 =head2 attr
615
616 Returns value of an attribute.
617
618 my $value = $rdoc->attr( 'attribute' );
619
620 =cut
621
622 sub attr {
623 my $self = shift;
624 my $name = shift || return;
625 return $self->{attrs}->{ $name };
626 }
627
628
629 =head2 snippet
630
631 Return snippet from result document
632
633 print $rdoc->snippet;
634
635 =cut
636
637 sub snippet {
638 my $self = shift;
639 return $self->{snippet};
640 }
641
642
643 =head2 keywords
644
645 Return keywords from result document
646
647 print $rdoc->keywords;
648
649 =cut
650
651 sub keywords {
652 my $self = shift;
653 return $self->{keywords};
654 }
655
656
657 package Search::Estraier::NodeResult;
658
659 use Carp qw/croak/;
660
661 #use Search::Estraier;
662 #our @ISA = qw/Search::Estraier/;
663
664 =head1 Search::Estraier::NodeResult
665
666 =head2 new
667
668 my $res = new Search::HyperEstraier::NodeResult(
669 docs => @array_of_rdocs,
670 hits => %hash_with_hints,
671 );
672
673 =cut
674
675 sub new {
676 my $class = shift;
677 my $self = {@_};
678 bless($self, $class);
679
680 foreach my $f (qw/docs hints/) {
681 croak "missing $f for ResultDocument" unless defined($self->{$f});
682 }
683
684 $self ? return $self : return undef;
685 }
686
687
688 =head2 doc_num
689
690 Return number of documents
691
692 print $res->doc_num;
693
694 =cut
695
696 sub doc_num {
697 my $self = shift;
698 return $#{$self->{docs}} + 1;
699 }
700
701
702 =head2 get_doc
703
704 Return single document
705
706 my $doc = $res->get_doc( 42 );
707
708 Returns undef if document doesn't exist.
709
710 =cut
711
712 sub get_doc {
713 my $self = shift;
714 my $num = shift;
715 croak "expect number as argument, not '$num'" unless ($num =~ m/^\d+$/);
716 return undef if ($num < 0 || $num > $self->{docs});
717 return $self->{docs}->[$num];
718 }
719
720
721 =head2 hint
722
723 Return specific hint from results.
724
725 print $rec->hint( 'VERSION' );
726
727 Possible hints are: C<VERSION>, C<NODE>, C<HIT>, C<HINT#n>, C<DOCNUM>, C<WORDNUM>,
728 C<TIME>, C<LINK#n>, C<VIEW>.
729
730 =cut
731
732 sub hint {
733 my $self = shift;
734 my $key = shift || return;
735 return $self->{hints}->{$key};
736 }
737
738 =head2 hints
739
740 More perlish version of C<hint>. This one returns hash.
741
742 my %hints = $rec->hints;
743
744 =cut
745
746 sub hints {
747 my $self = shift;
748 return $self->{hints};
749 }
750
751 package Search::Estraier::Node;
752
753 use Carp qw/carp croak confess/;
754 use URI;
755 use MIME::Base64;
756 use IO::Socket::INET;
757 use URI::Escape qw/uri_escape/;
758
759 =head1 Search::Estraier::Node
760
761 =head2 new
762
763 my $node = new Search::HyperEstraier::Node;
764
765 or optionally with C<url> as parametar
766
767 my $node = new Search::HyperEstraier::Node( 'http://localhost:1978/node/test' );
768
769 or in more verbose form
770
771 my $node = new Search::HyperEstraier::Node(
772 url => 'http://localhost:1978/node/test',
773 debug => 1,
774 croak_on_error => 1
775 );
776
777 with following arguments:
778
779 =over 4
780
781 =item url
782
783 URL to node
784
785 =item debug
786
787 dumps a B<lot> of debugging output
788
789 =item croak_on_error
790
791 very helpful during development. It will croak on all errors instead of
792 silently returning C<-1> (which is convention of Hyper Estraier API in other
793 languages).
794
795 =back
796
797 =cut
798
799 sub new {
800 my $class = shift;
801 my $self = {
802 pxport => -1,
803 timeout => 0, # this used to be -1
804 dnum => -1,
805 wnum => -1,
806 size => -1.0,
807 wwidth => 480,
808 hwidth => 96,
809 awidth => 96,
810 status => -1,
811 };
812 bless($self, $class);
813
814 if ($#_ == 0) {
815 $self->{url} = shift;
816 } else {
817 my $args = {@_};
818
819 %$self = ( %$self, @_ );
820
821 warn "## Node debug on\n" if ($self->{debug});
822 }
823
824 $self ? return $self : return undef;
825 }
826
827
828 =head2 set_url
829
830 Specify URL to node server
831
832 $node->set_url('http://localhost:1978');
833
834 =cut
835
836 sub set_url {
837 my $self = shift;
838 $self->{url} = shift;
839 }
840
841
842 =head2 set_proxy
843
844 Specify proxy server to connect to node server
845
846 $node->set_proxy('proxy.example.com', 8080);
847
848 =cut
849
850 sub set_proxy {
851 my $self = shift;
852 my ($host,$port) = @_;
853 croak "proxy port must be number, not '$port'" unless ($port =~ m/^\d+$/);
854 $self->{pxhost} = $host;
855 $self->{pxport} = $port;
856 }
857
858
859 =head2 set_timeout
860
861 Specify timeout of connection in seconds
862
863 $node->set_timeout( 15 );
864
865 =cut
866
867 sub set_timeout {
868 my $self = shift;
869 my $sec = shift;
870 croak "timeout must be number, not '$sec'" unless ($sec =~ m/^\d+$/);
871 $self->{timeout} = $sec;
872 }
873
874
875 =head2 set_auth
876
877 Specify name and password for authentication to node server.
878
879 $node->set_auth('clint','eastwood');
880
881 =cut
882
883 sub set_auth {
884 my $self = shift;
885 my ($login,$passwd) = @_;
886 my $basic_auth = encode_base64( "$login:$passwd" );
887 chomp($basic_auth);
888 $self->{auth} = $basic_auth;
889 }
890
891
892 =head2 status
893
894 Return status code of last request.
895
896 print $node->status;
897
898 C<-1> means connection failure.
899
900 =cut
901
902 sub status {
903 my $self = shift;
904 return $self->{status};
905 }
906
907
908 =head2 put_doc
909
910 Add a document
911
912 $node->put_doc( $document_draft ) or die "can't add document";
913
914 Return true on success or false on failture.
915
916 =cut
917
918 sub put_doc {
919 my $self = shift;
920 my $doc = shift || return;
921 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
922 $self->shuttle_url( $self->{url} . '/put_doc',
923 'text/x-estraier-draft',
924 $doc->dump_draft,
925 undef
926 ) == 200;
927 }
928
929
930 =head2 out_doc
931
932 Remove a document
933
934 $node->out_doc( document_id ) or "can't remove document";
935
936 Return true on success or false on failture.
937
938 =cut
939
940 sub out_doc {
941 my $self = shift;
942 my $id = shift || return;
943 return unless ($self->{url});
944 croak "id must be number, not '$id'" unless ($id =~ m/^\d+$/);
945 $self->shuttle_url( $self->{url} . '/out_doc',
946 'application/x-www-form-urlencoded',
947 "id=$id",
948 undef
949 ) == 200;
950 }
951
952
953 =head2 out_doc_by_uri
954
955 Remove a registrated document using it's uri
956
957 $node->out_doc_by_uri( 'file:///document/uri/42' ) or "can't remove document";
958
959 Return true on success or false on failture.
960
961 =cut
962
963 sub out_doc_by_uri {
964 my $self = shift;
965 my $uri = shift || return;
966 return unless ($self->{url});
967 $self->shuttle_url( $self->{url} . '/out_doc',
968 'application/x-www-form-urlencoded',
969 "uri=" . uri_escape($uri),
970 undef
971 ) == 200;
972 }
973
974
975 =head2 edit_doc
976
977 Edit attributes of a document
978
979 $node->edit_doc( $document_draft ) or die "can't edit document";
980
981 Return true on success or false on failture.
982
983 =cut
984
985 sub edit_doc {
986 my $self = shift;
987 my $doc = shift || return;
988 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
989 $self->shuttle_url( $self->{url} . '/edit_doc',
990 'text/x-estraier-draft',
991 $doc->dump_draft,
992 undef
993 ) == 200;
994 }
995
996
997 =head2 get_doc
998
999 Retreive document
1000
1001 my $doc = $node->get_doc( document_id ) or die "can't get document";
1002
1003 Return true on success or false on failture.
1004
1005 =cut
1006
1007 sub get_doc {
1008 my $self = shift;
1009 my $id = shift || return;
1010 return $self->_fetch_doc( id => $id );
1011 }
1012
1013
1014 =head2 get_doc_by_uri
1015
1016 Retreive document
1017
1018 my $doc = $node->get_doc_by_uri( 'file:///document/uri/42' ) or die "can't get document";
1019
1020 Return true on success or false on failture.
1021
1022 =cut
1023
1024 sub get_doc_by_uri {
1025 my $self = shift;
1026 my $uri = shift || return;
1027 return $self->_fetch_doc( uri => $uri );
1028 }
1029
1030
1031 =head2 get_doc_attr
1032
1033 Retrieve the value of an atribute from object
1034
1035 my $val = $node->get_doc_attr( document_id, 'attribute_name' ) or
1036 die "can't get document attribute";
1037
1038 =cut
1039
1040 sub get_doc_attr {
1041 my $self = shift;
1042 my ($id,$name) = @_;
1043 return unless ($id && $name);
1044 return $self->_fetch_doc( id => $id, attr => $name );
1045 }
1046
1047
1048 =head2 get_doc_attr_by_uri
1049
1050 Retrieve the value of an atribute from object
1051
1052 my $val = $node->get_doc_attr_by_uri( document_id, 'attribute_name' ) or
1053 die "can't get document attribute";
1054
1055 =cut
1056
1057 sub get_doc_attr_by_uri {
1058 my $self = shift;
1059 my ($uri,$name) = @_;
1060 return unless ($uri && $name);
1061 return $self->_fetch_doc( uri => $uri, attr => $name );
1062 }
1063
1064
1065 =head2 etch_doc
1066
1067 Exctract document keywords
1068
1069 my $keywords = $node->etch_doc( document_id ) or die "can't etch document";
1070
1071 =cut
1072
1073 sub etch_doc {
1074 my $self = shift;
1075 my $id = shift || return;
1076 return $self->_fetch_doc( id => $id, etch => 1 );
1077 }
1078
1079 =head2 etch_doc_by_uri
1080
1081 Retreive document
1082
1083 my $keywords = $node->etch_doc_by_uri( 'file:///document/uri/42' ) or die "can't etch document";
1084
1085 Return true on success or false on failture.
1086
1087 =cut
1088
1089 sub etch_doc_by_uri {
1090 my $self = shift;
1091 my $uri = shift || return;
1092 return $self->_fetch_doc( uri => $uri, etch => 1 );
1093 }
1094
1095
1096 =head2 uri_to_id
1097
1098 Get ID of document specified by URI
1099
1100 my $id = $node->uri_to_id( 'file:///document/uri/42' );
1101
1102 =cut
1103
1104 sub uri_to_id {
1105 my $self = shift;
1106 my $uri = shift || return;
1107 return $self->_fetch_doc( uri => $uri, path => '/uri_to_id', chomp_resbody => 1 );
1108 }
1109
1110
1111 =head2 _fetch_doc
1112
1113 Private function used for implementing of C<get_doc>, C<get_doc_by_uri>,
1114 C<etch_doc>, C<etch_doc_by_uri>.
1115
1116 # this will decode received draft into Search::Estraier::Document object
1117 my $doc = $node->_fetch_doc( id => 42 );
1118 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42' );
1119
1120 # to extract keywords, add etch
1121 my $doc = $node->_fetch_doc( id => 42, etch => 1 );
1122 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', etch => 1 );
1123
1124 # to get document attrubute add attr
1125 my $doc = $node->_fetch_doc( id => 42, attr => '@mdate' );
1126 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', attr => '@mdate' );
1127
1128 # more general form which allows implementation of
1129 # uri_to_id
1130 my $id = $node->_fetch_doc(
1131 uri => 'file:///document/uri/42',
1132 path => '/uri_to_id',
1133 chomp_resbody => 1
1134 );
1135
1136 =cut
1137
1138 sub _fetch_doc {
1139 my $self = shift;
1140 my $a = {@_};
1141 return unless ( ($a->{id} || $a->{uri}) && $self->{url} );
1142
1143 my ($arg, $resbody);
1144
1145 my $path = $a->{path} || '/get_doc';
1146 $path = '/etch_doc' if ($a->{etch});
1147
1148 if ($a->{id}) {
1149 croak "id must be numberm not '$a->{id}'" unless ($a->{id} =~ m/^\d+$/);
1150 $arg = 'id=' . $a->{id};
1151 } elsif ($a->{uri}) {
1152 $arg = 'uri=' . uri_escape($a->{uri});
1153 } else {
1154 confess "unhandled argument. Need id or uri.";
1155 }
1156
1157 if ($a->{attr}) {
1158 $path = '/get_doc_attr';
1159 $arg .= '&attr=' . uri_escape($a->{attr});
1160 $a->{chomp_resbody} = 1;
1161 }
1162
1163 my $rv = $self->shuttle_url( $self->{url} . $path,
1164 'application/x-www-form-urlencoded',
1165 $arg,
1166 \$resbody,
1167 );
1168
1169 return if ($rv != 200);
1170
1171 if ($a->{etch}) {
1172 $self->{kwords} = {};
1173 return +{} unless ($resbody);
1174 foreach my $l (split(/\n/, $resbody)) {
1175 my ($k,$v) = split(/\t/, $l, 2);
1176 $self->{kwords}->{$k} = $v if ($v);
1177 }
1178 return $self->{kwords};
1179 } elsif ($a->{chomp_resbody}) {
1180 return unless (defined($resbody));
1181 chomp($resbody);
1182 return $resbody;
1183 } else {
1184 return new Search::Estraier::Document($resbody);
1185 }
1186 }
1187
1188
1189 =head2 name
1190
1191 my $node_name = $node->name;
1192
1193 =cut
1194
1195 sub name {
1196 my $self = shift;
1197 $self->_set_info unless ($self->{name});
1198 return $self->{name};
1199 }
1200
1201
1202 =head2 label
1203
1204 my $node_label = $node->label;
1205
1206 =cut
1207
1208 sub label {
1209 my $self = shift;
1210 $self->_set_info unless ($self->{label});
1211 return $self->{label};
1212 }
1213
1214
1215 =head2 doc_num
1216
1217 my $documents_in_node = $node->doc_num;
1218
1219 =cut
1220
1221 sub doc_num {
1222 my $self = shift;
1223 $self->_set_info if ($self->{dnum} < 0);
1224 return $self->{dnum};
1225 }
1226
1227
1228 =head2 word_num
1229
1230 my $words_in_node = $node->word_num;
1231
1232 =cut
1233
1234 sub word_num {
1235 my $self = shift;
1236 $self->_set_info if ($self->{wnum} < 0);
1237 return $self->{wnum};
1238 }
1239
1240
1241 =head2 size
1242
1243 my $node_size = $node->size;
1244
1245 =cut
1246
1247 sub size {
1248 my $self = shift;
1249 $self->_set_info if ($self->{size} < 0);
1250 return $self->{size};
1251 }
1252
1253
1254 =head2 search
1255
1256 Search documents which match condition
1257
1258 my $nres = $node->search( $cond, $depth );
1259
1260 C<$cond> is C<Search::Estraier::Condition> object, while <$depth> specifies
1261 depth for meta search.
1262
1263 Function results C<Search::Estraier::NodeResult> object.
1264
1265 =cut
1266
1267 sub search {
1268 my $self = shift;
1269 my ($cond, $depth) = @_;
1270 return unless ($cond && defined($depth) && $self->{url});
1271 croak "cond mush be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1272 croak "depth needs number, not '$depth'" unless ($depth =~ m/^\d+$/);
1273
1274 my $resbody;
1275
1276 my $rv = $self->shuttle_url( $self->{url} . '/search',
1277 'application/x-www-form-urlencoded',
1278 $self->cond_to_query( $cond, $depth ),
1279 \$resbody,
1280 );
1281 return if ($rv != 200);
1282
1283 my (@docs, $hints);
1284
1285 my @lines = split(/\n/, $resbody);
1286 return unless (@lines);
1287
1288 my $border = $lines[0];
1289 my $isend = 0;
1290 my $lnum = 1;
1291
1292 while ( $lnum <= $#lines ) {
1293 my $line = $lines[$lnum];
1294 $lnum++;
1295
1296 #warn "## $line\n";
1297 if ($line && $line =~ m/^\Q$border\E(:END)*$/) {
1298 $isend = $1;
1299 last;
1300 }
1301
1302 if ($line =~ /\t/) {
1303 my ($k,$v) = split(/\t/, $line, 2);
1304 $hints->{$k} = $v;
1305 }
1306 }
1307
1308 my $snum = $lnum;
1309
1310 while( ! $isend && $lnum <= $#lines ) {
1311 my $line = $lines[$lnum];
1312 #warn "# $lnum: $line\n";
1313 $lnum++;
1314
1315 if ($line && $line =~ m/^\Q$border\E/) {
1316 if ($lnum > $snum) {
1317 my $rdattrs;
1318 my $rdvector;
1319 my $rdsnippet;
1320
1321 my $rlnum = $snum;
1322 while ($rlnum < $lnum - 1 ) {
1323 #my $rdline = $self->_s($lines[$rlnum]);
1324 my $rdline = $lines[$rlnum];
1325 $rlnum++;
1326 last unless ($rdline);
1327 if ($rdline =~ /^%/) {
1328 $rdvector = $1 if ($rdline =~ /^%VECTOR\t(.+)$/);
1329 } elsif($rdline =~ /=/) {
1330 $rdattrs->{$1} = $2 if ($rdline =~ /^(.+)=(.+)$/);
1331 } else {
1332 confess "invalid format of response";
1333 }
1334 }
1335 while($rlnum < $lnum - 1) {
1336 my $rdline = $lines[$rlnum];
1337 $rlnum++;
1338 $rdsnippet .= "$rdline\n";
1339 }
1340 #warn Dumper($rdvector, $rdattrs, $rdsnippet);
1341 if (my $rduri = $rdattrs->{'@uri'}) {
1342 push @docs, new Search::Estraier::ResultDocument(
1343 uri => $rduri,
1344 attrs => $rdattrs,
1345 snippet => $rdsnippet,
1346 keywords => $rdvector,
1347 );
1348 }
1349 }
1350 $snum = $lnum;
1351 #warn "### $line\n";
1352 $isend = 1 if ($line =~ /:END$/);
1353 }
1354
1355 }
1356
1357 if (! $isend) {
1358 warn "received result doesn't have :END\n$resbody";
1359 return;
1360 }
1361
1362 #warn Dumper(\@docs, $hints);
1363
1364 return new Search::Estraier::NodeResult( docs => \@docs, hints => $hints );
1365 }
1366
1367
1368 =head2 cond_to_query
1369
1370 Return URI encoded string generated from Search::Estraier::Condition
1371
1372 my $args = $node->cond_to_query( $cond, $depth );
1373
1374 =cut
1375
1376 sub cond_to_query {
1377 my $self = shift;
1378
1379 my $cond = shift || return;
1380 croak "condition must be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1381 my $depth = shift;
1382
1383 my @args;
1384
1385 if (my $phrase = $cond->phrase) {
1386 push @args, 'phrase=' . uri_escape($phrase);
1387 }
1388
1389 if (my @attrs = $cond->attrs) {
1390 for my $i ( 0 .. $#attrs ) {
1391 push @args,'attr' . ($i+1) . '=' . uri_escape( $attrs[$i] ) if ($attrs[$i]);
1392 }
1393 }
1394
1395 if (my $order = $cond->order) {
1396 push @args, 'order=' . uri_escape($order);
1397 }
1398
1399 if (my $max = $cond->max) {
1400 push @args, 'max=' . $max;
1401 } else {
1402 push @args, 'max=' . (1 << 30);
1403 }
1404
1405 if (my $options = $cond->options) {
1406 push @args, 'options=' . $options;
1407 }
1408
1409 push @args, 'depth=' . $depth if ($depth);
1410 push @args, 'wwidth=' . $self->{wwidth};
1411 push @args, 'hwidth=' . $self->{hwidth};
1412 push @args, 'awidth=' . $self->{awidth};
1413
1414 return join('&', @args);
1415 }
1416
1417
1418 =head2 shuttle_url
1419
1420 This is method which uses C<LWP::UserAgent> to communicate with Hyper Estraier node
1421 master.
1422
1423 my $rv = shuttle_url( $url, $content_type, $req_body, \$resbody );
1424
1425 C<$resheads> and C<$resbody> booleans controll if response headers and/or response
1426 body will be saved within object.
1427
1428 =cut
1429
1430 use LWP::UserAgent;
1431
1432 sub shuttle_url {
1433 my $self = shift;
1434
1435 my ($url, $content_type, $reqbody, $resbody) = @_;
1436
1437 $self->{status} = -1;
1438
1439 warn "## $url\n" if ($self->{debug});
1440
1441 $url = new URI($url);
1442 if (
1443 !$url || !$url->scheme || !$url->scheme eq 'http' ||
1444 !$url->host || !$url->port || $url->port < 1
1445 ) {
1446 carp "can't parse $url\n";
1447 return -1;
1448 }
1449
1450 my $ua = LWP::UserAgent->new;
1451 $ua->agent( "Search-Estraier/$Search::Estraier::VERSION" );
1452
1453 my $req;
1454 if ($reqbody) {
1455 $req = HTTP::Request->new(POST => $url);
1456 } else {
1457 $req = HTTP::Request->new(GET => $url);
1458 }
1459
1460 $req->headers->header( 'Host' => $url->host . ":" . $url->port );
1461 $req->headers->header( 'Connection', 'close' );
1462 $req->headers->header( 'Authorization', 'Basic ' . $self->{auth} ) if ($self->{auth});
1463 $req->content_type( $content_type );
1464
1465 warn $req->headers->as_string,"\n" if ($self->{debug});
1466
1467 if ($reqbody) {
1468 warn "$reqbody\n" if ($self->{debug});
1469 $req->content( $reqbody );
1470 }
1471
1472 my $res = $ua->request($req) || croak "can't make request to $url: $!";
1473
1474 warn "## response status: ",$res->status_line,"\n" if ($self->{debug});
1475
1476 ($self->{status}, $self->{status_message}) = split(/\s+/, $res->status_line, 2);
1477
1478 if (! $res->is_success) {
1479 if ($self->{croak_on_error}) {
1480 croak("can't get $url: ",$res->status_line);
1481 } else {
1482 return -1;
1483 }
1484 }
1485
1486 $$resbody .= $res->content;
1487
1488 warn "## response body:\n$$resbody\n" if ($resbody && $self->{debug});
1489
1490 return $self->{status};
1491 }
1492
1493
1494 =head2 set_snippet_width
1495
1496 Set width of snippets in results
1497
1498 $node->set_snippet_width( $wwidth, $hwidth, $awidth );
1499
1500 C<$wwidth> specifies whole width of snippet. It's C<480> by default. If it's C<0> snippet
1501 is not sent with results. If it is negative, whole document text is sent instead of snippet.
1502
1503 C<$hwidth> specified width of strings from beginning of string. Default
1504 value is C<96>. Negative or zero value keep previous value.
1505
1506 C<$awidth> specifies width of strings around each highlighted word. It's C<96> by default.
1507 If negative of zero value is provided previous value is kept unchanged.
1508
1509 =cut
1510
1511 sub set_snippet_width {
1512 my $self = shift;
1513
1514 my ($wwidth, $hwidth, $awidth) = @_;
1515 $self->{wwidth} = $wwidth;
1516 $self->{hwidth} = $hwidth if ($hwidth >= 0);
1517 $self->{awidth} = $awidth if ($awidth >= 0);
1518 }
1519
1520
1521 =head2 set_user
1522
1523 Manage users of node
1524
1525 $node->set_user( 'name', $mode );
1526
1527 C<$mode> can be one of:
1528
1529 =over 4
1530
1531 =item 0
1532
1533 delete account
1534
1535 =item 1
1536
1537 set administrative right for user
1538
1539 =item 2
1540
1541 set user account as guest
1542
1543 =back
1544
1545 Return true on success, otherwise false.
1546
1547 =cut
1548
1549 sub set_user {
1550 my $self = shift;
1551 my ($name, $mode) = @_;
1552
1553 return unless ($self->{url});
1554 croak "mode must be number, not '$mode'" unless ($mode =~ m/^\d+$/);
1555
1556 $self->shuttle_url( $self->{url} . '/_set_user',
1557 'text/plain',
1558 'name=' . uri_escape($name) . '&mode=' . $mode,
1559 undef
1560 ) == 200;
1561 }
1562
1563
1564 =head2 set_link
1565
1566 Manage node links
1567
1568 $node->set_link('http://localhost:1978/node/another', 'another node label', $credit);
1569
1570 If C<$credit> is negative, link is removed.
1571
1572 =cut
1573
1574 sub set_link {
1575 my $self = shift;
1576 my ($url, $label, $credit) = @_;
1577
1578 return unless ($self->{url});
1579 croak "mode credit be number, not '$credit'" unless ($credit =~ m/^\d+$/);
1580
1581 my $reqbody = 'url=' . uri_escape($url) . '&label=' . uri_escape($label);
1582 $reqbody .= '&credit=' . $credit if ($credit > 0);
1583
1584 $self->shuttle_url( $self->{url} . '/_set_link',
1585 'application/x-www-form-urlencoded',
1586 $reqbody,
1587 undef
1588 ) == 200;
1589 }
1590
1591
1592 =head1 PRIVATE METHODS
1593
1594 You could call those directly, but you don't have to. I hope.
1595
1596 =head2 _set_info
1597
1598 Set information for node
1599
1600 $node->_set_info;
1601
1602 =cut
1603
1604 sub _set_info {
1605 my $self = shift;
1606
1607 $self->{status} = -1;
1608 return unless ($self->{url});
1609
1610 my $resbody;
1611 my $rv = $self->shuttle_url( $self->{url} . '/inform',
1612 'text/plain',
1613 undef,
1614 \$resbody,
1615 );
1616
1617 return if ($rv != 200 || !$resbody);
1618
1619 # it seems that response can have multiple line endings
1620 $resbody =~ s/[\r\n]+$//;
1621
1622 ( $self->{name}, $self->{label}, $self->{dnum}, $self->{wnum}, $self->{size} ) =
1623 split(/\t/, $resbody, 5);
1624
1625 }
1626
1627 ###
1628
1629 =head1 EXPORT
1630
1631 Nothing.
1632
1633 =head1 SEE ALSO
1634
1635 L<http://hyperestraier.sourceforge.net/>
1636
1637 Hyper Estraier Ruby interface on which this module is based.
1638
1639 =head1 AUTHOR
1640
1641 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
1642
1643
1644 =head1 COPYRIGHT AND LICENSE
1645
1646 Copyright (C) 2005-2006 by Dobrica Pavlinusic
1647
1648 This library is free software; you can redistribute it and/or modify
1649 it under the GPL v2 or later.
1650
1651 =cut
1652
1653 1;

  ViewVC Help
Powered by ViewVC 1.1.26