/[Search-Estraier]/trunk/lib/Search/Estraier.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/Search/Estraier.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 97 - (show annotations)
Sat Jan 28 18:19:47 2006 UTC (18 years, 2 months ago) by dpavlin
Original Path: trunk/Estraier.pm
File size: 29591 byte(s)
another fix for empty values
1 package Search::Estraier;
2
3 use 5.008;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.04_1';
8
9 =head1 NAME
10
11 Search::Estraier - pure perl module to use Hyper Estraier search engine
12
13 =head1 SYNOPSIS
14
15 =head2 Simple indexer
16
17 use Search::Estraier;
18
19 # create and configure node
20 my $node = new Search::Estraier::Node;
21 $node->set_url("http://localhost:1978/node/test");
22 $node->set_auth("admin","admin");
23
24 # create document
25 my $doc = new Search::Estraier::Document;
26
27 # add attributes
28 $doc->add_attr('@uri', "http://estraier.gov/example.txt");
29 $doc->add_attr('@title', "Over the Rainbow");
30
31 # add body text to document
32 $doc->add_text("Somewhere over the rainbow. Way up high.");
33 $doc->add_text("There's a land that I heard of once in a lullaby.");
34
35 die "error: ", $node->status,"\n" unless ($node->put_doc($doc));
36
37 =head2 Simple searcher
38
39 use Search::Estraier;
40
41 # create and configure node
42 my $node = new Search::Estraier::Node;
43 $node->set_url("http://localhost:1978/node/test");
44 $node->set_auth("admin","admin");
45
46 # create condition
47 my $cond = new Search::Estraier::Condition;
48
49 # set search phrase
50 $cond->set_phrase("rainbow AND lullaby");
51
52 my $nres = $node->search($cond, 0);
53 if (defined($nres)) {
54 # for each document in results
55 for my $i ( 0 ... $nres->doc_num - 1 ) {
56 # get result document
57 my $rdoc = $nres->get_doc($i);
58 # display attribte
59 print "URI: ", $rdoc->attr('@uri'),"\n";
60 print "Title: ", $rdoc->attr('@title'),"\n";
61 print $rdoc->snippet,"\n";
62 }
63 } else {
64 die "error: ", $node->status,"\n";
65 }
66
67 =head1 DESCRIPTION
68
69 This module is implementation of node API of Hyper Estraier. Since it's
70 perl-only module with dependencies only on standard perl modules, it will
71 run on all platforms on which perl runs. It doesn't require compilation
72 or Hyper Estraier development files on target machine.
73
74 It is implemented as multiple packages which closly resamble Ruby
75 implementation. It also includes methods to manage nodes.
76
77 There are few examples in C<scripts> directory of this distribution.
78
79 =cut
80
81 =head1 Inheritable common methods
82
83 This methods should really move somewhere else.
84
85 =head2 _s
86
87 Remove multiple whitespaces from string, as well as whitespaces at beginning or end
88
89 my $text = $self->_s(" this is a text ");
90 $text = 'this is a text';
91
92 =cut
93
94 sub _s {
95 my $text = $_[1];
96 return unless defined($text);
97 $text =~ s/\s\s+/ /gs;
98 $text =~ s/^\s+//;
99 $text =~ s/\s+$//;
100 return $text;
101 }
102
103 package Search::Estraier::Document;
104
105 use Carp qw/croak confess/;
106
107 use Search::Estraier;
108 our @ISA = qw/Search::Estraier/;
109
110 =head1 Search::Estraier::Document
111
112 This class implements Document which is collection of attributes
113 (key=value), vectors (also key value) display text and hidden text.
114
115
116 =head2 new
117
118 Create new document, empty or from draft.
119
120 my $doc = new Search::HyperEstraier::Document;
121 my $doc2 = new Search::HyperEstraier::Document( $draft );
122
123 =cut
124
125 sub new {
126 my $class = shift;
127 my $self = {};
128 bless($self, $class);
129
130 $self->{id} = -1;
131
132 my $draft = shift;
133
134 if ($draft) {
135 my $in_text = 0;
136 foreach my $line (split(/\n/, $draft)) {
137
138 if ($in_text) {
139 if ($line =~ /^\t/) {
140 push @{ $self->{htexts} }, substr($line, 1);
141 } else {
142 push @{ $self->{dtexts} }, $line;
143 }
144 next;
145 }
146
147 if ($line =~ m/^%VECTOR\t(.+)$/) {
148 my @fields = split(/\t/, $1);
149 for my $i ( 0 .. ($#fields - 1) ) {
150 $self->{kwords}->{ $fields[ $i ] } = $fields[ $i + 1 ];
151 $i++;
152 }
153 next;
154 } elsif ($line =~ m/^%/) {
155 # What is this? comment?
156 #warn "$line\n";
157 next;
158 } elsif ($line =~ m/^$/) {
159 $in_text = 1;
160 next;
161 } elsif ($line =~ m/^(.+)=(.*)$/) {
162 $self->{attrs}->{ $1 } = $2;
163 next;
164 }
165
166 warn "draft ignored: '$line'\n";
167 }
168 }
169
170 $self ? return $self : return undef;
171 }
172
173
174 =head2 add_attr
175
176 Add an attribute.
177
178 $doc->add_attr( name => 'value' );
179
180 Delete attribute using
181
182 $doc->add_attr( name => undef );
183
184 =cut
185
186 sub add_attr {
187 my $self = shift;
188 my $attrs = {@_};
189
190 while (my ($name, $value) = each %{ $attrs }) {
191 if (! defined($value)) {
192 delete( $self->{attrs}->{ $self->_s($name) } );
193 } else {
194 $self->{attrs}->{ $self->_s($name) } = $self->_s($value);
195 }
196 }
197
198 return 1;
199 }
200
201
202 =head2 add_text
203
204 Add a sentence of text.
205
206 $doc->add_text('this is example text to display');
207
208 =cut
209
210 sub add_text {
211 my $self = shift;
212 my $text = shift;
213 return unless defined($text);
214
215 push @{ $self->{dtexts} }, $self->_s($text);
216 }
217
218
219 =head2 add_hidden_text
220
221 Add a hidden sentence.
222
223 $doc->add_hidden_text('this is example text just for search');
224
225 =cut
226
227 sub add_hidden_text {
228 my $self = shift;
229 my $text = shift;
230 return unless defined($text);
231
232 push @{ $self->{htexts} }, $self->_s($text);
233 }
234
235
236 =head2 id
237
238 Get the ID number of document. If the object has never been registred, C<-1> is returned.
239
240 print $doc->id;
241
242 =cut
243
244 sub id {
245 my $self = shift;
246 return $self->{id};
247 }
248
249
250 =head2 attr_names
251
252 Returns array with attribute names from document object.
253
254 my @attrs = $doc->attr_names;
255
256 =cut
257
258 sub attr_names {
259 my $self = shift;
260 return unless ($self->{attrs});
261 #croak "attr_names return array, not scalar" if (! wantarray);
262 return sort keys %{ $self->{attrs} };
263 }
264
265
266 =head2 attr
267
268 Returns value of an attribute.
269
270 my $value = $doc->attr( 'attribute' );
271
272 =cut
273
274 sub attr {
275 my $self = shift;
276 my $name = shift;
277 return unless (defined($name) && $self->{attrs});
278 return $self->{attrs}->{ $name };
279 }
280
281
282 =head2 texts
283
284 Returns array with text sentences.
285
286 my @texts = $doc->texts;
287
288 =cut
289
290 sub texts {
291 my $self = shift;
292 #confess "texts return array, not scalar" if (! wantarray);
293 return @{ $self->{dtexts} } if ($self->{dtexts});
294 }
295
296
297 =head2 cat_texts
298
299 Return whole text as single scalar.
300
301 my $text = $doc->cat_texts;
302
303 =cut
304
305 sub cat_texts {
306 my $self = shift;
307 return join(' ',@{ $self->{dtexts} }) if ($self->{dtexts});
308 }
309
310
311 =head2 dump_draft
312
313 Dump draft data from document object.
314
315 print $doc->dump_draft;
316
317 =cut
318
319 sub dump_draft {
320 my $self = shift;
321 my $draft;
322
323 foreach my $attr_name (sort keys %{ $self->{attrs} }) {
324 next unless defined(my $v = $self->{attrs}->{$attr_name});
325 $draft .= $attr_name . '=' . $v . "\n";
326 }
327
328 if ($self->{kwords}) {
329 $draft .= '%%VECTOR';
330 while (my ($key, $value) = each %{ $self->{kwords} }) {
331 $draft .= "\t$key\t$value";
332 }
333 $draft .= "\n";
334 }
335
336 $draft .= "\n";
337
338 $draft .= join("\n", @{ $self->{dtexts} }) . "\n" if ($self->{dtexts});
339 $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n" if ($self->{htexts});
340
341 return $draft;
342 }
343
344
345 =head2 delete
346
347 Empty document object
348
349 $doc->delete;
350
351 This function is addition to original Ruby API, and since it was included in C wrappers it's here as a
352 convinience. Document objects which go out of scope will be destroyed
353 automatically.
354
355 =cut
356
357 sub delete {
358 my $self = shift;
359
360 foreach my $data (qw/attrs dtexts stexts kwords/) {
361 delete($self->{$data});
362 }
363
364 $self->{id} = -1;
365
366 return 1;
367 }
368
369
370
371 package Search::Estraier::Condition;
372
373 use Carp qw/confess croak/;
374
375 use Search::Estraier;
376 our @ISA = qw/Search::Estraier/;
377
378 =head1 Search::Estraier::Condition
379
380 =head2 new
381
382 my $cond = new Search::HyperEstraier::Condition;
383
384 =cut
385
386 sub new {
387 my $class = shift;
388 my $self = {};
389 bless($self, $class);
390
391 $self->{max} = -1;
392 $self->{options} = 0;
393
394 $self ? return $self : return undef;
395 }
396
397
398 =head2 set_phrase
399
400 $cond->set_phrase('search phrase');
401
402 =cut
403
404 sub set_phrase {
405 my $self = shift;
406 $self->{phrase} = $self->_s( shift );
407 }
408
409
410 =head2 add_attr
411
412 $cond->add_attr('@URI STRINC /~dpavlin/');
413
414 =cut
415
416 sub add_attr {
417 my $self = shift;
418 my $attr = shift || return;
419 push @{ $self->{attrs} }, $self->_s( $attr );
420 }
421
422
423 =head2 set_order
424
425 $cond->set_order('@mdate NUMD');
426
427 =cut
428
429 sub set_order {
430 my $self = shift;
431 $self->{order} = shift;
432 }
433
434
435 =head2 set_max
436
437 $cond->set_max(42);
438
439 =cut
440
441 sub set_max {
442 my $self = shift;
443 my $max = shift;
444 croak "set_max needs number, not '$max'" unless ($max =~ m/^\d+$/);
445 $self->{max} = $max;
446 }
447
448
449 =head2 set_options
450
451 $cond->set_options( SURE => 1 );
452
453 =cut
454
455 my $options = {
456 # check N-gram keys skipping by three
457 SURE => 1 << 0,
458 # check N-gram keys skipping by two
459 USUAL => 1 << 1,
460 # without TF-IDF tuning
461 FAST => 1 << 2,
462 # with the simplified phrase
463 AGITO => 1 << 3,
464 # check every N-gram key
465 NOIDF => 1 << 4,
466 # check N-gram keys skipping by one
467 SIMPLE => 1 << 10,
468 };
469
470 sub set_options {
471 my $self = shift;
472 my $option = shift;
473 confess "unknown option" unless ($options->{$option});
474 $self->{options} ||= $options->{$option};
475 }
476
477
478 =head2 phrase
479
480 Return search phrase.
481
482 print $cond->phrase;
483
484 =cut
485
486 sub phrase {
487 my $self = shift;
488 return $self->{phrase};
489 }
490
491
492 =head2 order
493
494 Return search result order.
495
496 print $cond->order;
497
498 =cut
499
500 sub order {
501 my $self = shift;
502 return $self->{order};
503 }
504
505
506 =head2 attrs
507
508 Return search result attrs.
509
510 my @cond_attrs = $cond->attrs;
511
512 =cut
513
514 sub attrs {
515 my $self = shift;
516 #croak "attrs return array, not scalar" if (! wantarray);
517 return @{ $self->{attrs} } if ($self->{attrs});
518 }
519
520
521 =head2 max
522
523 Return maximum number of results.
524
525 print $cond->max;
526
527 C<-1> is returned for unitialized value, C<0> is unlimited.
528
529 =cut
530
531 sub max {
532 my $self = shift;
533 return $self->{max};
534 }
535
536
537 =head2 options
538
539 Return options for this condition.
540
541 print $cond->options;
542
543 Options are returned in numerical form.
544
545 =cut
546
547 sub options {
548 my $self = shift;
549 return $self->{options};
550 }
551
552
553 package Search::Estraier::ResultDocument;
554
555 use Carp qw/croak/;
556
557 #use Search::Estraier;
558 #our @ISA = qw/Search::Estraier/;
559
560 =head1 Search::Estraier::ResultDocument
561
562 =head2 new
563
564 my $rdoc = new Search::HyperEstraier::ResultDocument(
565 uri => 'http://localhost/document/uri/42',
566 attrs => {
567 foo => 1,
568 bar => 2,
569 },
570 snippet => 'this is a text of snippet'
571 keywords => 'this\tare\tkeywords'
572 );
573
574 =cut
575
576 sub new {
577 my $class = shift;
578 my $self = {@_};
579 bless($self, $class);
580
581 croak "missing uri for ResultDocument" unless defined($self->{uri});
582
583 $self ? return $self : return undef;
584 }
585
586
587 =head2 uri
588
589 Return URI of result document
590
591 print $rdoc->uri;
592
593 =cut
594
595 sub uri {
596 my $self = shift;
597 return $self->{uri};
598 }
599
600
601 =head2 attr_names
602
603 Returns array with attribute names from result document object.
604
605 my @attrs = $rdoc->attr_names;
606
607 =cut
608
609 sub attr_names {
610 my $self = shift;
611 croak "attr_names return array, not scalar" if (! wantarray);
612 return sort keys %{ $self->{attrs} };
613 }
614
615
616 =head2 attr
617
618 Returns value of an attribute.
619
620 my $value = $rdoc->attr( 'attribute' );
621
622 =cut
623
624 sub attr {
625 my $self = shift;
626 my $name = shift || return;
627 return $self->{attrs}->{ $name };
628 }
629
630
631 =head2 snippet
632
633 Return snippet from result document
634
635 print $rdoc->snippet;
636
637 =cut
638
639 sub snippet {
640 my $self = shift;
641 return $self->{snippet};
642 }
643
644
645 =head2 keywords
646
647 Return keywords from result document
648
649 print $rdoc->keywords;
650
651 =cut
652
653 sub keywords {
654 my $self = shift;
655 return $self->{keywords};
656 }
657
658
659 package Search::Estraier::NodeResult;
660
661 use Carp qw/croak/;
662
663 #use Search::Estraier;
664 #our @ISA = qw/Search::Estraier/;
665
666 =head1 Search::Estraier::NodeResult
667
668 =head2 new
669
670 my $res = new Search::HyperEstraier::NodeResult(
671 docs => @array_of_rdocs,
672 hits => %hash_with_hints,
673 );
674
675 =cut
676
677 sub new {
678 my $class = shift;
679 my $self = {@_};
680 bless($self, $class);
681
682 foreach my $f (qw/docs hints/) {
683 croak "missing $f for ResultDocument" unless defined($self->{$f});
684 }
685
686 $self ? return $self : return undef;
687 }
688
689
690 =head2 doc_num
691
692 Return number of documents
693
694 print $res->doc_num;
695
696 =cut
697
698 sub doc_num {
699 my $self = shift;
700 return $#{$self->{docs}} + 1;
701 }
702
703
704 =head2 get_doc
705
706 Return single document
707
708 my $doc = $res->get_doc( 42 );
709
710 Returns undef if document doesn't exist.
711
712 =cut
713
714 sub get_doc {
715 my $self = shift;
716 my $num = shift;
717 croak "expect number as argument, not '$num'" unless ($num =~ m/^\d+$/);
718 return undef if ($num < 0 || $num > $self->{docs});
719 return $self->{docs}->[$num];
720 }
721
722
723 =head2 hint
724
725 Return specific hint from results.
726
727 print $rec->hint( 'VERSION' );
728
729 Possible hints are: C<VERSION>, C<NODE>, C<HIT>, C<HINT#n>, C<DOCNUM>, C<WORDNUM>,
730 C<TIME>, C<LINK#n>, C<VIEW>.
731
732 =cut
733
734 sub hint {
735 my $self = shift;
736 my $key = shift || return;
737 return $self->{hints}->{$key};
738 }
739
740 =head2 hints
741
742 More perlish version of C<hint>. This one returns hash.
743
744 my %hints = $rec->hints;
745
746 =cut
747
748 sub hints {
749 my $self = shift;
750 return $self->{hints};
751 }
752
753 package Search::Estraier::Node;
754
755 use Carp qw/carp croak confess/;
756 use URI;
757 use MIME::Base64;
758 use IO::Socket::INET;
759 use URI::Escape qw/uri_escape/;
760
761 =head1 Search::Estraier::Node
762
763 =head2 new
764
765 my $node = new Search::HyperEstraier::Node;
766
767 or optionally with C<url> as parametar
768
769 my $node = new Search::HyperEstraier::Node( 'http://localhost:1978/node/test' );
770
771 or in more verbose form
772
773 my $node = new Search::HyperEstraier::Node(
774 url => 'http://localhost:1978/node/test',
775 debug => 1,
776 croak_on_error => 1
777 );
778
779 with following arguments:
780
781 =over 4
782
783 =item url
784
785 URL to node
786
787 =item debug
788
789 dumps a B<lot> of debugging output
790
791 =item croak_on_error
792
793 very helpful during development. It will croak on all errors instead of
794 silently returning C<-1> (which is convention of Hyper Estraier API in other
795 languages).
796
797 =back
798
799 =cut
800
801 sub new {
802 my $class = shift;
803 my $self = {
804 pxport => -1,
805 timeout => 0, # this used to be -1
806 dnum => -1,
807 wnum => -1,
808 size => -1.0,
809 wwidth => 480,
810 hwidth => 96,
811 awidth => 96,
812 status => -1,
813 };
814 bless($self, $class);
815
816 if ($#_ == 0) {
817 $self->{url} = shift;
818 } else {
819 my $args = {@_};
820
821 %$self = ( %$self, @_ );
822
823 warn "## Node debug on\n" if ($self->{debug});
824 }
825
826 $self ? return $self : return undef;
827 }
828
829
830 =head2 set_url
831
832 Specify URL to node server
833
834 $node->set_url('http://localhost:1978');
835
836 =cut
837
838 sub set_url {
839 my $self = shift;
840 $self->{url} = shift;
841 }
842
843
844 =head2 set_proxy
845
846 Specify proxy server to connect to node server
847
848 $node->set_proxy('proxy.example.com', 8080);
849
850 =cut
851
852 sub set_proxy {
853 my $self = shift;
854 my ($host,$port) = @_;
855 croak "proxy port must be number, not '$port'" unless ($port =~ m/^\d+$/);
856 $self->{pxhost} = $host;
857 $self->{pxport} = $port;
858 }
859
860
861 =head2 set_timeout
862
863 Specify timeout of connection in seconds
864
865 $node->set_timeout( 15 );
866
867 =cut
868
869 sub set_timeout {
870 my $self = shift;
871 my $sec = shift;
872 croak "timeout must be number, not '$sec'" unless ($sec =~ m/^\d+$/);
873 $self->{timeout} = $sec;
874 }
875
876
877 =head2 set_auth
878
879 Specify name and password for authentication to node server.
880
881 $node->set_auth('clint','eastwood');
882
883 =cut
884
885 sub set_auth {
886 my $self = shift;
887 my ($login,$passwd) = @_;
888 my $basic_auth = encode_base64( "$login:$passwd" );
889 chomp($basic_auth);
890 $self->{auth} = $basic_auth;
891 }
892
893
894 =head2 status
895
896 Return status code of last request.
897
898 print $node->status;
899
900 C<-1> means connection failure.
901
902 =cut
903
904 sub status {
905 my $self = shift;
906 return $self->{status};
907 }
908
909
910 =head2 put_doc
911
912 Add a document
913
914 $node->put_doc( $document_draft ) or die "can't add document";
915
916 Return true on success or false on failture.
917
918 =cut
919
920 sub put_doc {
921 my $self = shift;
922 my $doc = shift || return;
923 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
924 $self->shuttle_url( $self->{url} . '/put_doc',
925 'text/x-estraier-draft',
926 $doc->dump_draft,
927 undef
928 ) == 200;
929 }
930
931
932 =head2 out_doc
933
934 Remove a document
935
936 $node->out_doc( document_id ) or "can't remove document";
937
938 Return true on success or false on failture.
939
940 =cut
941
942 sub out_doc {
943 my $self = shift;
944 my $id = shift || return;
945 return unless ($self->{url});
946 croak "id must be number, not '$id'" unless ($id =~ m/^\d+$/);
947 $self->shuttle_url( $self->{url} . '/out_doc',
948 'application/x-www-form-urlencoded',
949 "id=$id",
950 undef
951 ) == 200;
952 }
953
954
955 =head2 out_doc_by_uri
956
957 Remove a registrated document using it's uri
958
959 $node->out_doc_by_uri( 'file:///document/uri/42' ) or "can't remove document";
960
961 Return true on success or false on failture.
962
963 =cut
964
965 sub out_doc_by_uri {
966 my $self = shift;
967 my $uri = shift || return;
968 return unless ($self->{url});
969 $self->shuttle_url( $self->{url} . '/out_doc',
970 'application/x-www-form-urlencoded',
971 "uri=" . uri_escape($uri),
972 undef
973 ) == 200;
974 }
975
976
977 =head2 edit_doc
978
979 Edit attributes of a document
980
981 $node->edit_doc( $document_draft ) or die "can't edit document";
982
983 Return true on success or false on failture.
984
985 =cut
986
987 sub edit_doc {
988 my $self = shift;
989 my $doc = shift || return;
990 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
991 $self->shuttle_url( $self->{url} . '/edit_doc',
992 'text/x-estraier-draft',
993 $doc->dump_draft,
994 undef
995 ) == 200;
996 }
997
998
999 =head2 get_doc
1000
1001 Retreive document
1002
1003 my $doc = $node->get_doc( document_id ) or die "can't get document";
1004
1005 Return true on success or false on failture.
1006
1007 =cut
1008
1009 sub get_doc {
1010 my $self = shift;
1011 my $id = shift || return;
1012 return $self->_fetch_doc( id => $id );
1013 }
1014
1015
1016 =head2 get_doc_by_uri
1017
1018 Retreive document
1019
1020 my $doc = $node->get_doc_by_uri( 'file:///document/uri/42' ) or die "can't get document";
1021
1022 Return true on success or false on failture.
1023
1024 =cut
1025
1026 sub get_doc_by_uri {
1027 my $self = shift;
1028 my $uri = shift || return;
1029 return $self->_fetch_doc( uri => $uri );
1030 }
1031
1032
1033 =head2 get_doc_attr
1034
1035 Retrieve the value of an atribute from object
1036
1037 my $val = $node->get_doc_attr( document_id, 'attribute_name' ) or
1038 die "can't get document attribute";
1039
1040 =cut
1041
1042 sub get_doc_attr {
1043 my $self = shift;
1044 my ($id,$name) = @_;
1045 return unless ($id && $name);
1046 return $self->_fetch_doc( id => $id, attr => $name );
1047 }
1048
1049
1050 =head2 get_doc_attr_by_uri
1051
1052 Retrieve the value of an atribute from object
1053
1054 my $val = $node->get_doc_attr_by_uri( document_id, 'attribute_name' ) or
1055 die "can't get document attribute";
1056
1057 =cut
1058
1059 sub get_doc_attr_by_uri {
1060 my $self = shift;
1061 my ($uri,$name) = @_;
1062 return unless ($uri && $name);
1063 return $self->_fetch_doc( uri => $uri, attr => $name );
1064 }
1065
1066
1067 =head2 etch_doc
1068
1069 Exctract document keywords
1070
1071 my $keywords = $node->etch_doc( document_id ) or die "can't etch document";
1072
1073 =cut
1074
1075 sub etch_doc {
1076 my $self = shift;
1077 my $id = shift || return;
1078 return $self->_fetch_doc( id => $id, etch => 1 );
1079 }
1080
1081 =head2 etch_doc_by_uri
1082
1083 Retreive document
1084
1085 my $keywords = $node->etch_doc_by_uri( 'file:///document/uri/42' ) or die "can't etch document";
1086
1087 Return true on success or false on failture.
1088
1089 =cut
1090
1091 sub etch_doc_by_uri {
1092 my $self = shift;
1093 my $uri = shift || return;
1094 return $self->_fetch_doc( uri => $uri, etch => 1 );
1095 }
1096
1097
1098 =head2 uri_to_id
1099
1100 Get ID of document specified by URI
1101
1102 my $id = $node->uri_to_id( 'file:///document/uri/42' );
1103
1104 =cut
1105
1106 sub uri_to_id {
1107 my $self = shift;
1108 my $uri = shift || return;
1109 return $self->_fetch_doc( uri => $uri, path => '/uri_to_id', chomp_resbody => 1 );
1110 }
1111
1112
1113 =head2 _fetch_doc
1114
1115 Private function used for implementing of C<get_doc>, C<get_doc_by_uri>,
1116 C<etch_doc>, C<etch_doc_by_uri>.
1117
1118 # this will decode received draft into Search::Estraier::Document object
1119 my $doc = $node->_fetch_doc( id => 42 );
1120 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42' );
1121
1122 # to extract keywords, add etch
1123 my $doc = $node->_fetch_doc( id => 42, etch => 1 );
1124 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', etch => 1 );
1125
1126 # to get document attrubute add attr
1127 my $doc = $node->_fetch_doc( id => 42, attr => '@mdate' );
1128 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', attr => '@mdate' );
1129
1130 # more general form which allows implementation of
1131 # uri_to_id
1132 my $id = $node->_fetch_doc(
1133 uri => 'file:///document/uri/42',
1134 path => '/uri_to_id',
1135 chomp_resbody => 1
1136 );
1137
1138 =cut
1139
1140 sub _fetch_doc {
1141 my $self = shift;
1142 my $a = {@_};
1143 return unless ( ($a->{id} || $a->{uri}) && $self->{url} );
1144
1145 my ($arg, $resbody);
1146
1147 my $path = $a->{path} || '/get_doc';
1148 $path = '/etch_doc' if ($a->{etch});
1149
1150 if ($a->{id}) {
1151 croak "id must be numberm not '$a->{id}'" unless ($a->{id} =~ m/^\d+$/);
1152 $arg = 'id=' . $a->{id};
1153 } elsif ($a->{uri}) {
1154 $arg = 'uri=' . uri_escape($a->{uri});
1155 } else {
1156 confess "unhandled argument. Need id or uri.";
1157 }
1158
1159 if ($a->{attr}) {
1160 $path = '/get_doc_attr';
1161 $arg .= '&attr=' . uri_escape($a->{attr});
1162 $a->{chomp_resbody} = 1;
1163 }
1164
1165 my $rv = $self->shuttle_url( $self->{url} . $path,
1166 'application/x-www-form-urlencoded',
1167 $arg,
1168 \$resbody,
1169 );
1170
1171 return if ($rv != 200);
1172
1173 if ($a->{etch}) {
1174 $self->{kwords} = {};
1175 return +{} unless ($resbody);
1176 foreach my $l (split(/\n/, $resbody)) {
1177 my ($k,$v) = split(/\t/, $l, 2);
1178 $self->{kwords}->{$k} = $v if ($v);
1179 }
1180 return $self->{kwords};
1181 } elsif ($a->{chomp_resbody}) {
1182 return unless (defined($resbody));
1183 chomp($resbody);
1184 return $resbody;
1185 } else {
1186 return new Search::Estraier::Document($resbody);
1187 }
1188 }
1189
1190
1191 =head2 name
1192
1193 my $node_name = $node->name;
1194
1195 =cut
1196
1197 sub name {
1198 my $self = shift;
1199 $self->_set_info unless ($self->{name});
1200 return $self->{name};
1201 }
1202
1203
1204 =head2 label
1205
1206 my $node_label = $node->label;
1207
1208 =cut
1209
1210 sub label {
1211 my $self = shift;
1212 $self->_set_info unless ($self->{label});
1213 return $self->{label};
1214 }
1215
1216
1217 =head2 doc_num
1218
1219 my $documents_in_node = $node->doc_num;
1220
1221 =cut
1222
1223 sub doc_num {
1224 my $self = shift;
1225 $self->_set_info if ($self->{dnum} < 0);
1226 return $self->{dnum};
1227 }
1228
1229
1230 =head2 word_num
1231
1232 my $words_in_node = $node->word_num;
1233
1234 =cut
1235
1236 sub word_num {
1237 my $self = shift;
1238 $self->_set_info if ($self->{wnum} < 0);
1239 return $self->{wnum};
1240 }
1241
1242
1243 =head2 size
1244
1245 my $node_size = $node->size;
1246
1247 =cut
1248
1249 sub size {
1250 my $self = shift;
1251 $self->_set_info if ($self->{size} < 0);
1252 return $self->{size};
1253 }
1254
1255
1256 =head2 search
1257
1258 Search documents which match condition
1259
1260 my $nres = $node->search( $cond, $depth );
1261
1262 C<$cond> is C<Search::Estraier::Condition> object, while <$depth> specifies
1263 depth for meta search.
1264
1265 Function results C<Search::Estraier::NodeResult> object.
1266
1267 =cut
1268
1269 sub search {
1270 my $self = shift;
1271 my ($cond, $depth) = @_;
1272 return unless ($cond && defined($depth) && $self->{url});
1273 croak "cond mush be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1274 croak "depth needs number, not '$depth'" unless ($depth =~ m/^\d+$/);
1275
1276 my $resbody;
1277
1278 my $rv = $self->shuttle_url( $self->{url} . '/search',
1279 'application/x-www-form-urlencoded',
1280 $self->cond_to_query( $cond, $depth ),
1281 \$resbody,
1282 );
1283 return if ($rv != 200);
1284
1285 my (@docs, $hints);
1286
1287 my @lines = split(/\n/, $resbody);
1288 return unless (@lines);
1289
1290 my $border = $lines[0];
1291 my $isend = 0;
1292 my $lnum = 1;
1293
1294 while ( $lnum <= $#lines ) {
1295 my $line = $lines[$lnum];
1296 $lnum++;
1297
1298 #warn "## $line\n";
1299 if ($line && $line =~ m/^\Q$border\E(:END)*$/) {
1300 $isend = $1;
1301 last;
1302 }
1303
1304 if ($line =~ /\t/) {
1305 my ($k,$v) = split(/\t/, $line, 2);
1306 $hints->{$k} = $v;
1307 }
1308 }
1309
1310 my $snum = $lnum;
1311
1312 while( ! $isend && $lnum <= $#lines ) {
1313 my $line = $lines[$lnum];
1314 #warn "# $lnum: $line\n";
1315 $lnum++;
1316
1317 if ($line && $line =~ m/^\Q$border\E/) {
1318 if ($lnum > $snum) {
1319 my $rdattrs;
1320 my $rdvector;
1321 my $rdsnippet;
1322
1323 my $rlnum = $snum;
1324 while ($rlnum < $lnum - 1 ) {
1325 #my $rdline = $self->_s($lines[$rlnum]);
1326 my $rdline = $lines[$rlnum];
1327 $rlnum++;
1328 last unless ($rdline);
1329 if ($rdline =~ /^%/) {
1330 $rdvector = $1 if ($rdline =~ /^%VECTOR\t(.+)$/);
1331 } elsif($rdline =~ /=/) {
1332 $rdattrs->{$1} = $2 if ($rdline =~ /^(.+)=(.+)$/);
1333 } else {
1334 confess "invalid format of response";
1335 }
1336 }
1337 while($rlnum < $lnum - 1) {
1338 my $rdline = $lines[$rlnum];
1339 $rlnum++;
1340 $rdsnippet .= "$rdline\n";
1341 }
1342 #warn Dumper($rdvector, $rdattrs, $rdsnippet);
1343 if (my $rduri = $rdattrs->{'@uri'}) {
1344 push @docs, new Search::Estraier::ResultDocument(
1345 uri => $rduri,
1346 attrs => $rdattrs,
1347 snippet => $rdsnippet,
1348 keywords => $rdvector,
1349 );
1350 }
1351 }
1352 $snum = $lnum;
1353 #warn "### $line\n";
1354 $isend = 1 if ($line =~ /:END$/);
1355 }
1356
1357 }
1358
1359 if (! $isend) {
1360 warn "received result doesn't have :END\n$resbody";
1361 return;
1362 }
1363
1364 #warn Dumper(\@docs, $hints);
1365
1366 return new Search::Estraier::NodeResult( docs => \@docs, hints => $hints );
1367 }
1368
1369
1370 =head2 cond_to_query
1371
1372 Return URI encoded string generated from Search::Estraier::Condition
1373
1374 my $args = $node->cond_to_query( $cond, $depth );
1375
1376 =cut
1377
1378 sub cond_to_query {
1379 my $self = shift;
1380
1381 my $cond = shift || return;
1382 croak "condition must be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1383 my $depth = shift;
1384
1385 my @args;
1386
1387 if (my $phrase = $cond->phrase) {
1388 push @args, 'phrase=' . uri_escape($phrase);
1389 }
1390
1391 if (my @attrs = $cond->attrs) {
1392 for my $i ( 0 .. $#attrs ) {
1393 push @args,'attr' . ($i+1) . '=' . uri_escape( $attrs[$i] ) if ($attrs[$i]);
1394 }
1395 }
1396
1397 if (my $order = $cond->order) {
1398 push @args, 'order=' . uri_escape($order);
1399 }
1400
1401 if (my $max = $cond->max) {
1402 push @args, 'max=' . $max;
1403 } else {
1404 push @args, 'max=' . (1 << 30);
1405 }
1406
1407 if (my $options = $cond->options) {
1408 push @args, 'options=' . $options;
1409 }
1410
1411 push @args, 'depth=' . $depth if ($depth);
1412 push @args, 'wwidth=' . $self->{wwidth};
1413 push @args, 'hwidth=' . $self->{hwidth};
1414 push @args, 'awidth=' . $self->{awidth};
1415
1416 return join('&', @args);
1417 }
1418
1419
1420 =head2 shuttle_url
1421
1422 This is method which uses C<LWP::UserAgent> to communicate with Hyper Estraier node
1423 master.
1424
1425 my $rv = shuttle_url( $url, $content_type, $req_body, \$resbody );
1426
1427 C<$resheads> and C<$resbody> booleans controll if response headers and/or response
1428 body will be saved within object.
1429
1430 =cut
1431
1432 use LWP::UserAgent;
1433
1434 sub shuttle_url {
1435 my $self = shift;
1436
1437 my ($url, $content_type, $reqbody, $resbody) = @_;
1438
1439 $self->{status} = -1;
1440
1441 warn "## $url\n" if ($self->{debug});
1442
1443 $url = new URI($url);
1444 if (
1445 !$url || !$url->scheme || !$url->scheme eq 'http' ||
1446 !$url->host || !$url->port || $url->port < 1
1447 ) {
1448 carp "can't parse $url\n";
1449 return -1;
1450 }
1451
1452 my $ua = LWP::UserAgent->new;
1453 $ua->agent( "Search-Estraier/$Search::Estraier::VERSION" );
1454
1455 my $req;
1456 if ($reqbody) {
1457 $req = HTTP::Request->new(POST => $url);
1458 } else {
1459 $req = HTTP::Request->new(GET => $url);
1460 }
1461
1462 $req->headers->header( 'Host' => $url->host . ":" . $url->port );
1463 $req->headers->header( 'Connection', 'close' );
1464 $req->headers->header( 'Authorization', 'Basic ' . $self->{auth} ) if ($self->{auth});
1465 $req->content_type( $content_type );
1466
1467 warn $req->headers->as_string,"\n" if ($self->{debug});
1468
1469 if ($reqbody) {
1470 warn "$reqbody\n" if ($self->{debug});
1471 $req->content( $reqbody );
1472 }
1473
1474 my $res = $ua->request($req) || croak "can't make request to $url: $!";
1475
1476 warn "## response status: ",$res->status_line,"\n" if ($self->{debug});
1477
1478 ($self->{status}, $self->{status_message}) = split(/\s+/, $res->status_line, 2);
1479
1480 if (! $res->is_success) {
1481 if ($self->{croak_on_error}) {
1482 croak("can't get $url: ",$res->status_line);
1483 } else {
1484 return -1;
1485 }
1486 }
1487
1488 $$resbody .= $res->content;
1489
1490 warn "## response body:\n$$resbody\n" if ($resbody && $self->{debug});
1491
1492 return $self->{status};
1493 }
1494
1495
1496 =head2 set_snippet_width
1497
1498 Set width of snippets in results
1499
1500 $node->set_snippet_width( $wwidth, $hwidth, $awidth );
1501
1502 C<$wwidth> specifies whole width of snippet. It's C<480> by default. If it's C<0> snippet
1503 is not sent with results. If it is negative, whole document text is sent instead of snippet.
1504
1505 C<$hwidth> specified width of strings from beginning of string. Default
1506 value is C<96>. Negative or zero value keep previous value.
1507
1508 C<$awidth> specifies width of strings around each highlighted word. It's C<96> by default.
1509 If negative of zero value is provided previous value is kept unchanged.
1510
1511 =cut
1512
1513 sub set_snippet_width {
1514 my $self = shift;
1515
1516 my ($wwidth, $hwidth, $awidth) = @_;
1517 $self->{wwidth} = $wwidth;
1518 $self->{hwidth} = $hwidth if ($hwidth >= 0);
1519 $self->{awidth} = $awidth if ($awidth >= 0);
1520 }
1521
1522
1523 =head2 set_user
1524
1525 Manage users of node
1526
1527 $node->set_user( 'name', $mode );
1528
1529 C<$mode> can be one of:
1530
1531 =over 4
1532
1533 =item 0
1534
1535 delete account
1536
1537 =item 1
1538
1539 set administrative right for user
1540
1541 =item 2
1542
1543 set user account as guest
1544
1545 =back
1546
1547 Return true on success, otherwise false.
1548
1549 =cut
1550
1551 sub set_user {
1552 my $self = shift;
1553 my ($name, $mode) = @_;
1554
1555 return unless ($self->{url});
1556 croak "mode must be number, not '$mode'" unless ($mode =~ m/^\d+$/);
1557
1558 $self->shuttle_url( $self->{url} . '/_set_user',
1559 'text/plain',
1560 'name=' . uri_escape($name) . '&mode=' . $mode,
1561 undef
1562 ) == 200;
1563 }
1564
1565
1566 =head2 set_link
1567
1568 Manage node links
1569
1570 $node->set_link('http://localhost:1978/node/another', 'another node label', $credit);
1571
1572 If C<$credit> is negative, link is removed.
1573
1574 =cut
1575
1576 sub set_link {
1577 my $self = shift;
1578 my ($url, $label, $credit) = @_;
1579
1580 return unless ($self->{url});
1581 croak "mode credit be number, not '$credit'" unless ($credit =~ m/^\d+$/);
1582
1583 my $reqbody = 'url=' . uri_escape($url) . '&label=' . uri_escape($label);
1584 $reqbody .= '&credit=' . $credit if ($credit > 0);
1585
1586 $self->shuttle_url( $self->{url} . '/_set_link',
1587 'application/x-www-form-urlencoded',
1588 $reqbody,
1589 undef
1590 ) == 200;
1591 }
1592
1593
1594 =head1 PRIVATE METHODS
1595
1596 You could call those directly, but you don't have to. I hope.
1597
1598 =head2 _set_info
1599
1600 Set information for node
1601
1602 $node->_set_info;
1603
1604 =cut
1605
1606 sub _set_info {
1607 my $self = shift;
1608
1609 $self->{status} = -1;
1610 return unless ($self->{url});
1611
1612 my $resbody;
1613 my $rv = $self->shuttle_url( $self->{url} . '/inform',
1614 'text/plain',
1615 undef,
1616 \$resbody,
1617 );
1618
1619 return if ($rv != 200 || !$resbody);
1620
1621 # it seems that response can have multiple line endings
1622 $resbody =~ s/[\r\n]+$//;
1623
1624 ( $self->{name}, $self->{label}, $self->{dnum}, $self->{wnum}, $self->{size} ) =
1625 split(/\t/, $resbody, 5);
1626
1627 }
1628
1629 ###
1630
1631 =head1 EXPORT
1632
1633 Nothing.
1634
1635 =head1 SEE ALSO
1636
1637 L<http://hyperestraier.sourceforge.net/>
1638
1639 Hyper Estraier Ruby interface on which this module is based.
1640
1641 =head1 AUTHOR
1642
1643 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
1644
1645
1646 =head1 COPYRIGHT AND LICENSE
1647
1648 Copyright (C) 2005-2006 by Dobrica Pavlinusic
1649
1650 This library is free software; you can redistribute it and/or modify
1651 it under the GPL v2 or later.
1652
1653 =cut
1654
1655 1;

  ViewVC Help
Powered by ViewVC 1.1.26