/[Search-Estraier]/trunk/Estraier.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/Estraier.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 98 - (show annotations)
Sat Jan 28 19:18:13 2006 UTC (18 years, 2 months ago) by dpavlin
File size: 30037 byte(s)
improved $cond->set_options to support one or more arguments and reset
options on each call, e.g.
$cond->set_options('SURE') or
$cond->set_options(qw/SURE NOIDF/)
1 package Search::Estraier;
2
3 use 5.008;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.04_1';
8
9 =head1 NAME
10
11 Search::Estraier - pure perl module to use Hyper Estraier search engine
12
13 =head1 SYNOPSIS
14
15 =head2 Simple indexer
16
17 use Search::Estraier;
18
19 # create and configure node
20 my $node = new Search::Estraier::Node;
21 $node->set_url("http://localhost:1978/node/test");
22 $node->set_auth("admin","admin");
23
24 # create document
25 my $doc = new Search::Estraier::Document;
26
27 # add attributes
28 $doc->add_attr('@uri', "http://estraier.gov/example.txt");
29 $doc->add_attr('@title', "Over the Rainbow");
30
31 # add body text to document
32 $doc->add_text("Somewhere over the rainbow. Way up high.");
33 $doc->add_text("There's a land that I heard of once in a lullaby.");
34
35 die "error: ", $node->status,"\n" unless ($node->put_doc($doc));
36
37 =head2 Simple searcher
38
39 use Search::Estraier;
40
41 # create and configure node
42 my $node = new Search::Estraier::Node;
43 $node->set_url("http://localhost:1978/node/test");
44 $node->set_auth("admin","admin");
45
46 # create condition
47 my $cond = new Search::Estraier::Condition;
48
49 # set search phrase
50 $cond->set_phrase("rainbow AND lullaby");
51
52 my $nres = $node->search($cond, 0);
53 if (defined($nres)) {
54 # for each document in results
55 for my $i ( 0 ... $nres->doc_num - 1 ) {
56 # get result document
57 my $rdoc = $nres->get_doc($i);
58 # display attribte
59 print "URI: ", $rdoc->attr('@uri'),"\n";
60 print "Title: ", $rdoc->attr('@title'),"\n";
61 print $rdoc->snippet,"\n";
62 }
63 } else {
64 die "error: ", $node->status,"\n";
65 }
66
67 =head1 DESCRIPTION
68
69 This module is implementation of node API of Hyper Estraier. Since it's
70 perl-only module with dependencies only on standard perl modules, it will
71 run on all platforms on which perl runs. It doesn't require compilation
72 or Hyper Estraier development files on target machine.
73
74 It is implemented as multiple packages which closly resamble Ruby
75 implementation. It also includes methods to manage nodes.
76
77 There are few examples in C<scripts> directory of this distribution.
78
79 =cut
80
81 =head1 Inheritable common methods
82
83 This methods should really move somewhere else.
84
85 =head2 _s
86
87 Remove multiple whitespaces from string, as well as whitespaces at beginning or end
88
89 my $text = $self->_s(" this is a text ");
90 $text = 'this is a text';
91
92 =cut
93
94 sub _s {
95 my $text = $_[1];
96 return unless defined($text);
97 $text =~ s/\s\s+/ /gs;
98 $text =~ s/^\s+//;
99 $text =~ s/\s+$//;
100 return $text;
101 }
102
103 package Search::Estraier::Document;
104
105 use Carp qw/croak confess/;
106
107 use Search::Estraier;
108 our @ISA = qw/Search::Estraier/;
109
110 =head1 Search::Estraier::Document
111
112 This class implements Document which is collection of attributes
113 (key=value), vectors (also key value) display text and hidden text.
114
115
116 =head2 new
117
118 Create new document, empty or from draft.
119
120 my $doc = new Search::HyperEstraier::Document;
121 my $doc2 = new Search::HyperEstraier::Document( $draft );
122
123 =cut
124
125 sub new {
126 my $class = shift;
127 my $self = {};
128 bless($self, $class);
129
130 $self->{id} = -1;
131
132 my $draft = shift;
133
134 if ($draft) {
135 my $in_text = 0;
136 foreach my $line (split(/\n/, $draft)) {
137
138 if ($in_text) {
139 if ($line =~ /^\t/) {
140 push @{ $self->{htexts} }, substr($line, 1);
141 } else {
142 push @{ $self->{dtexts} }, $line;
143 }
144 next;
145 }
146
147 if ($line =~ m/^%VECTOR\t(.+)$/) {
148 my @fields = split(/\t/, $1);
149 for my $i ( 0 .. ($#fields - 1) ) {
150 $self->{kwords}->{ $fields[ $i ] } = $fields[ $i + 1 ];
151 $i++;
152 }
153 next;
154 } elsif ($line =~ m/^%/) {
155 # What is this? comment?
156 #warn "$line\n";
157 next;
158 } elsif ($line =~ m/^$/) {
159 $in_text = 1;
160 next;
161 } elsif ($line =~ m/^(.+)=(.*)$/) {
162 $self->{attrs}->{ $1 } = $2;
163 next;
164 }
165
166 warn "draft ignored: '$line'\n";
167 }
168 }
169
170 $self ? return $self : return undef;
171 }
172
173
174 =head2 add_attr
175
176 Add an attribute.
177
178 $doc->add_attr( name => 'value' );
179
180 Delete attribute using
181
182 $doc->add_attr( name => undef );
183
184 =cut
185
186 sub add_attr {
187 my $self = shift;
188 my $attrs = {@_};
189
190 while (my ($name, $value) = each %{ $attrs }) {
191 if (! defined($value)) {
192 delete( $self->{attrs}->{ $self->_s($name) } );
193 } else {
194 $self->{attrs}->{ $self->_s($name) } = $self->_s($value);
195 }
196 }
197
198 return 1;
199 }
200
201
202 =head2 add_text
203
204 Add a sentence of text.
205
206 $doc->add_text('this is example text to display');
207
208 =cut
209
210 sub add_text {
211 my $self = shift;
212 my $text = shift;
213 return unless defined($text);
214
215 push @{ $self->{dtexts} }, $self->_s($text);
216 }
217
218
219 =head2 add_hidden_text
220
221 Add a hidden sentence.
222
223 $doc->add_hidden_text('this is example text just for search');
224
225 =cut
226
227 sub add_hidden_text {
228 my $self = shift;
229 my $text = shift;
230 return unless defined($text);
231
232 push @{ $self->{htexts} }, $self->_s($text);
233 }
234
235
236 =head2 id
237
238 Get the ID number of document. If the object has never been registred, C<-1> is returned.
239
240 print $doc->id;
241
242 =cut
243
244 sub id {
245 my $self = shift;
246 return $self->{id};
247 }
248
249
250 =head2 attr_names
251
252 Returns array with attribute names from document object.
253
254 my @attrs = $doc->attr_names;
255
256 =cut
257
258 sub attr_names {
259 my $self = shift;
260 return unless ($self->{attrs});
261 #croak "attr_names return array, not scalar" if (! wantarray);
262 return sort keys %{ $self->{attrs} };
263 }
264
265
266 =head2 attr
267
268 Returns value of an attribute.
269
270 my $value = $doc->attr( 'attribute' );
271
272 =cut
273
274 sub attr {
275 my $self = shift;
276 my $name = shift;
277 return unless (defined($name) && $self->{attrs});
278 return $self->{attrs}->{ $name };
279 }
280
281
282 =head2 texts
283
284 Returns array with text sentences.
285
286 my @texts = $doc->texts;
287
288 =cut
289
290 sub texts {
291 my $self = shift;
292 #confess "texts return array, not scalar" if (! wantarray);
293 return @{ $self->{dtexts} } if ($self->{dtexts});
294 }
295
296
297 =head2 cat_texts
298
299 Return whole text as single scalar.
300
301 my $text = $doc->cat_texts;
302
303 =cut
304
305 sub cat_texts {
306 my $self = shift;
307 return join(' ',@{ $self->{dtexts} }) if ($self->{dtexts});
308 }
309
310
311 =head2 dump_draft
312
313 Dump draft data from document object.
314
315 print $doc->dump_draft;
316
317 =cut
318
319 sub dump_draft {
320 my $self = shift;
321 my $draft;
322
323 foreach my $attr_name (sort keys %{ $self->{attrs} }) {
324 next unless defined(my $v = $self->{attrs}->{$attr_name});
325 $draft .= $attr_name . '=' . $v . "\n";
326 }
327
328 if ($self->{kwords}) {
329 $draft .= '%%VECTOR';
330 while (my ($key, $value) = each %{ $self->{kwords} }) {
331 $draft .= "\t$key\t$value";
332 }
333 $draft .= "\n";
334 }
335
336 $draft .= "\n";
337
338 $draft .= join("\n", @{ $self->{dtexts} }) . "\n" if ($self->{dtexts});
339 $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n" if ($self->{htexts});
340
341 return $draft;
342 }
343
344
345 =head2 delete
346
347 Empty document object
348
349 $doc->delete;
350
351 This function is addition to original Ruby API, and since it was included in C wrappers it's here as a
352 convinience. Document objects which go out of scope will be destroyed
353 automatically.
354
355 =cut
356
357 sub delete {
358 my $self = shift;
359
360 foreach my $data (qw/attrs dtexts stexts kwords/) {
361 delete($self->{$data});
362 }
363
364 $self->{id} = -1;
365
366 return 1;
367 }
368
369
370
371 package Search::Estraier::Condition;
372
373 use Carp qw/carp confess croak/;
374
375 use Search::Estraier;
376 our @ISA = qw/Search::Estraier/;
377
378 =head1 Search::Estraier::Condition
379
380 =head2 new
381
382 my $cond = new Search::HyperEstraier::Condition;
383
384 =cut
385
386 sub new {
387 my $class = shift;
388 my $self = {};
389 bless($self, $class);
390
391 $self->{max} = -1;
392 $self->{options} = 0;
393
394 $self ? return $self : return undef;
395 }
396
397
398 =head2 set_phrase
399
400 $cond->set_phrase('search phrase');
401
402 =cut
403
404 sub set_phrase {
405 my $self = shift;
406 $self->{phrase} = $self->_s( shift );
407 }
408
409
410 =head2 add_attr
411
412 $cond->add_attr('@URI STRINC /~dpavlin/');
413
414 =cut
415
416 sub add_attr {
417 my $self = shift;
418 my $attr = shift || return;
419 push @{ $self->{attrs} }, $self->_s( $attr );
420 }
421
422
423 =head2 set_order
424
425 $cond->set_order('@mdate NUMD');
426
427 =cut
428
429 sub set_order {
430 my $self = shift;
431 $self->{order} = shift;
432 }
433
434
435 =head2 set_max
436
437 $cond->set_max(42);
438
439 =cut
440
441 sub set_max {
442 my $self = shift;
443 my $max = shift;
444 croak "set_max needs number, not '$max'" unless ($max =~ m/^\d+$/);
445 $self->{max} = $max;
446 }
447
448
449 =head2 set_options
450
451 $cond->set_options( 'SURE' );
452
453 $cond->set_options( qw/AGITO NOIDF SIMPLE/ );
454
455 Possible options are:
456
457 =over 8
458
459 =item SURE
460
461 check every N-gram
462
463 =item USUAL
464
465 check every second N-gram
466
467 =item FAST
468
469 check every third N-gram
470
471 =item AGITO
472
473 check every fourth N-gram
474
475 =item NOIDF
476
477 don't perform TF-IDF tuning
478
479 =item SIMPLE
480
481 use simplified query phrase
482
483 =back
484
485 Skipping N-grams will speed up search, but reduce accuracy. Every call to C<set_options> will reset previous
486 options;
487
488 This option changed in version C<0.04> of this module. It's backwards compatibile.
489
490 =cut
491
492 my $options = {
493 SURE => 1 << 0,
494 USUAL => 1 << 1,
495 FAST => 1 << 2,
496 AGITO => 1 << 3,
497 NOIDF => 1 << 4,
498 SIMPLE => 1 << 10,
499 };
500
501 sub set_options {
502 my $self = shift;
503 my $opt = 0;
504 foreach my $option (@_) {
505 my $mask;
506 unless ($mask = $options->{$option}) {
507 if ($option eq '1') {
508 next;
509 } else {
510 croak "unknown option $option";
511 }
512 }
513 $opt += $mask;
514 }
515 $self->{options} = $opt;
516 }
517
518
519 =head2 phrase
520
521 Return search phrase.
522
523 print $cond->phrase;
524
525 =cut
526
527 sub phrase {
528 my $self = shift;
529 return $self->{phrase};
530 }
531
532
533 =head2 order
534
535 Return search result order.
536
537 print $cond->order;
538
539 =cut
540
541 sub order {
542 my $self = shift;
543 return $self->{order};
544 }
545
546
547 =head2 attrs
548
549 Return search result attrs.
550
551 my @cond_attrs = $cond->attrs;
552
553 =cut
554
555 sub attrs {
556 my $self = shift;
557 #croak "attrs return array, not scalar" if (! wantarray);
558 return @{ $self->{attrs} } if ($self->{attrs});
559 }
560
561
562 =head2 max
563
564 Return maximum number of results.
565
566 print $cond->max;
567
568 C<-1> is returned for unitialized value, C<0> is unlimited.
569
570 =cut
571
572 sub max {
573 my $self = shift;
574 return $self->{max};
575 }
576
577
578 =head2 options
579
580 Return options for this condition.
581
582 print $cond->options;
583
584 Options are returned in numerical form.
585
586 =cut
587
588 sub options {
589 my $self = shift;
590 return $self->{options};
591 }
592
593
594 package Search::Estraier::ResultDocument;
595
596 use Carp qw/croak/;
597
598 #use Search::Estraier;
599 #our @ISA = qw/Search::Estraier/;
600
601 =head1 Search::Estraier::ResultDocument
602
603 =head2 new
604
605 my $rdoc = new Search::HyperEstraier::ResultDocument(
606 uri => 'http://localhost/document/uri/42',
607 attrs => {
608 foo => 1,
609 bar => 2,
610 },
611 snippet => 'this is a text of snippet'
612 keywords => 'this\tare\tkeywords'
613 );
614
615 =cut
616
617 sub new {
618 my $class = shift;
619 my $self = {@_};
620 bless($self, $class);
621
622 croak "missing uri for ResultDocument" unless defined($self->{uri});
623
624 $self ? return $self : return undef;
625 }
626
627
628 =head2 uri
629
630 Return URI of result document
631
632 print $rdoc->uri;
633
634 =cut
635
636 sub uri {
637 my $self = shift;
638 return $self->{uri};
639 }
640
641
642 =head2 attr_names
643
644 Returns array with attribute names from result document object.
645
646 my @attrs = $rdoc->attr_names;
647
648 =cut
649
650 sub attr_names {
651 my $self = shift;
652 croak "attr_names return array, not scalar" if (! wantarray);
653 return sort keys %{ $self->{attrs} };
654 }
655
656
657 =head2 attr
658
659 Returns value of an attribute.
660
661 my $value = $rdoc->attr( 'attribute' );
662
663 =cut
664
665 sub attr {
666 my $self = shift;
667 my $name = shift || return;
668 return $self->{attrs}->{ $name };
669 }
670
671
672 =head2 snippet
673
674 Return snippet from result document
675
676 print $rdoc->snippet;
677
678 =cut
679
680 sub snippet {
681 my $self = shift;
682 return $self->{snippet};
683 }
684
685
686 =head2 keywords
687
688 Return keywords from result document
689
690 print $rdoc->keywords;
691
692 =cut
693
694 sub keywords {
695 my $self = shift;
696 return $self->{keywords};
697 }
698
699
700 package Search::Estraier::NodeResult;
701
702 use Carp qw/croak/;
703
704 #use Search::Estraier;
705 #our @ISA = qw/Search::Estraier/;
706
707 =head1 Search::Estraier::NodeResult
708
709 =head2 new
710
711 my $res = new Search::HyperEstraier::NodeResult(
712 docs => @array_of_rdocs,
713 hits => %hash_with_hints,
714 );
715
716 =cut
717
718 sub new {
719 my $class = shift;
720 my $self = {@_};
721 bless($self, $class);
722
723 foreach my $f (qw/docs hints/) {
724 croak "missing $f for ResultDocument" unless defined($self->{$f});
725 }
726
727 $self ? return $self : return undef;
728 }
729
730
731 =head2 doc_num
732
733 Return number of documents
734
735 print $res->doc_num;
736
737 =cut
738
739 sub doc_num {
740 my $self = shift;
741 return $#{$self->{docs}} + 1;
742 }
743
744
745 =head2 get_doc
746
747 Return single document
748
749 my $doc = $res->get_doc( 42 );
750
751 Returns undef if document doesn't exist.
752
753 =cut
754
755 sub get_doc {
756 my $self = shift;
757 my $num = shift;
758 croak "expect number as argument, not '$num'" unless ($num =~ m/^\d+$/);
759 return undef if ($num < 0 || $num > $self->{docs});
760 return $self->{docs}->[$num];
761 }
762
763
764 =head2 hint
765
766 Return specific hint from results.
767
768 print $rec->hint( 'VERSION' );
769
770 Possible hints are: C<VERSION>, C<NODE>, C<HIT>, C<HINT#n>, C<DOCNUM>, C<WORDNUM>,
771 C<TIME>, C<LINK#n>, C<VIEW>.
772
773 =cut
774
775 sub hint {
776 my $self = shift;
777 my $key = shift || return;
778 return $self->{hints}->{$key};
779 }
780
781 =head2 hints
782
783 More perlish version of C<hint>. This one returns hash.
784
785 my %hints = $rec->hints;
786
787 =cut
788
789 sub hints {
790 my $self = shift;
791 return $self->{hints};
792 }
793
794 package Search::Estraier::Node;
795
796 use Carp qw/carp croak confess/;
797 use URI;
798 use MIME::Base64;
799 use IO::Socket::INET;
800 use URI::Escape qw/uri_escape/;
801
802 =head1 Search::Estraier::Node
803
804 =head2 new
805
806 my $node = new Search::HyperEstraier::Node;
807
808 or optionally with C<url> as parametar
809
810 my $node = new Search::HyperEstraier::Node( 'http://localhost:1978/node/test' );
811
812 or in more verbose form
813
814 my $node = new Search::HyperEstraier::Node(
815 url => 'http://localhost:1978/node/test',
816 debug => 1,
817 croak_on_error => 1
818 );
819
820 with following arguments:
821
822 =over 4
823
824 =item url
825
826 URL to node
827
828 =item debug
829
830 dumps a B<lot> of debugging output
831
832 =item croak_on_error
833
834 very helpful during development. It will croak on all errors instead of
835 silently returning C<-1> (which is convention of Hyper Estraier API in other
836 languages).
837
838 =back
839
840 =cut
841
842 sub new {
843 my $class = shift;
844 my $self = {
845 pxport => -1,
846 timeout => 0, # this used to be -1
847 dnum => -1,
848 wnum => -1,
849 size => -1.0,
850 wwidth => 480,
851 hwidth => 96,
852 awidth => 96,
853 status => -1,
854 };
855 bless($self, $class);
856
857 if ($#_ == 0) {
858 $self->{url} = shift;
859 } else {
860 my $args = {@_};
861
862 %$self = ( %$self, @_ );
863
864 warn "## Node debug on\n" if ($self->{debug});
865 }
866
867 $self ? return $self : return undef;
868 }
869
870
871 =head2 set_url
872
873 Specify URL to node server
874
875 $node->set_url('http://localhost:1978');
876
877 =cut
878
879 sub set_url {
880 my $self = shift;
881 $self->{url} = shift;
882 }
883
884
885 =head2 set_proxy
886
887 Specify proxy server to connect to node server
888
889 $node->set_proxy('proxy.example.com', 8080);
890
891 =cut
892
893 sub set_proxy {
894 my $self = shift;
895 my ($host,$port) = @_;
896 croak "proxy port must be number, not '$port'" unless ($port =~ m/^\d+$/);
897 $self->{pxhost} = $host;
898 $self->{pxport} = $port;
899 }
900
901
902 =head2 set_timeout
903
904 Specify timeout of connection in seconds
905
906 $node->set_timeout( 15 );
907
908 =cut
909
910 sub set_timeout {
911 my $self = shift;
912 my $sec = shift;
913 croak "timeout must be number, not '$sec'" unless ($sec =~ m/^\d+$/);
914 $self->{timeout} = $sec;
915 }
916
917
918 =head2 set_auth
919
920 Specify name and password for authentication to node server.
921
922 $node->set_auth('clint','eastwood');
923
924 =cut
925
926 sub set_auth {
927 my $self = shift;
928 my ($login,$passwd) = @_;
929 my $basic_auth = encode_base64( "$login:$passwd" );
930 chomp($basic_auth);
931 $self->{auth} = $basic_auth;
932 }
933
934
935 =head2 status
936
937 Return status code of last request.
938
939 print $node->status;
940
941 C<-1> means connection failure.
942
943 =cut
944
945 sub status {
946 my $self = shift;
947 return $self->{status};
948 }
949
950
951 =head2 put_doc
952
953 Add a document
954
955 $node->put_doc( $document_draft ) or die "can't add document";
956
957 Return true on success or false on failture.
958
959 =cut
960
961 sub put_doc {
962 my $self = shift;
963 my $doc = shift || return;
964 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
965 $self->shuttle_url( $self->{url} . '/put_doc',
966 'text/x-estraier-draft',
967 $doc->dump_draft,
968 undef
969 ) == 200;
970 }
971
972
973 =head2 out_doc
974
975 Remove a document
976
977 $node->out_doc( document_id ) or "can't remove document";
978
979 Return true on success or false on failture.
980
981 =cut
982
983 sub out_doc {
984 my $self = shift;
985 my $id = shift || return;
986 return unless ($self->{url});
987 croak "id must be number, not '$id'" unless ($id =~ m/^\d+$/);
988 $self->shuttle_url( $self->{url} . '/out_doc',
989 'application/x-www-form-urlencoded',
990 "id=$id",
991 undef
992 ) == 200;
993 }
994
995
996 =head2 out_doc_by_uri
997
998 Remove a registrated document using it's uri
999
1000 $node->out_doc_by_uri( 'file:///document/uri/42' ) or "can't remove document";
1001
1002 Return true on success or false on failture.
1003
1004 =cut
1005
1006 sub out_doc_by_uri {
1007 my $self = shift;
1008 my $uri = shift || return;
1009 return unless ($self->{url});
1010 $self->shuttle_url( $self->{url} . '/out_doc',
1011 'application/x-www-form-urlencoded',
1012 "uri=" . uri_escape($uri),
1013 undef
1014 ) == 200;
1015 }
1016
1017
1018 =head2 edit_doc
1019
1020 Edit attributes of a document
1021
1022 $node->edit_doc( $document_draft ) or die "can't edit document";
1023
1024 Return true on success or false on failture.
1025
1026 =cut
1027
1028 sub edit_doc {
1029 my $self = shift;
1030 my $doc = shift || return;
1031 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
1032 $self->shuttle_url( $self->{url} . '/edit_doc',
1033 'text/x-estraier-draft',
1034 $doc->dump_draft,
1035 undef
1036 ) == 200;
1037 }
1038
1039
1040 =head2 get_doc
1041
1042 Retreive document
1043
1044 my $doc = $node->get_doc( document_id ) or die "can't get document";
1045
1046 Return true on success or false on failture.
1047
1048 =cut
1049
1050 sub get_doc {
1051 my $self = shift;
1052 my $id = shift || return;
1053 return $self->_fetch_doc( id => $id );
1054 }
1055
1056
1057 =head2 get_doc_by_uri
1058
1059 Retreive document
1060
1061 my $doc = $node->get_doc_by_uri( 'file:///document/uri/42' ) or die "can't get document";
1062
1063 Return true on success or false on failture.
1064
1065 =cut
1066
1067 sub get_doc_by_uri {
1068 my $self = shift;
1069 my $uri = shift || return;
1070 return $self->_fetch_doc( uri => $uri );
1071 }
1072
1073
1074 =head2 get_doc_attr
1075
1076 Retrieve the value of an atribute from object
1077
1078 my $val = $node->get_doc_attr( document_id, 'attribute_name' ) or
1079 die "can't get document attribute";
1080
1081 =cut
1082
1083 sub get_doc_attr {
1084 my $self = shift;
1085 my ($id,$name) = @_;
1086 return unless ($id && $name);
1087 return $self->_fetch_doc( id => $id, attr => $name );
1088 }
1089
1090
1091 =head2 get_doc_attr_by_uri
1092
1093 Retrieve the value of an atribute from object
1094
1095 my $val = $node->get_doc_attr_by_uri( document_id, 'attribute_name' ) or
1096 die "can't get document attribute";
1097
1098 =cut
1099
1100 sub get_doc_attr_by_uri {
1101 my $self = shift;
1102 my ($uri,$name) = @_;
1103 return unless ($uri && $name);
1104 return $self->_fetch_doc( uri => $uri, attr => $name );
1105 }
1106
1107
1108 =head2 etch_doc
1109
1110 Exctract document keywords
1111
1112 my $keywords = $node->etch_doc( document_id ) or die "can't etch document";
1113
1114 =cut
1115
1116 sub etch_doc {
1117 my $self = shift;
1118 my $id = shift || return;
1119 return $self->_fetch_doc( id => $id, etch => 1 );
1120 }
1121
1122 =head2 etch_doc_by_uri
1123
1124 Retreive document
1125
1126 my $keywords = $node->etch_doc_by_uri( 'file:///document/uri/42' ) or die "can't etch document";
1127
1128 Return true on success or false on failture.
1129
1130 =cut
1131
1132 sub etch_doc_by_uri {
1133 my $self = shift;
1134 my $uri = shift || return;
1135 return $self->_fetch_doc( uri => $uri, etch => 1 );
1136 }
1137
1138
1139 =head2 uri_to_id
1140
1141 Get ID of document specified by URI
1142
1143 my $id = $node->uri_to_id( 'file:///document/uri/42' );
1144
1145 =cut
1146
1147 sub uri_to_id {
1148 my $self = shift;
1149 my $uri = shift || return;
1150 return $self->_fetch_doc( uri => $uri, path => '/uri_to_id', chomp_resbody => 1 );
1151 }
1152
1153
1154 =head2 _fetch_doc
1155
1156 Private function used for implementing of C<get_doc>, C<get_doc_by_uri>,
1157 C<etch_doc>, C<etch_doc_by_uri>.
1158
1159 # this will decode received draft into Search::Estraier::Document object
1160 my $doc = $node->_fetch_doc( id => 42 );
1161 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42' );
1162
1163 # to extract keywords, add etch
1164 my $doc = $node->_fetch_doc( id => 42, etch => 1 );
1165 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', etch => 1 );
1166
1167 # to get document attrubute add attr
1168 my $doc = $node->_fetch_doc( id => 42, attr => '@mdate' );
1169 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', attr => '@mdate' );
1170
1171 # more general form which allows implementation of
1172 # uri_to_id
1173 my $id = $node->_fetch_doc(
1174 uri => 'file:///document/uri/42',
1175 path => '/uri_to_id',
1176 chomp_resbody => 1
1177 );
1178
1179 =cut
1180
1181 sub _fetch_doc {
1182 my $self = shift;
1183 my $a = {@_};
1184 return unless ( ($a->{id} || $a->{uri}) && $self->{url} );
1185
1186 my ($arg, $resbody);
1187
1188 my $path = $a->{path} || '/get_doc';
1189 $path = '/etch_doc' if ($a->{etch});
1190
1191 if ($a->{id}) {
1192 croak "id must be numberm not '$a->{id}'" unless ($a->{id} =~ m/^\d+$/);
1193 $arg = 'id=' . $a->{id};
1194 } elsif ($a->{uri}) {
1195 $arg = 'uri=' . uri_escape($a->{uri});
1196 } else {
1197 confess "unhandled argument. Need id or uri.";
1198 }
1199
1200 if ($a->{attr}) {
1201 $path = '/get_doc_attr';
1202 $arg .= '&attr=' . uri_escape($a->{attr});
1203 $a->{chomp_resbody} = 1;
1204 }
1205
1206 my $rv = $self->shuttle_url( $self->{url} . $path,
1207 'application/x-www-form-urlencoded',
1208 $arg,
1209 \$resbody,
1210 );
1211
1212 return if ($rv != 200);
1213
1214 if ($a->{etch}) {
1215 $self->{kwords} = {};
1216 return +{} unless ($resbody);
1217 foreach my $l (split(/\n/, $resbody)) {
1218 my ($k,$v) = split(/\t/, $l, 2);
1219 $self->{kwords}->{$k} = $v if ($v);
1220 }
1221 return $self->{kwords};
1222 } elsif ($a->{chomp_resbody}) {
1223 return unless (defined($resbody));
1224 chomp($resbody);
1225 return $resbody;
1226 } else {
1227 return new Search::Estraier::Document($resbody);
1228 }
1229 }
1230
1231
1232 =head2 name
1233
1234 my $node_name = $node->name;
1235
1236 =cut
1237
1238 sub name {
1239 my $self = shift;
1240 $self->_set_info unless ($self->{name});
1241 return $self->{name};
1242 }
1243
1244
1245 =head2 label
1246
1247 my $node_label = $node->label;
1248
1249 =cut
1250
1251 sub label {
1252 my $self = shift;
1253 $self->_set_info unless ($self->{label});
1254 return $self->{label};
1255 }
1256
1257
1258 =head2 doc_num
1259
1260 my $documents_in_node = $node->doc_num;
1261
1262 =cut
1263
1264 sub doc_num {
1265 my $self = shift;
1266 $self->_set_info if ($self->{dnum} < 0);
1267 return $self->{dnum};
1268 }
1269
1270
1271 =head2 word_num
1272
1273 my $words_in_node = $node->word_num;
1274
1275 =cut
1276
1277 sub word_num {
1278 my $self = shift;
1279 $self->_set_info if ($self->{wnum} < 0);
1280 return $self->{wnum};
1281 }
1282
1283
1284 =head2 size
1285
1286 my $node_size = $node->size;
1287
1288 =cut
1289
1290 sub size {
1291 my $self = shift;
1292 $self->_set_info if ($self->{size} < 0);
1293 return $self->{size};
1294 }
1295
1296
1297 =head2 search
1298
1299 Search documents which match condition
1300
1301 my $nres = $node->search( $cond, $depth );
1302
1303 C<$cond> is C<Search::Estraier::Condition> object, while <$depth> specifies
1304 depth for meta search.
1305
1306 Function results C<Search::Estraier::NodeResult> object.
1307
1308 =cut
1309
1310 sub search {
1311 my $self = shift;
1312 my ($cond, $depth) = @_;
1313 return unless ($cond && defined($depth) && $self->{url});
1314 croak "cond mush be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1315 croak "depth needs number, not '$depth'" unless ($depth =~ m/^\d+$/);
1316
1317 my $resbody;
1318
1319 my $rv = $self->shuttle_url( $self->{url} . '/search',
1320 'application/x-www-form-urlencoded',
1321 $self->cond_to_query( $cond, $depth ),
1322 \$resbody,
1323 );
1324 return if ($rv != 200);
1325
1326 my (@docs, $hints);
1327
1328 my @lines = split(/\n/, $resbody);
1329 return unless (@lines);
1330
1331 my $border = $lines[0];
1332 my $isend = 0;
1333 my $lnum = 1;
1334
1335 while ( $lnum <= $#lines ) {
1336 my $line = $lines[$lnum];
1337 $lnum++;
1338
1339 #warn "## $line\n";
1340 if ($line && $line =~ m/^\Q$border\E(:END)*$/) {
1341 $isend = $1;
1342 last;
1343 }
1344
1345 if ($line =~ /\t/) {
1346 my ($k,$v) = split(/\t/, $line, 2);
1347 $hints->{$k} = $v;
1348 }
1349 }
1350
1351 my $snum = $lnum;
1352
1353 while( ! $isend && $lnum <= $#lines ) {
1354 my $line = $lines[$lnum];
1355 #warn "# $lnum: $line\n";
1356 $lnum++;
1357
1358 if ($line && $line =~ m/^\Q$border\E/) {
1359 if ($lnum > $snum) {
1360 my $rdattrs;
1361 my $rdvector;
1362 my $rdsnippet;
1363
1364 my $rlnum = $snum;
1365 while ($rlnum < $lnum - 1 ) {
1366 #my $rdline = $self->_s($lines[$rlnum]);
1367 my $rdline = $lines[$rlnum];
1368 $rlnum++;
1369 last unless ($rdline);
1370 if ($rdline =~ /^%/) {
1371 $rdvector = $1 if ($rdline =~ /^%VECTOR\t(.+)$/);
1372 } elsif($rdline =~ /=/) {
1373 $rdattrs->{$1} = $2 if ($rdline =~ /^(.+)=(.+)$/);
1374 } else {
1375 confess "invalid format of response";
1376 }
1377 }
1378 while($rlnum < $lnum - 1) {
1379 my $rdline = $lines[$rlnum];
1380 $rlnum++;
1381 $rdsnippet .= "$rdline\n";
1382 }
1383 #warn Dumper($rdvector, $rdattrs, $rdsnippet);
1384 if (my $rduri = $rdattrs->{'@uri'}) {
1385 push @docs, new Search::Estraier::ResultDocument(
1386 uri => $rduri,
1387 attrs => $rdattrs,
1388 snippet => $rdsnippet,
1389 keywords => $rdvector,
1390 );
1391 }
1392 }
1393 $snum = $lnum;
1394 #warn "### $line\n";
1395 $isend = 1 if ($line =~ /:END$/);
1396 }
1397
1398 }
1399
1400 if (! $isend) {
1401 warn "received result doesn't have :END\n$resbody";
1402 return;
1403 }
1404
1405 #warn Dumper(\@docs, $hints);
1406
1407 return new Search::Estraier::NodeResult( docs => \@docs, hints => $hints );
1408 }
1409
1410
1411 =head2 cond_to_query
1412
1413 Return URI encoded string generated from Search::Estraier::Condition
1414
1415 my $args = $node->cond_to_query( $cond, $depth );
1416
1417 =cut
1418
1419 sub cond_to_query {
1420 my $self = shift;
1421
1422 my $cond = shift || return;
1423 croak "condition must be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1424 my $depth = shift;
1425
1426 my @args;
1427
1428 if (my $phrase = $cond->phrase) {
1429 push @args, 'phrase=' . uri_escape($phrase);
1430 }
1431
1432 if (my @attrs = $cond->attrs) {
1433 for my $i ( 0 .. $#attrs ) {
1434 push @args,'attr' . ($i+1) . '=' . uri_escape( $attrs[$i] ) if ($attrs[$i]);
1435 }
1436 }
1437
1438 if (my $order = $cond->order) {
1439 push @args, 'order=' . uri_escape($order);
1440 }
1441
1442 if (my $max = $cond->max) {
1443 push @args, 'max=' . $max;
1444 } else {
1445 push @args, 'max=' . (1 << 30);
1446 }
1447
1448 if (my $options = $cond->options) {
1449 push @args, 'options=' . $options;
1450 }
1451
1452 push @args, 'depth=' . $depth if ($depth);
1453 push @args, 'wwidth=' . $self->{wwidth};
1454 push @args, 'hwidth=' . $self->{hwidth};
1455 push @args, 'awidth=' . $self->{awidth};
1456
1457 return join('&', @args);
1458 }
1459
1460
1461 =head2 shuttle_url
1462
1463 This is method which uses C<LWP::UserAgent> to communicate with Hyper Estraier node
1464 master.
1465
1466 my $rv = shuttle_url( $url, $content_type, $req_body, \$resbody );
1467
1468 C<$resheads> and C<$resbody> booleans controll if response headers and/or response
1469 body will be saved within object.
1470
1471 =cut
1472
1473 use LWP::UserAgent;
1474
1475 sub shuttle_url {
1476 my $self = shift;
1477
1478 my ($url, $content_type, $reqbody, $resbody) = @_;
1479
1480 $self->{status} = -1;
1481
1482 warn "## $url\n" if ($self->{debug});
1483
1484 $url = new URI($url);
1485 if (
1486 !$url || !$url->scheme || !$url->scheme eq 'http' ||
1487 !$url->host || !$url->port || $url->port < 1
1488 ) {
1489 carp "can't parse $url\n";
1490 return -1;
1491 }
1492
1493 my $ua = LWP::UserAgent->new;
1494 $ua->agent( "Search-Estraier/$Search::Estraier::VERSION" );
1495
1496 my $req;
1497 if ($reqbody) {
1498 $req = HTTP::Request->new(POST => $url);
1499 } else {
1500 $req = HTTP::Request->new(GET => $url);
1501 }
1502
1503 $req->headers->header( 'Host' => $url->host . ":" . $url->port );
1504 $req->headers->header( 'Connection', 'close' );
1505 $req->headers->header( 'Authorization', 'Basic ' . $self->{auth} ) if ($self->{auth});
1506 $req->content_type( $content_type );
1507
1508 warn $req->headers->as_string,"\n" if ($self->{debug});
1509
1510 if ($reqbody) {
1511 warn "$reqbody\n" if ($self->{debug});
1512 $req->content( $reqbody );
1513 }
1514
1515 my $res = $ua->request($req) || croak "can't make request to $url: $!";
1516
1517 warn "## response status: ",$res->status_line,"\n" if ($self->{debug});
1518
1519 ($self->{status}, $self->{status_message}) = split(/\s+/, $res->status_line, 2);
1520
1521 if (! $res->is_success) {
1522 if ($self->{croak_on_error}) {
1523 croak("can't get $url: ",$res->status_line);
1524 } else {
1525 return -1;
1526 }
1527 }
1528
1529 $$resbody .= $res->content;
1530
1531 warn "## response body:\n$$resbody\n" if ($resbody && $self->{debug});
1532
1533 return $self->{status};
1534 }
1535
1536
1537 =head2 set_snippet_width
1538
1539 Set width of snippets in results
1540
1541 $node->set_snippet_width( $wwidth, $hwidth, $awidth );
1542
1543 C<$wwidth> specifies whole width of snippet. It's C<480> by default. If it's C<0> snippet
1544 is not sent with results. If it is negative, whole document text is sent instead of snippet.
1545
1546 C<$hwidth> specified width of strings from beginning of string. Default
1547 value is C<96>. Negative or zero value keep previous value.
1548
1549 C<$awidth> specifies width of strings around each highlighted word. It's C<96> by default.
1550 If negative of zero value is provided previous value is kept unchanged.
1551
1552 =cut
1553
1554 sub set_snippet_width {
1555 my $self = shift;
1556
1557 my ($wwidth, $hwidth, $awidth) = @_;
1558 $self->{wwidth} = $wwidth;
1559 $self->{hwidth} = $hwidth if ($hwidth >= 0);
1560 $self->{awidth} = $awidth if ($awidth >= 0);
1561 }
1562
1563
1564 =head2 set_user
1565
1566 Manage users of node
1567
1568 $node->set_user( 'name', $mode );
1569
1570 C<$mode> can be one of:
1571
1572 =over 4
1573
1574 =item 0
1575
1576 delete account
1577
1578 =item 1
1579
1580 set administrative right for user
1581
1582 =item 2
1583
1584 set user account as guest
1585
1586 =back
1587
1588 Return true on success, otherwise false.
1589
1590 =cut
1591
1592 sub set_user {
1593 my $self = shift;
1594 my ($name, $mode) = @_;
1595
1596 return unless ($self->{url});
1597 croak "mode must be number, not '$mode'" unless ($mode =~ m/^\d+$/);
1598
1599 $self->shuttle_url( $self->{url} . '/_set_user',
1600 'text/plain',
1601 'name=' . uri_escape($name) . '&mode=' . $mode,
1602 undef
1603 ) == 200;
1604 }
1605
1606
1607 =head2 set_link
1608
1609 Manage node links
1610
1611 $node->set_link('http://localhost:1978/node/another', 'another node label', $credit);
1612
1613 If C<$credit> is negative, link is removed.
1614
1615 =cut
1616
1617 sub set_link {
1618 my $self = shift;
1619 my ($url, $label, $credit) = @_;
1620
1621 return unless ($self->{url});
1622 croak "mode credit be number, not '$credit'" unless ($credit =~ m/^\d+$/);
1623
1624 my $reqbody = 'url=' . uri_escape($url) . '&label=' . uri_escape($label);
1625 $reqbody .= '&credit=' . $credit if ($credit > 0);
1626
1627 $self->shuttle_url( $self->{url} . '/_set_link',
1628 'application/x-www-form-urlencoded',
1629 $reqbody,
1630 undef
1631 ) == 200;
1632 }
1633
1634
1635 =head1 PRIVATE METHODS
1636
1637 You could call those directly, but you don't have to. I hope.
1638
1639 =head2 _set_info
1640
1641 Set information for node
1642
1643 $node->_set_info;
1644
1645 =cut
1646
1647 sub _set_info {
1648 my $self = shift;
1649
1650 $self->{status} = -1;
1651 return unless ($self->{url});
1652
1653 my $resbody;
1654 my $rv = $self->shuttle_url( $self->{url} . '/inform',
1655 'text/plain',
1656 undef,
1657 \$resbody,
1658 );
1659
1660 return if ($rv != 200 || !$resbody);
1661
1662 # it seems that response can have multiple line endings
1663 $resbody =~ s/[\r\n]+$//;
1664
1665 ( $self->{name}, $self->{label}, $self->{dnum}, $self->{wnum}, $self->{size} ) =
1666 split(/\t/, $resbody, 5);
1667
1668 }
1669
1670 ###
1671
1672 =head1 EXPORT
1673
1674 Nothing.
1675
1676 =head1 SEE ALSO
1677
1678 L<http://hyperestraier.sourceforge.net/>
1679
1680 Hyper Estraier Ruby interface on which this module is based.
1681
1682 =head1 AUTHOR
1683
1684 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
1685
1686
1687 =head1 COPYRIGHT AND LICENSE
1688
1689 Copyright (C) 2005-2006 by Dobrica Pavlinusic
1690
1691 This library is free software; you can redistribute it and/or modify
1692 it under the GPL v2 or later.
1693
1694 =cut
1695
1696 1;

  ViewVC Help
Powered by ViewVC 1.1.26