/[Search-Estraier]/trunk/Estraier.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/Estraier.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 60 - (show annotations)
Sat Jan 7 00:00:15 2006 UTC (18 years, 2 months ago) by dpavlin
File size: 27250 byte(s)
added few checks to better handle empty documents, array return is not enforced any more.
1 package Search::Estraier;
2
3 use 5.008;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.01';
8
9 =head1 NAME
10
11 Search::Estraier - pure perl module to use Hyper Estraier search engine
12
13 =head1 SYNOPSIS
14
15 use Search::Estraier;
16 my $est = new Search::Estraier();
17
18 =head1 DESCRIPTION
19
20 This module is implementation of node API of Hyper Estraier. Since it's
21 perl-only module with dependencies only on standard perl modules, it will
22 run on all platforms on which perl runs. It doesn't require compilation
23 or Hyper Estraier development files on target machine.
24
25 It is implemented as multiple packages which closly resamble Ruby
26 implementation. It also includes methods to manage nodes.
27
28 =cut
29
30 =head1 Inheritable common methods
31
32 This methods should really move somewhere else.
33
34 =head2 _s
35
36 Remove multiple whitespaces from string, as well as whitespaces at beginning or end
37
38 my $text = $self->_s(" this is a text ");
39 $text = 'this is a text';
40
41 =cut
42
43 sub _s {
44 my $text = $_[1] || return;
45 $text =~ s/\s\s+/ /gs;
46 $text =~ s/^\s+//;
47 $text =~ s/\s+$//;
48 return $text;
49 }
50
51 package Search::Estraier::Document;
52
53 use Carp qw/croak confess/;
54
55 use Search::Estraier;
56 our @ISA = qw/Search::Estraier/;
57
58 =head1 Search::Estraier::Document
59
60 This class implements Document which is collection of attributes
61 (key=value), vectors (also key value) display text and hidden text.
62
63
64 =head2 new
65
66 Create new document, empty or from draft.
67
68 my $doc = new Search::HyperEstraier::Document;
69 my $doc2 = new Search::HyperEstraier::Document( $draft );
70
71 =cut
72
73 sub new {
74 my $class = shift;
75 my $self = {};
76 bless($self, $class);
77
78 $self->{id} = -1;
79
80 my $draft = shift;
81
82 if ($draft) {
83 my $in_text = 0;
84 foreach my $line (split(/\n/, $draft)) {
85
86 if ($in_text) {
87 if ($line =~ /^\t/) {
88 push @{ $self->{htexts} }, substr($line, 1);
89 } else {
90 push @{ $self->{dtexts} }, $line;
91 }
92 next;
93 }
94
95 if ($line =~ m/^%VECTOR\t(.+)$/) {
96 my @fields = split(/\t/, $1);
97 for my $i ( 0 .. ($#fields - 1) ) {
98 $self->{kwords}->{ $fields[ $i ] } = $fields[ $i + 1 ];
99 $i++;
100 }
101 next;
102 } elsif ($line =~ m/^%/) {
103 # What is this? comment?
104 #warn "$line\n";
105 next;
106 } elsif ($line =~ m/^$/) {
107 $in_text = 1;
108 next;
109 } elsif ($line =~ m/^(.+)=(.+)$/) {
110 $self->{attrs}->{ $1 } = $2;
111 next;
112 }
113
114 warn "draft ignored: $line\n";
115 }
116 }
117
118 $self ? return $self : return undef;
119 }
120
121
122 =head2 add_attr
123
124 Add an attribute.
125
126 $doc->add_attr( name => 'value' );
127
128 Delete attribute using
129
130 $doc->add_attr( name => undef );
131
132 =cut
133
134 sub add_attr {
135 my $self = shift;
136 my $attrs = {@_};
137
138 while (my ($name, $value) = each %{ $attrs }) {
139 if (! defined($value)) {
140 delete( $self->{attrs}->{ $self->_s($name) } );
141 } else {
142 $self->{attrs}->{ $self->_s($name) } = $self->_s($value);
143 }
144 }
145
146 return 1;
147 }
148
149
150 =head2 add_text
151
152 Add a sentence of text.
153
154 $doc->add_text('this is example text to display');
155
156 =cut
157
158 sub add_text {
159 my $self = shift;
160 my $text = shift;
161 return unless defined($text);
162
163 push @{ $self->{dtexts} }, $self->_s($text);
164 }
165
166
167 =head2 add_hidden_text
168
169 Add a hidden sentence.
170
171 $doc->add_hidden_text('this is example text just for search');
172
173 =cut
174
175 sub add_hidden_text {
176 my $self = shift;
177 my $text = shift;
178 return unless defined($text);
179
180 push @{ $self->{htexts} }, $self->_s($text);
181 }
182
183
184 =head2 id
185
186 Get the ID number of document. If the object has never been registred, C<-1> is returned.
187
188 print $doc->id;
189
190 =cut
191
192 sub id {
193 my $self = shift;
194 return $self->{id};
195 }
196
197
198 =head2 attr_names
199
200 Returns array with attribute names from document object.
201
202 my @attrs = $doc->attr_names;
203
204 =cut
205
206 sub attr_names {
207 my $self = shift;
208 return unless ($self->{attrs});
209 #croak "attr_names return array, not scalar" if (! wantarray);
210 return sort keys %{ $self->{attrs} };
211 }
212
213
214 =head2 attr
215
216 Returns value of an attribute.
217
218 my $value = $doc->attr( 'attribute' );
219
220 =cut
221
222 sub attr {
223 my $self = shift;
224 my $name = shift;
225 return unless (defined($name) && $self->{attrs});
226 return $self->{attrs}->{ $name };
227 }
228
229
230 =head2 texts
231
232 Returns array with text sentences.
233
234 my @texts = $doc->texts;
235
236 =cut
237
238 sub texts {
239 my $self = shift;
240 #confess "texts return array, not scalar" if (! wantarray);
241 return @{ $self->{dtexts} } if ($self->{dtexts});
242 }
243
244
245 =head2 cat_texts
246
247 Return whole text as single scalar.
248
249 my $text = $doc->cat_texts;
250
251 =cut
252
253 sub cat_texts {
254 my $self = shift;
255 return join(' ',@{ $self->{dtexts} }) if ($self->{dtexts});
256 }
257
258
259 =head2 dump_draft
260
261 Dump draft data from document object.
262
263 print $doc->dump_draft;
264
265 =cut
266
267 sub dump_draft {
268 my $self = shift;
269 my $draft;
270
271 foreach my $attr_name (sort keys %{ $self->{attrs} }) {
272 $draft .= $attr_name . '=' . $self->{attrs}->{$attr_name} . "\n";
273 }
274
275 if ($self->{kwords}) {
276 $draft .= '%%VECTOR';
277 while (my ($key, $value) = each %{ $self->{kwords} }) {
278 $draft .= "\t$key\t$value";
279 }
280 $draft .= "\n";
281 }
282
283 $draft .= "\n";
284
285 $draft .= join("\n", @{ $self->{dtexts} }) . "\n" if ($self->{dtexts});
286 $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n" if ($self->{htexts});
287
288 return $draft;
289 }
290
291
292 =head2 delete
293
294 Empty document object
295
296 $doc->delete;
297
298 This function is addition to original Ruby API, and since it was included in C wrappers it's here as a
299 convinience. Document objects which go out of scope will be destroyed
300 automatically.
301
302 =cut
303
304 sub delete {
305 my $self = shift;
306
307 foreach my $data (qw/attrs dtexts stexts kwords/) {
308 delete($self->{$data});
309 }
310
311 $self->{id} = -1;
312
313 return 1;
314 }
315
316
317
318 package Search::Estraier::Condition;
319
320 use Carp qw/confess croak/;
321
322 use Search::Estraier;
323 our @ISA = qw/Search::Estraier/;
324
325 =head1 Search::Estraier::Condition
326
327 =head2 new
328
329 my $cond = new Search::HyperEstraier::Condition;
330
331 =cut
332
333 sub new {
334 my $class = shift;
335 my $self = {};
336 bless($self, $class);
337
338 $self->{max} = -1;
339 $self->{options} = 0;
340
341 $self ? return $self : return undef;
342 }
343
344
345 =head2 set_phrase
346
347 $cond->set_phrase('search phrase');
348
349 =cut
350
351 sub set_phrase {
352 my $self = shift;
353 $self->{phrase} = $self->_s( shift );
354 }
355
356
357 =head2 add_attr
358
359 $cond->add_attr('@URI STRINC /~dpavlin/');
360
361 =cut
362
363 sub add_attr {
364 my $self = shift;
365 my $attr = shift || return;
366 push @{ $self->{attrs} }, $self->_s( $attr );
367 }
368
369
370 =head2 set_order
371
372 $cond->set_order('@mdate NUMD');
373
374 =cut
375
376 sub set_order {
377 my $self = shift;
378 $self->{order} = shift;
379 }
380
381
382 =head2 set_max
383
384 $cond->set_max(42);
385
386 =cut
387
388 sub set_max {
389 my $self = shift;
390 my $max = shift;
391 croak "set_max needs number, not '$max'" unless ($max =~ m/^\d+$/);
392 $self->{max} = $max;
393 }
394
395
396 =head2 set_options
397
398 $cond->set_options( SURE => 1 );
399
400 =cut
401
402 my $options = {
403 # check N-gram keys skipping by three
404 SURE => 1 << 0,
405 # check N-gram keys skipping by two
406 USUAL => 1 << 1,
407 # without TF-IDF tuning
408 FAST => 1 << 2,
409 # with the simplified phrase
410 AGITO => 1 << 3,
411 # check every N-gram key
412 NOIDF => 1 << 4,
413 # check N-gram keys skipping by one
414 SIMPLE => 1 << 10,
415 };
416
417 sub set_options {
418 my $self = shift;
419 my $option = shift;
420 confess "unknown option" unless ($options->{$option});
421 $self->{options} ||= $options->{$option};
422 }
423
424
425 =head2 phrase
426
427 Return search phrase.
428
429 print $cond->phrase;
430
431 =cut
432
433 sub phrase {
434 my $self = shift;
435 return $self->{phrase};
436 }
437
438
439 =head2 order
440
441 Return search result order.
442
443 print $cond->order;
444
445 =cut
446
447 sub order {
448 my $self = shift;
449 return $self->{order};
450 }
451
452
453 =head2 attrs
454
455 Return search result attrs.
456
457 my @cond_attrs = $cond->attrs;
458
459 =cut
460
461 sub attrs {
462 my $self = shift;
463 #croak "attrs return array, not scalar" if (! wantarray);
464 return @{ $self->{attrs} } if ($self->{attrs});
465 }
466
467
468 =head2 max
469
470 Return maximum number of results.
471
472 print $cond->max;
473
474 C<-1> is returned for unitialized value, C<0> is unlimited.
475
476 =cut
477
478 sub max {
479 my $self = shift;
480 return $self->{max};
481 }
482
483
484 =head2 options
485
486 Return options for this condition.
487
488 print $cond->options;
489
490 Options are returned in numerical form.
491
492 =cut
493
494 sub options {
495 my $self = shift;
496 return $self->{options};
497 }
498
499
500 package Search::Estraier::ResultDocument;
501
502 use Carp qw/croak/;
503
504 #use Search::Estraier;
505 #our @ISA = qw/Search::Estraier/;
506
507 =head1 Search::Estraier::ResultDocument
508
509 =head2 new
510
511 my $rdoc = new Search::HyperEstraier::ResultDocument(
512 uri => 'http://localhost/document/uri/42',
513 attrs => {
514 foo => 1,
515 bar => 2,
516 },
517 snippet => 'this is a text of snippet'
518 keywords => 'this\tare\tkeywords'
519 );
520
521 =cut
522
523 sub new {
524 my $class = shift;
525 my $self = {@_};
526 bless($self, $class);
527
528 foreach my $f (qw/uri attrs snippet keywords/) {
529 croak "missing $f for ResultDocument" unless defined($self->{$f});
530 }
531
532 $self ? return $self : return undef;
533 }
534
535
536 =head2 uri
537
538 Return URI of result document
539
540 print $rdoc->uri;
541
542 =cut
543
544 sub uri {
545 my $self = shift;
546 return $self->{uri};
547 }
548
549
550 =head2 attr_names
551
552 Returns array with attribute names from result document object.
553
554 my @attrs = $rdoc->attr_names;
555
556 =cut
557
558 sub attr_names {
559 my $self = shift;
560 croak "attr_names return array, not scalar" if (! wantarray);
561 return sort keys %{ $self->{attrs} };
562 }
563
564
565 =head2 attr
566
567 Returns value of an attribute.
568
569 my $value = $rdoc->attr( 'attribute' );
570
571 =cut
572
573 sub attr {
574 my $self = shift;
575 my $name = shift || return;
576 return $self->{attrs}->{ $name };
577 }
578
579
580 =head2 snippet
581
582 Return snippet from result document
583
584 print $rdoc->snippet;
585
586 =cut
587
588 sub snippet {
589 my $self = shift;
590 return $self->{snippet};
591 }
592
593
594 =head2 keywords
595
596 Return keywords from result document
597
598 print $rdoc->keywords;
599
600 =cut
601
602 sub keywords {
603 my $self = shift;
604 return $self->{keywords};
605 }
606
607
608 package Search::Estraier::NodeResult;
609
610 use Carp qw/croak/;
611
612 #use Search::Estraier;
613 #our @ISA = qw/Search::Estraier/;
614
615 =head1 Search::Estraier::NodeResult
616
617 =head2 new
618
619 my $res = new Search::HyperEstraier::NodeResult(
620 docs => @array_of_rdocs,
621 hits => %hash_with_hints,
622 );
623
624 =cut
625
626 sub new {
627 my $class = shift;
628 my $self = {@_};
629 bless($self, $class);
630
631 foreach my $f (qw/docs hints/) {
632 croak "missing $f for ResultDocument" unless defined($self->{$f});
633 }
634
635 $self ? return $self : return undef;
636 }
637
638
639 =head2 doc_num
640
641 Return number of documents
642
643 print $res->doc_num;
644
645 =cut
646
647 sub doc_num {
648 my $self = shift;
649 return $#{$self->{docs}} + 1;
650 }
651
652
653 =head2 get_doc
654
655 Return single document
656
657 my $doc = $res->get_doc( 42 );
658
659 Returns undef if document doesn't exist.
660
661 =cut
662
663 sub get_doc {
664 my $self = shift;
665 my $num = shift;
666 croak "expect number as argument, not '$num'" unless ($num =~ m/^\d+$/);
667 return undef if ($num < 0 || $num > $self->{docs});
668 return $self->{docs}->[$num];
669 }
670
671
672 =head2 hint
673
674 Return specific hint from results.
675
676 print $rec->hint( 'VERSION' );
677
678 Possible hints are: C<VERSION>, C<NODE>, C<HIT>, C<HINT#n>, C<DOCNUM>, C<WORDNUM>,
679 C<TIME>, C<LINK#n>, C<VIEW>.
680
681 =cut
682
683 sub hint {
684 my $self = shift;
685 my $key = shift || return;
686 return $self->{hints}->{$key};
687 }
688
689
690 package Search::Estraier::Node;
691
692 use Carp qw/carp croak confess/;
693 use URI;
694 use MIME::Base64;
695 use IO::Socket::INET;
696 use URI::Escape qw/uri_escape/;
697
698 =head1 Search::Estraier::Node
699
700 =head2 new
701
702 my $node = new Search::HyperEstraier::Node;
703
704 =cut
705
706 sub new {
707 my $class = shift;
708 my $self = {
709 pxport => -1,
710 timeout => 0, # this used to be -1
711 dnum => -1,
712 wnum => -1,
713 size => -1.0,
714 wwidth => 480,
715 hwidth => 96,
716 awidth => 96,
717 status => -1,
718 };
719 bless($self, $class);
720
721 my $args = {@_};
722
723 $self->{debug} = $args->{debug};
724 warn "## Node debug on\n" if ($self->{debug});
725
726 $self ? return $self : return undef;
727 }
728
729
730 =head2 set_url
731
732 Specify URL to node server
733
734 $node->set_url('http://localhost:1978');
735
736 =cut
737
738 sub set_url {
739 my $self = shift;
740 $self->{url} = shift;
741 }
742
743
744 =head2 set_proxy
745
746 Specify proxy server to connect to node server
747
748 $node->set_proxy('proxy.example.com', 8080);
749
750 =cut
751
752 sub set_proxy {
753 my $self = shift;
754 my ($host,$port) = @_;
755 croak "proxy port must be number, not '$port'" unless ($port =~ m/^\d+$/);
756 $self->{pxhost} = $host;
757 $self->{pxport} = $port;
758 }
759
760
761 =head2 set_timeout
762
763 Specify timeout of connection in seconds
764
765 $node->set_timeout( 15 );
766
767 =cut
768
769 sub set_timeout {
770 my $self = shift;
771 my $sec = shift;
772 croak "timeout must be number, not '$sec'" unless ($sec =~ m/^\d+$/);
773 $self->{timeout} = $sec;
774 }
775
776
777 =head2 set_auth
778
779 Specify name and password for authentication to node server.
780
781 $node->set_auth('clint','eastwood');
782
783 =cut
784
785 sub set_auth {
786 my $self = shift;
787 my ($login,$passwd) = @_;
788 my $basic_auth = encode_base64( "$login:$passwd" );
789 chomp($basic_auth);
790 $self->{auth} = $basic_auth;
791 }
792
793
794 =head2 status
795
796 Return status code of last request.
797
798 print $node->status;
799
800 C<-1> means connection failure.
801
802 =cut
803
804 sub status {
805 my $self = shift;
806 return $self->{status};
807 }
808
809
810 =head2 put_doc
811
812 Add a document
813
814 $node->put_doc( $document_draft ) or die "can't add document";
815
816 Return true on success or false on failture.
817
818 =cut
819
820 sub put_doc {
821 my $self = shift;
822 my $doc = shift || return;
823 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
824 $self->shuttle_url( $self->{url} . '/put_doc',
825 'text/x-estraier-draft',
826 $doc->dump_draft,
827 undef
828 ) == 200;
829 }
830
831
832 =head2 out_doc
833
834 Remove a document
835
836 $node->out_doc( document_id ) or "can't remove document";
837
838 Return true on success or false on failture.
839
840 =cut
841
842 sub out_doc {
843 my $self = shift;
844 my $id = shift || return;
845 return unless ($self->{url});
846 croak "id must be number, not '$id'" unless ($id =~ m/^\d+$/);
847 $self->shuttle_url( $self->{url} . '/out_doc',
848 'application/x-www-form-urlencoded',
849 "id=$id",
850 undef
851 ) == 200;
852 }
853
854
855 =head2 out_doc_by_uri
856
857 Remove a registrated document using it's uri
858
859 $node->out_doc_by_uri( 'file:///document/uri/42' ) or "can't remove document";
860
861 Return true on success or false on failture.
862
863 =cut
864
865 sub out_doc_by_uri {
866 my $self = shift;
867 my $uri = shift || return;
868 return unless ($self->{url});
869 $self->shuttle_url( $self->{url} . '/out_doc',
870 'application/x-www-form-urlencoded',
871 "uri=" . uri_escape($uri),
872 undef
873 ) == 200;
874 }
875
876
877 =head2 edit_doc
878
879 Edit attributes of a document
880
881 $node->edit_doc( $document_draft ) or die "can't edit document";
882
883 Return true on success or false on failture.
884
885 =cut
886
887 sub edit_doc {
888 my $self = shift;
889 my $doc = shift || return;
890 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
891 $self->shuttle_url( $self->{url} . '/edit_doc',
892 'text/x-estraier-draft',
893 $doc->dump_draft,
894 undef
895 ) == 200;
896 }
897
898
899 =head2 get_doc
900
901 Retreive document
902
903 my $doc = $node->get_doc( document_id ) or die "can't get document";
904
905 Return true on success or false on failture.
906
907 =cut
908
909 sub get_doc {
910 my $self = shift;
911 my $id = shift || return;
912 return $self->_fetch_doc( id => $id );
913 }
914
915
916 =head2 get_doc_by_uri
917
918 Retreive document
919
920 my $doc = $node->get_doc_by_uri( 'file:///document/uri/42' ) or die "can't get document";
921
922 Return true on success or false on failture.
923
924 =cut
925
926 sub get_doc_by_uri {
927 my $self = shift;
928 my $uri = shift || return;
929 return $self->_fetch_doc( uri => $uri );
930 }
931
932
933 =head2 get_doc_attr
934
935 Retrieve the value of an atribute from object
936
937 my $val = $node->get_doc_attr( document_id, 'attribute_name' ) or
938 die "can't get document attribute";
939
940 =cut
941
942 sub get_doc_attr {
943 my $self = shift;
944 my ($id,$name) = @_;
945 return unless ($id && $name);
946 return $self->_fetch_doc( id => $id, attr => $name );
947 }
948
949
950 =head2 get_doc_attr_by_uri
951
952 Retrieve the value of an atribute from object
953
954 my $val = $node->get_doc_attr_by_uri( document_id, 'attribute_name' ) or
955 die "can't get document attribute";
956
957 =cut
958
959 sub get_doc_attr_by_uri {
960 my $self = shift;
961 my ($uri,$name) = @_;
962 return unless ($uri && $name);
963 return $self->_fetch_doc( uri => $uri, attr => $name );
964 }
965
966
967 =head2 etch_doc
968
969 Exctract document keywords
970
971 my $keywords = $node->etch_doc( document_id ) or die "can't etch document";
972
973 =cut
974
975 sub etch_doc {
976 my $self = shift;
977 my $id = shift || return;
978 return $self->_fetch_doc( id => $id, etch => 1 );
979 }
980
981 =head2 etch_doc_by_uri
982
983 Retreive document
984
985 my $keywords = $node->etch_doc_by_uri( 'file:///document/uri/42' ) or die "can't etch document";
986
987 Return true on success or false on failture.
988
989 =cut
990
991 sub etch_doc_by_uri {
992 my $self = shift;
993 my $uri = shift || return;
994 return $self->_fetch_doc( uri => $uri, etch => 1 );
995 }
996
997
998 =head2 uri_to_id
999
1000 Get ID of document specified by URI
1001
1002 my $id = $node->uri_to_id( 'file:///document/uri/42' );
1003
1004 =cut
1005
1006 sub uri_to_id {
1007 my $self = shift;
1008 my $uri = shift || return;
1009 return $self->_fetch_doc( uri => $uri, path => '/uri_to_id', chomp_resbody => 1 );
1010 }
1011
1012
1013 =head2 _fetch_doc
1014
1015 Private function used for implementing of C<get_doc>, C<get_doc_by_uri>,
1016 C<etch_doc>, C<etch_doc_by_uri>.
1017
1018 # this will decode received draft into Search::Estraier::Document object
1019 my $doc = $node->_fetch_doc( id => 42 );
1020 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42' );
1021
1022 # to extract keywords, add etch
1023 my $doc = $node->_fetch_doc( id => 42, etch => 1 );
1024 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', etch => 1 );
1025
1026 # to get document attrubute add attr
1027 my $doc = $node->_fetch_doc( id => 42, attr => '@mdate' );
1028 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', attr => '@mdate' );
1029
1030 # more general form which allows implementation of
1031 # uri_to_id
1032 my $id = $node->_fetch_doc(
1033 uri => 'file:///document/uri/42',
1034 path => '/uri_to_id',
1035 chomp_resbody => 1
1036 );
1037
1038 =cut
1039
1040 sub _fetch_doc {
1041 my $self = shift;
1042 my $a = {@_};
1043 return unless ( ($a->{id} || $a->{uri}) && $self->{url} );
1044
1045 my ($arg, $resbody);
1046
1047 my $path = $a->{path} || '/get_doc';
1048 $path = '/etch_doc' if ($a->{etch});
1049
1050 if ($a->{id}) {
1051 croak "id must be numberm not '$a->{id}'" unless ($a->{id} =~ m/^\d+$/);
1052 $arg = 'id=' . $a->{id};
1053 } elsif ($a->{uri}) {
1054 $arg = 'uri=' . uri_escape($a->{uri});
1055 } else {
1056 confess "unhandled argument. Need id or uri.";
1057 }
1058
1059 if ($a->{attr}) {
1060 $path = '/get_doc_attr';
1061 $arg .= '&attr=' . uri_escape($a->{attr});
1062 $a->{chomp_resbody} = 1;
1063 }
1064
1065 my $rv = $self->shuttle_url( $self->{url} . $path,
1066 'application/x-www-form-urlencoded',
1067 $arg,
1068 \$resbody,
1069 );
1070
1071 return if ($rv != 200);
1072
1073 if ($a->{etch}) {
1074 $self->{kwords} = {};
1075 return +{} unless ($resbody);
1076 foreach my $l (split(/\n/, $resbody)) {
1077 my ($k,$v) = split(/\t/, $l, 2);
1078 $self->{kwords}->{$k} = $v if ($v);
1079 }
1080 return $self->{kwords};
1081 } elsif ($a->{chomp_resbody}) {
1082 return unless (defined($resbody));
1083 chomp($resbody);
1084 return $resbody;
1085 } else {
1086 return new Search::Estraier::Document($resbody);
1087 }
1088 }
1089
1090
1091 =head2 name
1092
1093 my $node_name = $node->name;
1094
1095 =cut
1096
1097 sub name {
1098 my $self = shift;
1099 $self->_set_info unless ($self->{name});
1100 return $self->{name};
1101 }
1102
1103
1104 =head2 label
1105
1106 my $node_label = $node->label;
1107
1108 =cut
1109
1110 sub label {
1111 my $self = shift;
1112 $self->_set_info unless ($self->{label});
1113 return $self->{label};
1114 }
1115
1116
1117 =head2 doc_num
1118
1119 my $documents_in_node = $node->doc_num;
1120
1121 =cut
1122
1123 sub doc_num {
1124 my $self = shift;
1125 $self->_set_info if ($self->{dnum} < 0);
1126 return $self->{dnum};
1127 }
1128
1129
1130 =head2 word_num
1131
1132 my $words_in_node = $node->word_num;
1133
1134 =cut
1135
1136 sub word_num {
1137 my $self = shift;
1138 $self->_set_info if ($self->{wnum} < 0);
1139 return $self->{wnum};
1140 }
1141
1142
1143 =head2 size
1144
1145 my $node_size = $node->size;
1146
1147 =cut
1148
1149 sub size {
1150 my $self = shift;
1151 $self->_set_info if ($self->{size} < 0);
1152 return $self->{size};
1153 }
1154
1155
1156 =head2 search
1157
1158 Search documents which match condition
1159
1160 my $nres = $node->search( $cond, $depth );
1161
1162 C<$cond> is C<Search::Estraier::Condition> object, while <$depth> specifies
1163 depth for meta search.
1164
1165 Function results C<Search::Estraier::NodeResult> object.
1166
1167 =cut
1168
1169 sub search {
1170 my $self = shift;
1171 my ($cond, $depth) = @_;
1172 return unless ($cond && defined($depth) && $self->{url});
1173 croak "cond mush be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1174 croak "depth needs number, not '$depth'" unless ($depth =~ m/^\d+$/);
1175
1176 my $resbody;
1177
1178 my $rv = $self->shuttle_url( $self->{url} . '/search',
1179 'application/x-www-form-urlencoded',
1180 $self->cond_to_query( $cond ),
1181 \$resbody,
1182 );
1183 return if ($rv != 200);
1184
1185 my (@docs, $hints);
1186
1187 my @lines = split(/\n/, $resbody);
1188 return unless (@lines);
1189
1190 my $border = $lines[0];
1191 my $isend = 0;
1192 my $lnum = 1;
1193
1194 while ( $lnum <= $#lines ) {
1195 my $line = $lines[$lnum];
1196 $lnum++;
1197
1198 #warn "## $line\n";
1199 if ($line && $line =~ m/^\Q$border\E(:END)*$/) {
1200 $isend = $1;
1201 last;
1202 }
1203
1204 if ($line =~ /\t/) {
1205 my ($k,$v) = split(/\t/, $line, 2);
1206 $hints->{$k} = $v;
1207 }
1208 }
1209
1210 my $snum = $lnum;
1211
1212 while( ! $isend && $lnum <= $#lines ) {
1213 my $line = $lines[$lnum];
1214 #warn "# $lnum: $line\n";
1215 $lnum++;
1216
1217 if ($line && $line =~ m/^\Q$border\E/) {
1218 if ($lnum > $snum) {
1219 my $rdattrs;
1220 my $rdvector;
1221 my $rdsnippet;
1222
1223 my $rlnum = $snum;
1224 while ($rlnum < $lnum - 1 ) {
1225 #my $rdline = $self->_s($lines[$rlnum]);
1226 my $rdline = $lines[$rlnum];
1227 $rlnum++;
1228 last unless ($rdline);
1229 if ($rdline =~ /^%/) {
1230 $rdvector = $1 if ($rdline =~ /^%VECTOR\t(.+)$/);
1231 } elsif($rdline =~ /=/) {
1232 $rdattrs->{$1} = $2 if ($rdline =~ /^(.+)=(.+)$/);
1233 } else {
1234 confess "invalid format of response";
1235 }
1236 }
1237 while($rlnum < $lnum - 1) {
1238 my $rdline = $lines[$rlnum];
1239 $rlnum++;
1240 $rdsnippet .= "$rdline\n";
1241 }
1242 #warn Dumper($rdvector, $rdattrs, $rdsnippet);
1243 if (my $rduri = $rdattrs->{'@uri'}) {
1244 push @docs, new Search::Estraier::ResultDocument(
1245 uri => $rduri,
1246 attrs => $rdattrs,
1247 snippet => $rdsnippet,
1248 keywords => $rdvector,
1249 );
1250 }
1251 }
1252 $snum = $lnum;
1253 #warn "### $line\n";
1254 $isend = 1 if ($line =~ /:END$/);
1255 }
1256
1257 }
1258
1259 if (! $isend) {
1260 warn "received result doesn't have :END\n$resbody";
1261 return;
1262 }
1263
1264 #warn Dumper(\@docs, $hints);
1265
1266 return new Search::Estraier::NodeResult( docs => \@docs, hints => $hints );
1267 }
1268
1269
1270 =head2 cond_to_query
1271
1272 Return URI encoded string generated from Search::Estraier::Condition
1273
1274 my $args = $node->cond_to_query( $cond );
1275
1276 =cut
1277
1278 sub cond_to_query {
1279 my $self = shift;
1280
1281 my $cond = shift || return;
1282 croak "condition must be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1283
1284 my @args;
1285
1286 if (my $phrase = $cond->phrase) {
1287 push @args, 'phrase=' . uri_escape($phrase);
1288 }
1289
1290 if (my @attrs = $cond->attrs) {
1291 for my $i ( 0 .. $#attrs ) {
1292 push @args,'attr' . ($i+1) . '=' . uri_escape( $attrs[$i] );
1293 }
1294 }
1295
1296 if (my $order = $cond->order) {
1297 push @args, 'order=' . uri_escape($order);
1298 }
1299
1300 if (my $max = $cond->max) {
1301 push @args, 'max=' . $max;
1302 } else {
1303 push @args, 'max=' . (1 << 30);
1304 }
1305
1306 if (my $options = $cond->options) {
1307 push @args, 'options=' . $options;
1308 }
1309
1310 push @args, 'depth=' . $self->{depth} if ($self->{depth});
1311 push @args, 'wwidth=' . $self->{wwidth};
1312 push @args, 'hwidth=' . $self->{hwidth};
1313 push @args, 'awidth=' . $self->{awidth};
1314
1315 return join('&', @args);
1316 }
1317
1318
1319 =head2 shuttle_url
1320
1321 This is method which uses C<IO::Socket::INET> to communicate with Hyper Estraier node
1322 master.
1323
1324 my $rv = shuttle_url( $url, $content_type, $req_body, \$resbody );
1325
1326 C<$resheads> and C<$resbody> booleans controll if response headers and/or response
1327 body will be saved within object.
1328
1329 =cut
1330
1331 use LWP::UserAgent;
1332
1333 sub shuttle_url {
1334 my $self = shift;
1335
1336 my ($url, $content_type, $reqbody, $resbody) = @_;
1337
1338 $self->{status} = -1;
1339
1340 warn "## $url\n" if ($self->{debug});
1341
1342 $url = new URI($url);
1343 if (
1344 !$url || !$url->scheme || !$url->scheme eq 'http' ||
1345 !$url->host || !$url->port || $url->port < 1
1346 ) {
1347 carp "can't parse $url\n";
1348 return -1;
1349 }
1350
1351 my $ua = LWP::UserAgent->new;
1352 $ua->agent( "Search-Estraier/$Search::Estraier::VERSION" );
1353
1354 my $req;
1355 if ($reqbody) {
1356 $req = HTTP::Request->new(POST => $url);
1357 } else {
1358 $req = HTTP::Request->new(GET => $url);
1359 }
1360
1361 $req->headers->header( 'Host' => $url->host . ":" . $url->port );
1362 $req->headers->header( 'Connection', 'close' );
1363 $req->headers->header( 'Authorization', 'Basic ' . $self->{auth} );
1364 $req->content_type( $content_type );
1365
1366 warn $req->headers->as_string,"\n" if ($self->{debug});
1367
1368 if ($reqbody) {
1369 warn "$reqbody\n" if ($self->{debug});
1370 $req->content( $reqbody );
1371 }
1372
1373 my $res = $ua->request($req) || croak "can't make request to $url: $!";
1374
1375 warn "## response status: ",$res->status_line,"\n" if ($self->{debug});
1376
1377 return -1 if (! $res->is_success);
1378
1379 ($self->{status}, $self->{status_message}) = split(/\s+/, $res->status_line, 2);
1380
1381 $$resbody .= $res->content;
1382
1383 warn "## response body:\n$$resbody\n" if ($resbody && $self->{debug});
1384
1385 return $self->{status};
1386 }
1387
1388
1389 =head2 set_snippet_width
1390
1391 Set width of snippets in results
1392
1393 $node->set_snippet_width( $wwidth, $hwidth, $awidth );
1394
1395 C<$wwidth> specifies whole width of snippet. It's C<480> by default. If it's C<0> snippet
1396 is not sent with results. If it is negative, whole document text is sent instead of snippet.
1397
1398 C<$hwidth> specified width of strings from beginning of string. Default
1399 value is C<96>. Negative or zero value keep previous value.
1400
1401 C<$awidth> specifies width of strings around each highlighted word. It's C<96> by default.
1402 If negative of zero value is provided previous value is kept unchanged.
1403
1404 =cut
1405
1406 sub set_snippet_width {
1407 my $self = shift;
1408
1409 my ($wwidth, $hwidth, $awidth) = @_;
1410 $self->{wwidth} = $wwidth;
1411 $self->{hwidth} = $hwidth if ($hwidth >= 0);
1412 $self->{awidth} = $awidth if ($awidth >= 0);
1413 }
1414
1415
1416 =head2 set_user
1417
1418 Manage users of node
1419
1420 $node->set_user( 'name', $mode );
1421
1422 C<$mode> can be one of:
1423
1424 =over 4
1425
1426 =item 0
1427
1428 delete account
1429
1430 =item 1
1431
1432 set administrative right for user
1433
1434 =item 2
1435
1436 set user account as guest
1437
1438 =back
1439
1440 Return true on success, otherwise false.
1441
1442 =cut
1443
1444 sub set_user {
1445 my $self = shift;
1446 my ($name, $mode) = @_;
1447
1448 return unless ($self->{url});
1449 croak "mode must be number, not '$mode'" unless ($mode =~ m/^\d+$/);
1450
1451 $self->shuttle_url( $self->{url} . '/_set_user',
1452 'text/plain',
1453 'name=' . uri_escape($name) . '&mode=' . $mode,
1454 undef
1455 ) == 200;
1456 }
1457
1458
1459 =head2 set_link
1460
1461 Manage node links
1462
1463 $node->set_link('http://localhost:1978/node/another', 'another node label', $credit);
1464
1465 If C<$credit> is negative, link is removed.
1466
1467 =cut
1468
1469 sub set_link {
1470 my $self = shift;
1471 my ($url, $label, $credit) = @_;
1472
1473 return unless ($self->{url});
1474 croak "mode credit be number, not '$credit'" unless ($credit =~ m/^\d+$/);
1475
1476 my $reqbody = 'url=' . uri_escape($url) . '&label=' . uri_escape($label);
1477 $reqbody .= '&credit=' . $credit if ($credit > 0);
1478
1479 $self->shuttle_url( $self->{url} . '/_set_link',
1480 'text/plain',
1481 $reqbody,
1482 undef
1483 ) == 200;
1484 }
1485
1486
1487 =head1 PRIVATE METHODS
1488
1489 You could call those directly, but you don't have to. I hope.
1490
1491 =head2 _set_info
1492
1493 Set information for node
1494
1495 $node->_set_info;
1496
1497 =cut
1498
1499 sub _set_info {
1500 my $self = shift;
1501
1502 $self->{status} = -1;
1503 return unless ($self->{url});
1504
1505 my $resbody;
1506 my $rv = $self->shuttle_url( $self->{url} . '/inform',
1507 'text/plain',
1508 undef,
1509 \$resbody,
1510 );
1511
1512 return if ($rv != 200 || !$resbody);
1513
1514 # it seems that response can have multiple line endings
1515 $resbody =~ s/[\r\n]+$//;
1516
1517 ( $self->{name}, $self->{label}, $self->{dnum}, $self->{wnum}, $self->{size} ) =
1518 split(/\t/, $resbody, 5);
1519
1520 }
1521
1522 ###
1523
1524 =head1 EXPORT
1525
1526 Nothing.
1527
1528 =head1 SEE ALSO
1529
1530 L<http://hyperestraier.sourceforge.net/>
1531
1532 Hyper Estraier Ruby interface on which this module is based.
1533
1534 =head1 AUTHOR
1535
1536 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
1537
1538
1539 =head1 COPYRIGHT AND LICENSE
1540
1541 Copyright (C) 2005-2006 by Dobrica Pavlinusic
1542
1543 This library is free software; you can redistribute it and/or modify
1544 it under the GPL v2 or later.
1545
1546 =cut
1547
1548 1;

  ViewVC Help
Powered by ViewVC 1.1.26