/[Search-Estraier]/trunk/Estraier.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/Estraier.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 52 - (show annotations)
Fri Jan 6 14:10:29 2006 UTC (18 years, 2 months ago) by dpavlin
File size: 25693 byte(s)
search which works
1 package Search::Estraier;
2
3 use 5.008;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.00';
8
9 =head1 NAME
10
11 Search::Estraier - pure perl module to use Hyper Estraier search engine
12
13 =head1 SYNOPSIS
14
15 use Search::Estraier;
16 my $est = new Search::Estraier();
17
18 =head1 DESCRIPTION
19
20 This module is implementation of node API of Hyper Estraier. Since it's
21 perl-only module with dependencies only on standard perl modules, it will
22 run on all platforms on which perl runs. It doesn't require compilation
23 or Hyper Estraier development files on target machine.
24
25 It is implemented as multiple packages which closly resamble Ruby
26 implementation. It also includes methods to manage nodes.
27
28 =cut
29
30 =head1 Inheritable common methods
31
32 This methods should really move somewhere else.
33
34 =head2 _s
35
36 Remove multiple whitespaces from string, as well as whitespaces at beginning or end
37
38 my $text = $self->_s(" this is a text ");
39 $text = 'this is a text';
40
41 =cut
42
43 sub _s {
44 my $text = $_[1] || return;
45 $text =~ s/\s\s+/ /gs;
46 $text =~ s/^\s+//;
47 $text =~ s/\s+$//;
48 return $text;
49 }
50
51 package Search::Estraier::Document;
52
53 use Carp qw/croak confess/;
54
55 use Search::Estraier;
56 our @ISA = qw/Search::Estraier/;
57
58 =head1 Search::Estraier::Document
59
60 This class implements Document which is collection of attributes
61 (key=value), vectors (also key value) display text and hidden text.
62
63
64 =head2 new
65
66 Create new document, empty or from draft.
67
68 my $doc = new Search::HyperEstraier::Document;
69 my $doc2 = new Search::HyperEstraier::Document( $draft );
70
71 =cut
72
73 sub new {
74 my $class = shift;
75 my $self = {};
76 bless($self, $class);
77
78 $self->{id} = -1;
79
80 my $draft = shift;
81
82 if ($draft) {
83 my $in_text = 0;
84 foreach my $line (split(/\n/, $draft)) {
85
86 if ($in_text) {
87 if ($line =~ /^\t/) {
88 push @{ $self->{htexts} }, substr($line, 1);
89 } else {
90 push @{ $self->{dtexts} }, $line;
91 }
92 next;
93 }
94
95 if ($line =~ m/^%VECTOR\t(.+)$/) {
96 my @fields = split(/\t/, $1);
97 for my $i ( 0 .. ($#fields - 1) ) {
98 $self->{kwords}->{ $fields[ $i ] } = $fields[ $i + 1 ];
99 $i++;
100 }
101 next;
102 } elsif ($line =~ m/^%/) {
103 # What is this? comment?
104 #warn "$line\n";
105 next;
106 } elsif ($line =~ m/^$/) {
107 $in_text = 1;
108 next;
109 } elsif ($line =~ m/^(.+)=(.+)$/) {
110 $self->{attrs}->{ $1 } = $2;
111 next;
112 }
113
114 warn "draft ignored: $line\n";
115 }
116 }
117
118 $self ? return $self : return undef;
119 }
120
121
122 =head2 add_attr
123
124 Add an attribute.
125
126 $doc->add_attr( name => 'value' );
127
128 Delete attribute using
129
130 $doc->add_attr( name => undef );
131
132 =cut
133
134 sub add_attr {
135 my $self = shift;
136 my $attrs = {@_};
137
138 while (my ($name, $value) = each %{ $attrs }) {
139 if (! defined($value)) {
140 delete( $self->{attrs}->{ $self->_s($name) } );
141 } else {
142 $self->{attrs}->{ $self->_s($name) } = $self->_s($value);
143 }
144 }
145
146 return 1;
147 }
148
149
150 =head2 add_text
151
152 Add a sentence of text.
153
154 $doc->add_text('this is example text to display');
155
156 =cut
157
158 sub add_text {
159 my $self = shift;
160 my $text = shift;
161 return unless defined($text);
162
163 push @{ $self->{dtexts} }, $self->_s($text);
164 }
165
166
167 =head2 add_hidden_text
168
169 Add a hidden sentence.
170
171 $doc->add_hidden_text('this is example text just for search');
172
173 =cut
174
175 sub add_hidden_text {
176 my $self = shift;
177 my $text = shift;
178 return unless defined($text);
179
180 push @{ $self->{htexts} }, $self->_s($text);
181 }
182
183
184 =head2 id
185
186 Get the ID number of document. If the object has never been registred, C<-1> is returned.
187
188 print $doc->id;
189
190 =cut
191
192 sub id {
193 my $self = shift;
194 return $self->{id};
195 }
196
197
198 =head2 attr_names
199
200 Returns array with attribute names from document object.
201
202 my @attrs = $doc->attr_names;
203
204 =cut
205
206 sub attr_names {
207 my $self = shift;
208 croak "attr_names return array, not scalar" if (! wantarray);
209 return sort keys %{ $self->{attrs} };
210 }
211
212
213 =head2 attr
214
215 Returns value of an attribute.
216
217 my $value = $doc->attr( 'attribute' );
218
219 =cut
220
221 sub attr {
222 my $self = shift;
223 my $name = shift;
224
225 return $self->{'attrs'}->{ $name };
226 }
227
228
229 =head2 texts
230
231 Returns array with text sentences.
232
233 my @texts = $doc->texts;
234
235 =cut
236
237 sub texts {
238 my $self = shift;
239 confess "texts return array, not scalar" if (! wantarray);
240 return @{ $self->{dtexts} };
241 }
242
243
244 =head2 cat_texts
245
246 Return whole text as single scalar.
247
248 my $text = $doc->cat_texts;
249
250 =cut
251
252 sub cat_texts {
253 my $self = shift;
254 return join(' ',@{ $self->{dtexts} });
255 }
256
257
258 =head2 dump_draft
259
260 Dump draft data from document object.
261
262 print $doc->dump_draft;
263
264 =cut
265
266 sub dump_draft {
267 my $self = shift;
268 my $draft;
269
270 foreach my $attr_name (sort keys %{ $self->{attrs} }) {
271 $draft .= $attr_name . '=' . $self->{attrs}->{$attr_name} . "\n";
272 }
273
274 if ($self->{kwords}) {
275 $draft .= '%%VECTOR';
276 while (my ($key, $value) = each %{ $self->{kwords} }) {
277 $draft .= "\t$key\t$value";
278 }
279 $draft .= "\n";
280 }
281
282 $draft .= "\n";
283
284 $draft .= join("\n", @{ $self->{dtexts} }) . "\n" if ($self->{dtexts});
285 $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n" if ($self->{htexts});
286
287 return $draft;
288 }
289
290
291 =head2 delete
292
293 Empty document object
294
295 $doc->delete;
296
297 This function is addition to original Ruby API, and since it was included in C wrappers it's here as a
298 convinience. Document objects which go out of scope will be destroyed
299 automatically.
300
301 =cut
302
303 sub delete {
304 my $self = shift;
305
306 foreach my $data (qw/attrs dtexts stexts kwords/) {
307 delete($self->{$data});
308 }
309
310 $self->{id} = -1;
311
312 return 1;
313 }
314
315
316
317 package Search::Estraier::Condition;
318
319 use Carp qw/confess croak/;
320
321 use Search::Estraier;
322 our @ISA = qw/Search::Estraier/;
323
324 =head1 Search::Estraier::Condition
325
326 =head2 new
327
328 my $cond = new Search::HyperEstraier::Condition;
329
330 =cut
331
332 sub new {
333 my $class = shift;
334 my $self = {};
335 bless($self, $class);
336
337 $self->{max} = -1;
338 $self->{options} = 0;
339
340 $self ? return $self : return undef;
341 }
342
343
344 =head2 set_phrase
345
346 $cond->set_phrase('search phrase');
347
348 =cut
349
350 sub set_phrase {
351 my $self = shift;
352 $self->{phrase} = $self->_s( shift );
353 }
354
355
356 =head2 add_attr
357
358 $cond->add_attr('@URI STRINC /~dpavlin/');
359
360 =cut
361
362 sub add_attr {
363 my $self = shift;
364 my $attr = shift || return;
365 push @{ $self->{attrs} }, $self->_s( $attr );
366 }
367
368
369 =head2 set_order
370
371 $cond->set_order('@mdate NUMD');
372
373 =cut
374
375 sub set_order {
376 my $self = shift;
377 $self->{order} = shift;
378 }
379
380
381 =head2 set_max
382
383 $cond->set_max(42);
384
385 =cut
386
387 sub set_max {
388 my $self = shift;
389 my $max = shift;
390 croak "set_max needs number, not '$max'" unless ($max =~ m/^\d+$/);
391 $self->{max} = $max;
392 }
393
394
395 =head2 set_options
396
397 $cond->set_options( SURE => 1 );
398
399 =cut
400
401 my $options = {
402 # check N-gram keys skipping by three
403 SURE => 1 << 0,
404 # check N-gram keys skipping by two
405 USUAL => 1 << 1,
406 # without TF-IDF tuning
407 FAST => 1 << 2,
408 # with the simplified phrase
409 AGITO => 1 << 3,
410 # check every N-gram key
411 NOIDF => 1 << 4,
412 # check N-gram keys skipping by one
413 SIMPLE => 1 << 10,
414 };
415
416 sub set_options {
417 my $self = shift;
418 my $option = shift;
419 confess "unknown option" unless ($options->{$option});
420 $self->{options} ||= $options->{$option};
421 }
422
423
424 =head2 phrase
425
426 Return search phrase.
427
428 print $cond->phrase;
429
430 =cut
431
432 sub phrase {
433 my $self = shift;
434 return $self->{phrase};
435 }
436
437
438 =head2 order
439
440 Return search result order.
441
442 print $cond->order;
443
444 =cut
445
446 sub order {
447 my $self = shift;
448 return $self->{order};
449 }
450
451
452 =head2 attrs
453
454 Return search result attrs.
455
456 my @cond_attrs = $cond->attrs;
457
458 =cut
459
460 sub attrs {
461 my $self = shift;
462 #croak "attrs return array, not scalar" if (! wantarray);
463 return @{ $self->{attrs} };
464 }
465
466
467 =head2 max
468
469 Return maximum number of results.
470
471 print $cond->max;
472
473 C<-1> is returned for unitialized value, C<0> is unlimited.
474
475 =cut
476
477 sub max {
478 my $self = shift;
479 return $self->{max};
480 }
481
482
483 =head2 options
484
485 Return options for this condition.
486
487 print $cond->options;
488
489 Options are returned in numerical form.
490
491 =cut
492
493 sub options {
494 my $self = shift;
495 return $self->{options};
496 }
497
498
499 package Search::Estraier::ResultDocument;
500
501 use Carp qw/croak/;
502
503 #use Search::Estraier;
504 #our @ISA = qw/Search::Estraier/;
505
506 =head1 Search::Estraier::ResultDocument
507
508 =head2 new
509
510 my $rdoc = new Search::HyperEstraier::ResultDocument(
511 uri => 'http://localhost/document/uri/42',
512 attrs => {
513 foo => 1,
514 bar => 2,
515 },
516 snippet => 'this is a text of snippet'
517 keywords => 'this\tare\tkeywords'
518 );
519
520 =cut
521
522 sub new {
523 my $class = shift;
524 my $self = {@_};
525 bless($self, $class);
526
527 foreach my $f (qw/uri attrs snippet keywords/) {
528 croak "missing $f for ResultDocument" unless defined($self->{$f});
529 }
530
531 $self ? return $self : return undef;
532 }
533
534
535 =head2 uri
536
537 Return URI of result document
538
539 print $rdoc->uri;
540
541 =cut
542
543 sub uri {
544 my $self = shift;
545 return $self->{uri};
546 }
547
548
549 =head2 attr_names
550
551 Returns array with attribute names from result document object.
552
553 my @attrs = $rdoc->attr_names;
554
555 =cut
556
557 sub attr_names {
558 my $self = shift;
559 croak "attr_names return array, not scalar" if (! wantarray);
560 return sort keys %{ $self->{attrs} };
561 }
562
563
564 =head2 attr
565
566 Returns value of an attribute.
567
568 my $value = $rdoc->attr( 'attribute' );
569
570 =cut
571
572 sub attr {
573 my $self = shift;
574 my $name = shift || return;
575 return $self->{attrs}->{ $name };
576 }
577
578
579 =head2 snippet
580
581 Return snippet from result document
582
583 print $rdoc->snippet;
584
585 =cut
586
587 sub snippet {
588 my $self = shift;
589 return $self->{snippet};
590 }
591
592
593 =head2 keywords
594
595 Return keywords from result document
596
597 print $rdoc->keywords;
598
599 =cut
600
601 sub keywords {
602 my $self = shift;
603 return $self->{keywords};
604 }
605
606
607 package Search::Estraier::NodeResult;
608
609 use Carp qw/croak/;
610
611 #use Search::Estraier;
612 #our @ISA = qw/Search::Estraier/;
613
614 =head1 Search::Estraier::NodeResult
615
616 =head2 new
617
618 my $res = new Search::HyperEstraier::NodeResult(
619 docs => @array_of_rdocs,
620 hits => %hash_with_hints,
621 );
622
623 =cut
624
625 sub new {
626 my $class = shift;
627 my $self = {@_};
628 bless($self, $class);
629
630 foreach my $f (qw/docs hints/) {
631 croak "missing $f for ResultDocument" unless defined($self->{$f});
632 }
633
634 $self ? return $self : return undef;
635 }
636
637
638 =head2 doc_num
639
640 Return number of documents
641
642 print $res->doc_num;
643
644 =cut
645
646 sub doc_num {
647 my $self = shift;
648 return $#{$self->{docs}};
649 }
650
651
652 =head2 get_doc
653
654 Return single document
655
656 my $doc = $res->get_doc( 42 );
657
658 Returns undef if document doesn't exist.
659
660 =cut
661
662 sub get_doc {
663 my $self = shift;
664 my $num = shift;
665 croak "expect number as argument, not '$num'" unless ($num =~ m/^\d+$/);
666 return undef if ($num < 0 || $num > $self->{docs});
667 return $self->{docs}->[$num];
668 }
669
670
671 =head2 hint
672
673 Return specific hint from results.
674
675 print $rec->hint( 'VERSION' );
676
677 Possible hints are: C<VERSION>, C<NODE>, C<HIT>, C<HINT#n>, C<DOCNUM>, C<WORDNUM>,
678 C<TIME>, C<LINK#n>, C<VIEW>.
679
680 =cut
681
682 sub hint {
683 my $self = shift;
684 my $key = shift || return;
685 return $self->{hints}->{$key};
686 }
687
688
689 package Search::Estraier::Node;
690
691 use Carp qw/carp croak confess/;
692 use URI;
693 use MIME::Base64;
694 use IO::Socket::INET;
695 use URI::Escape qw/uri_escape/;
696
697 =head1 Search::Estraier::Node
698
699 =head2 new
700
701 my $node = new Search::HyperEstraier::Node;
702
703 =cut
704
705 sub new {
706 my $class = shift;
707 my $self = {
708 pxport => -1,
709 timeout => 0, # this used to be -1
710 dnum => -1,
711 wnum => -1,
712 size => -1.0,
713 wwidth => 480,
714 hwidth => 96,
715 awidth => 96,
716 status => -1,
717 };
718 bless($self, $class);
719
720 if (@_) {
721 $self->{debug} = shift;
722 warn "## Node debug on\n";
723 }
724
725 $self ? return $self : return undef;
726 }
727
728
729 =head2 set_url
730
731 Specify URL to node server
732
733 $node->set_url('http://localhost:1978');
734
735 =cut
736
737 sub set_url {
738 my $self = shift;
739 $self->{url} = shift;
740 }
741
742
743 =head2 set_proxy
744
745 Specify proxy server to connect to node server
746
747 $node->set_proxy('proxy.example.com', 8080);
748
749 =cut
750
751 sub set_proxy {
752 my $self = shift;
753 my ($host,$port) = @_;
754 croak "proxy port must be number, not '$port'" unless ($port =~ m/^\d+$/);
755 $self->{pxhost} = $host;
756 $self->{pxport} = $port;
757 }
758
759
760 =head2 set_timeout
761
762 Specify timeout of connection in seconds
763
764 $node->set_timeout( 15 );
765
766 =cut
767
768 sub set_timeout {
769 my $self = shift;
770 my $sec = shift;
771 croak "timeout must be number, not '$sec'" unless ($sec =~ m/^\d+$/);
772 $self->{timeout} = $sec;
773 }
774
775
776 =head2 set_auth
777
778 Specify name and password for authentication to node server.
779
780 $node->set_auth('clint','eastwood');
781
782 =cut
783
784 sub set_auth {
785 my $self = shift;
786 my ($login,$passwd) = @_;
787 my $basic_auth = encode_base64( "$login:$passwd" );
788 chomp($basic_auth);
789 $self->{auth} = $basic_auth;
790 }
791
792
793 =head2 status
794
795 Return status code of last request.
796
797 print $node->status;
798
799 C<-1> means connection failure.
800
801 =cut
802
803 sub status {
804 my $self = shift;
805 return $self->{status};
806 }
807
808
809 =head2 put_doc
810
811 Add a document
812
813 $node->put_doc( $document_draft ) or die "can't add document";
814
815 Return true on success or false on failture.
816
817 =cut
818
819 sub put_doc {
820 my $self = shift;
821 my $doc = shift || return;
822 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
823 $self->shuttle_url( $self->{url} . '/put_doc',
824 'text/x-estraier-draft',
825 $doc->dump_draft,
826 undef
827 ) == 200;
828 }
829
830
831 =head2 out_doc
832
833 Remove a document
834
835 $node->out_doc( document_id ) or "can't remove document";
836
837 Return true on success or false on failture.
838
839 =cut
840
841 sub out_doc {
842 my $self = shift;
843 my $id = shift || return;
844 return unless ($self->{url});
845 croak "id must be number, not '$id'" unless ($id =~ m/^\d+$/);
846 $self->shuttle_url( $self->{url} . '/out_doc',
847 'application/x-www-form-urlencoded',
848 "id=$id",
849 undef
850 ) == 200;
851 }
852
853
854 =head2 out_doc_by_uri
855
856 Remove a registrated document using it's uri
857
858 $node->out_doc_by_uri( 'file:///document/uri/42' ) or "can't remove document";
859
860 Return true on success or false on failture.
861
862 =cut
863
864 sub out_doc_by_uri {
865 my $self = shift;
866 my $uri = shift || return;
867 return unless ($self->{url});
868 $self->shuttle_url( $self->{url} . '/out_doc',
869 'application/x-www-form-urlencoded',
870 "uri=" . uri_escape($uri),
871 undef
872 ) == 200;
873 }
874
875
876 =head2 edit_doc
877
878 Edit attributes of a document
879
880 $node->edit_doc( $document_draft ) or die "can't edit document";
881
882 Return true on success or false on failture.
883
884 =cut
885
886 sub edit_doc {
887 my $self = shift;
888 my $doc = shift || return;
889 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
890 $self->shuttle_url( $self->{url} . '/edit_doc',
891 'text/x-estraier-draft',
892 $doc->dump_draft,
893 undef
894 ) == 200;
895 }
896
897
898 =head2 get_doc
899
900 Retreive document
901
902 my $doc = $node->get_doc( document_id ) or die "can't get document";
903
904 Return true on success or false on failture.
905
906 =cut
907
908 sub get_doc {
909 my $self = shift;
910 my $id = shift || return;
911 return $self->_fetch_doc( id => $id );
912 }
913
914
915 =head2 get_doc_by_uri
916
917 Retreive document
918
919 my $doc = $node->get_doc_by_uri( 'file:///document/uri/42' ) or die "can't get document";
920
921 Return true on success or false on failture.
922
923 =cut
924
925 sub get_doc_by_uri {
926 my $self = shift;
927 my $uri = shift || return;
928 return $self->_fetch_doc( uri => $uri );
929 }
930
931
932 =head2 get_doc_attr
933
934 Retrieve the value of an atribute from object
935
936 my $val = $node->get_doc_attr( document_id, 'attribute_name' ) or
937 die "can't get document attribute";
938
939 =cut
940
941 sub get_doc_attr {
942 my $self = shift;
943 my ($id,$name) = @_;
944 return unless ($id && $name);
945 return $self->_fetch_doc( id => $id, attr => $name );
946 }
947
948
949 =head2 get_doc_attr_by_uri
950
951 Retrieve the value of an atribute from object
952
953 my $val = $node->get_doc_attr_by_uri( document_id, 'attribute_name' ) or
954 die "can't get document attribute";
955
956 =cut
957
958 sub get_doc_attr_by_uri {
959 my $self = shift;
960 my ($uri,$name) = @_;
961 return unless ($uri && $name);
962 return $self->_fetch_doc( uri => $uri, attr => $name );
963 }
964
965
966 =head2 etch_doc
967
968 Exctract document keywords
969
970 my $keywords = $node->etch_doc( document_id ) or die "can't etch document";
971
972 =cut
973
974 sub etch_doc {
975 my $self = shift;
976 my $id = shift || return;
977 return $self->_fetch_doc( id => $id, etch => 1 );
978 }
979
980 =head2 etch_doc_by_uri
981
982 Retreive document
983
984 my $keywords = $node->etch_doc_by_uri( 'file:///document/uri/42' ) or die "can't etch document";
985
986 Return true on success or false on failture.
987
988 =cut
989
990 sub etch_doc_by_uri {
991 my $self = shift;
992 my $uri = shift || return;
993 return $self->_fetch_doc( uri => $uri, etch => 1 );
994 }
995
996
997 =head2 uri_to_id
998
999 Get ID of document specified by URI
1000
1001 my $id = $node->uri_to_id( 'file:///document/uri/42' );
1002
1003 =cut
1004
1005 sub uri_to_id {
1006 my $self = shift;
1007 my $uri = shift || return;
1008 return $self->_fetch_doc( uri => $uri, path => '/uri_to_id', chomp_resbody => 1 );
1009 }
1010
1011
1012 =head2 _fetch_doc
1013
1014 Private function used for implementing of C<get_doc>, C<get_doc_by_uri>,
1015 C<etch_doc>, C<etch_doc_by_uri>.
1016
1017 # this will decode received draft into Search::Estraier::Document object
1018 my $doc = $node->_fetch_doc( id => 42 );
1019 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42' );
1020
1021 # to extract keywords, add etch
1022 my $doc = $node->_fetch_doc( id => 42, etch => 1 );
1023 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', etch => 1 );
1024
1025 # to get document attrubute add attr
1026 my $doc = $node->_fetch_doc( id => 42, attr => '@mdate' );
1027 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', attr => '@mdate' );
1028
1029 # more general form which allows implementation of
1030 # uri_to_id
1031 my $id = $node->_fetch_doc(
1032 uri => 'file:///document/uri/42',
1033 path => '/uri_to_id',
1034 chomp_resbody => 1
1035 );
1036
1037 =cut
1038
1039 sub _fetch_doc {
1040 my $self = shift;
1041 my $a = {@_};
1042 return unless ( ($a->{id} || $a->{uri}) && $self->{url} );
1043
1044 my ($arg, $resbody);
1045
1046 my $path = $a->{path} || '/get_doc';
1047 $path = '/etch_doc' if ($a->{etch});
1048
1049 if ($a->{id}) {
1050 croak "id must be numberm not '$a->{id}'" unless ($a->{id} =~ m/^\d+$/);
1051 $arg = 'id=' . $a->{id};
1052 } elsif ($a->{uri}) {
1053 $arg = 'uri=' . uri_escape($a->{uri});
1054 } else {
1055 confess "unhandled argument. Need id or uri.";
1056 }
1057
1058 if ($a->{attr}) {
1059 $path = '/get_doc_attr';
1060 $arg .= '&attr=' . uri_escape($a->{attr});
1061 $a->{chomp_resbody} = 1;
1062 }
1063
1064 my $rv = $self->shuttle_url( $self->{url} . $path,
1065 'application/x-www-form-urlencoded',
1066 $arg,
1067 \$resbody,
1068 );
1069
1070 return if ($rv != 200);
1071
1072 if ($a->{etch}) {
1073 $self->{kwords} = {};
1074 return +{} unless ($resbody);
1075 foreach my $l (split(/\n/, $resbody)) {
1076 my ($k,$v) = split(/\t/, $l, 2);
1077 $self->{kwords}->{$k} = $v if ($v);
1078 }
1079 return $self->{kwords};
1080 } elsif ($a->{chomp_resbody}) {
1081 return unless (defined($resbody));
1082 chomp($resbody);
1083 return $resbody;
1084 } else {
1085 return new Search::Estraier::Document($resbody);
1086 }
1087 }
1088
1089
1090 =head2 name
1091
1092 my $node_name = $node->name;
1093
1094 =cut
1095
1096 sub name {
1097 my $self = shift;
1098 $self->set_info unless ($self->{name});
1099 return $self->{name};
1100 }
1101
1102
1103 =head2 label
1104
1105 my $node_label = $node->label;
1106
1107 =cut
1108
1109 sub label {
1110 my $self = shift;
1111 $self->set_info unless ($self->{label});
1112 return $self->{label};
1113 }
1114
1115
1116 =head2 doc_num
1117
1118 my $documents_in_node = $node->doc_num;
1119
1120 =cut
1121
1122 sub doc_num {
1123 my $self = shift;
1124 $self->set_info if ($self->{dnum} < 0);
1125 return $self->{dnum};
1126 }
1127
1128
1129 =head2 word_num
1130
1131 my $words_in_node = $node->word_num;
1132
1133 =cut
1134
1135 sub word_num {
1136 my $self = shift;
1137 $self->set_info if ($self->{wnum} < 0);
1138 return $self->{wnum};
1139 }
1140
1141
1142 =head2 size
1143
1144 my $node_size = $node->size;
1145
1146 =cut
1147
1148 sub size {
1149 my $self = shift;
1150 $self->set_info if ($self->{size} < 0);
1151 return $self->{size};
1152 }
1153
1154
1155 =head2 search
1156
1157 Search documents which match condition
1158
1159 my $nres = $node->search( $cond, $depth );
1160
1161 C<$cond> is C<Search::Estraier::Condition> object, while <$depth> specifies
1162 depth for meta search.
1163
1164 Function results C<Search::Estraier::NodeResult> object.
1165
1166 =cut
1167
1168 sub search {
1169 my $self = shift;
1170 my ($cond, $depth) = @_;
1171 return unless ($cond && defined($depth) && $self->{url});
1172 croak "cond mush be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1173 croak "depth needs number, not '$depth'" unless ($depth =~ m/^\d+$/);
1174
1175 my $resbody;
1176
1177 my $rv = $self->shuttle_url( $self->{url} . '/search',
1178 'text/x-estraier-draft',
1179 $self->cond_to_query( $cond ),
1180 \$resbody,
1181 );
1182 return if ($rv != 200);
1183
1184 my (@docs, $hints);
1185
1186 my @lines = split(/\n/, $resbody);
1187 return unless (@lines);
1188
1189 my $border = $lines[0];
1190 my $isend = 0;
1191 my $lnum = 1;
1192
1193 while ( $lnum <= $#lines ) {
1194 my $line = $lines[$lnum];
1195 $lnum++;
1196
1197 #warn "## $line\n";
1198 if ($line && $line =~ m/^\Q$border\E(:END)*$/) {
1199 $isend = $1;
1200 last;
1201 }
1202
1203 if ($line =~ /\t/) {
1204 my ($k,$v) = split(/\t/, $line, 2);
1205 $hints->{$k} = $v;
1206 }
1207 }
1208
1209 my $snum = $lnum;
1210
1211 while( ! $isend && $lnum <= $#lines ) {
1212 my $line = $lines[$lnum];
1213 $lnum++;
1214
1215 if ($line && $line =~ m/^\Q$border\E/) {
1216 if ($lnum > $snum) {
1217 my $rdattrs;
1218 my $rdvector;
1219 my $rdsnippet;
1220
1221 my $rlnum = $snum;
1222 while ($rlnum < $lnum - 1 ) {
1223 #my $rdline = $self->_s($lines[$rlnum]);
1224 my $rdline = $lines[$rlnum];
1225 $rlnum++;
1226 last unless ($rdline);
1227 if ($rdline =~ /^%/) {
1228 $rdvector = $1 if ($rdline =~ /^%VECTOR\t(.+)$/);
1229 } else {
1230 $rdattrs->{$1} = {$2} if ($line =~ /^(.+)=(.+)$/);
1231 }
1232 }
1233 while($rlnum < $lnum - 1) {
1234 my $rdline = $lines[$rlnum];
1235 $rlnum++;
1236 $rdsnippet .= "$rdline\n";
1237 }
1238 if (my $rduri = $rdattrs->{'@uri'}) {
1239 push @docs, new Search::Estraier::ResultDocument(
1240 uri => $rduri,
1241 attrs => $rdattrs,
1242 snippet => $rdsnippet,
1243 keywords => $rdvector,
1244 );
1245 }
1246 }
1247 $snum = $lnum;
1248 #warn "### $line\n";
1249 $isend = 1 if ($line =~ /:END$/);
1250 }
1251
1252 if (! $isend) {
1253 warn "received result doesn't have :END\n$resbody";
1254 return;
1255 }
1256 }
1257
1258 if (! $isend) {
1259 warn "received result doesn't have :END\n$resbody";
1260 return;
1261 }
1262
1263
1264 return new Search::Estraier::NodeResult( docs => \@docs, hints => $hints );
1265 }
1266
1267
1268 =head2 cond_to_query
1269
1270 my $args = $node->cond_to_query( $cond );
1271
1272 =cut
1273
1274 sub cond_to_query {
1275 my $self = shift;
1276
1277 my $cond = shift || return;
1278 croak "condition must be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1279
1280 my @args;
1281
1282 if (my $phrase = $cond->phrase) {
1283 push @args, 'phrase=' . uri_escape($phrase);
1284 }
1285
1286 if (my @attrs = $cond->attrs) {
1287 for my $i ( 0 .. $#attrs ) {
1288 push @args,'attr' . ($i+1) . '=' . uri_escape( $attrs[$i] );
1289 }
1290 }
1291
1292 if (my $order = $cond->order) {
1293 push @args, 'order=' . uri_escape($order);
1294 }
1295
1296 if (my $max = $cond->max) {
1297 push @args, 'max=' . $max;
1298 } else {
1299 push @args, 'max=' . (1 << 30);
1300 }
1301
1302 if (my $options = $cond->options) {
1303 push @args, 'options=' . $options;
1304 }
1305
1306 push @args, 'depth=' . $self->{depth} if ($self->{depth});
1307 push @args, 'wwidth=' . $self->{wwidth};
1308 push @args, 'hwidth=' . $self->{hwidth};
1309 push @args, 'awidth=' . $self->{awidth};
1310
1311 return join('&', @args);
1312 }
1313
1314
1315 =head2 shuttle_url
1316
1317 This is method which uses C<IO::Socket::INET> to communicate with Hyper Estraier node
1318 master.
1319
1320 my $rv = shuttle_url( $url, $content_type, $req_body, \$resbody );
1321
1322 C<$resheads> and C<$resbody> booleans controll if response headers and/or response
1323 body will be saved within object.
1324
1325 =cut
1326
1327 sub shuttle_url {
1328 my $self = shift;
1329
1330 my ($url, $content_type, $reqbody, $resbody) = @_;
1331
1332 $self->{status} = -1;
1333
1334 warn "## $url\n" if ($self->{debug});
1335
1336 $url = new URI($url);
1337 if (
1338 !$url || !$url->scheme || !$url->scheme eq 'http' ||
1339 !$url->host || !$url->port || $url->port < 1
1340 ) {
1341 carp "can't parse $url\n";
1342 return -1;
1343 }
1344
1345 my ($host,$port,$query) = ($url->host, $url->port, $url->path);
1346
1347 if ($self->{pxhost}) {
1348 ($host,$port) = ($self->{pxhost}, $self->{pxport});
1349 $query = "http://$host:$port/$query";
1350 }
1351
1352 $query .= '?' . $url->query if ($url->query && ! $reqbody);
1353
1354 my $headers;
1355
1356 if ($reqbody) {
1357 $headers .= "POST $query HTTP/1.0\r\n";
1358 } else {
1359 $headers .= "GET $query HTTP/1.0\r\n";
1360 }
1361
1362 $headers .= "Host: " . $url->host . ":" . $url->port . "\r\n";
1363 $headers .= "Connection: close\r\n";
1364 $headers .= "User-Agent: Search-Estraier/$Search::Estraier::VERSION\r\n";
1365 $headers .= "Content-Type: $content_type\r\n";
1366 $headers .= "Authorization: Basic $self->{auth}\r\n";
1367 my $len = 0;
1368 {
1369 use bytes;
1370 $len = length($reqbody) if ($reqbody);
1371 }
1372 $headers .= "Content-Length: $len\r\n";
1373 $headers .= "\r\n";
1374
1375 my $sock = IO::Socket::INET->new(
1376 PeerAddr => $host,
1377 PeerPort => $port,
1378 Proto => 'tcp',
1379 Timeout => $self->{timeout} || 90,
1380 );
1381
1382 if (! $sock) {
1383 carp "can't open socket to $host:$port";
1384 return -1;
1385 }
1386
1387 warn $headers if ($self->{debug});
1388
1389 print $sock $headers or
1390 carp "can't send headers to network:\n$headers\n" and return -1;
1391
1392 if ($reqbody) {
1393 warn "$reqbody\n" if ($self->{debug});
1394 print $sock $reqbody or
1395 carp "can't send request body to network:\n$$reqbody\n" and return -1;
1396 }
1397
1398 my $line = <$sock>;
1399 chomp($line);
1400 my ($schema, $res_status, undef) = split(/ */, $line, 3);
1401 return if ($schema !~ /^HTTP/ || ! $res_status);
1402
1403 $self->{status} = $res_status;
1404 warn "## response status: $res_status\n" if ($self->{debug});
1405
1406 # skip rest of headers
1407 $line = <$sock>;
1408 while ($line) {
1409 $line = <$sock>;
1410 $line =~ s/[\r\n]+$//;
1411 warn "## ", $line || 'NULL', " ##\n" if ($self->{debug});
1412 };
1413
1414 # read body
1415 $len = 0;
1416 do {
1417 $len = read($sock, my $buf, 8192);
1418 $$resbody .= $buf if ($resbody);
1419 } while ($len);
1420
1421 warn "## response body:\n$$resbody\n" if ($resbody && $self->{debug});
1422
1423 return $self->{status};
1424 }
1425
1426
1427 =head2 set_info
1428
1429 Set information for node
1430
1431 $node->set_info;
1432
1433 =cut
1434
1435 sub set_info {
1436 my $self = shift;
1437
1438 $self->{status} = -1;
1439 return unless ($self->{url});
1440
1441 my $resbody;
1442 my $rv = $self->shuttle_url( $self->{url} . '/inform',
1443 'text/plain',
1444 undef,
1445 \$resbody,
1446 );
1447
1448 return if ($rv != 200 || !$resbody);
1449
1450 chomp($resbody);
1451
1452 ( $self->{name}, $self->{label}, $self->{dnum}, $self->{wnum}, $self->{size} ) =
1453 split(/\t/, $resbody, 5);
1454
1455 }
1456
1457 ###
1458
1459 =head1 EXPORT
1460
1461 Nothing.
1462
1463 =head1 SEE ALSO
1464
1465 L<http://hyperestraier.sourceforge.net/>
1466
1467 Hyper Estraier Ruby interface on which this module is based.
1468
1469 =head1 AUTHOR
1470
1471 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
1472
1473
1474 =head1 COPYRIGHT AND LICENSE
1475
1476 Copyright (C) 2005-2006 by Dobrica Pavlinusic
1477
1478 This library is free software; you can redistribute it and/or modify
1479 it under the GPL v2 or later.
1480
1481 =cut
1482
1483 1;

  ViewVC Help
Powered by ViewVC 1.1.26