/[Search-Estraier]/trunk/Estraier.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/Estraier.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 48 - (show annotations)
Fri Jan 6 02:07:10 2006 UTC (18 years, 1 month ago) by dpavlin
File size: 21239 byte(s)
added name, label, doc_num, word_num and size properties for which I had to
implement set_info.
1 package Search::Estraier;
2
3 use 5.008;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.00';
8
9 =head1 NAME
10
11 Search::Estraier - pure perl module to use Hyper Estraier search engine
12
13 =head1 SYNOPSIS
14
15 use Search::Estraier;
16 my $est = new Search::Estraier();
17
18 =head1 DESCRIPTION
19
20 This module is implementation of node API of Hyper Estraier. Since it's
21 perl-only module with dependencies only on standard perl modules, it will
22 run on all platforms on which perl runs. It doesn't require compilation
23 or Hyper Estraier development files on target machine.
24
25 It is implemented as multiple packages which closly resamble Ruby
26 implementation. It also includes methods to manage nodes.
27
28 =cut
29
30 =head1 Inheritable common methods
31
32 This methods should really move somewhere else.
33
34 =head2 _s
35
36 Remove multiple whitespaces from string, as well as whitespaces at beginning or end
37
38 my $text = $self->_s(" this is a text ");
39 $text = 'this is a text';
40
41 =cut
42
43 sub _s {
44 my $text = $_[1] || return;
45 $text =~ s/\s\s+/ /gs;
46 $text =~ s/^\s+//;
47 $text =~ s/\s+$//;
48 return $text;
49 }
50
51 package Search::Estraier::Document;
52
53 use Carp qw/croak confess/;
54
55 use Search::Estraier;
56 our @ISA = qw/Search::Estraier/;
57
58 =head1 Search::Estraier::Document
59
60 This class implements Document which is collection of attributes
61 (key=value), vectors (also key value) display text and hidden text.
62
63
64 =head2 new
65
66 Create new document, empty or from draft.
67
68 my $doc = new Search::HyperEstraier::Document;
69 my $doc2 = new Search::HyperEstraier::Document( $draft );
70
71 =cut
72
73 sub new {
74 my $class = shift;
75 my $self = {};
76 bless($self, $class);
77
78 $self->{id} = -1;
79
80 my $draft = shift;
81
82 if ($draft) {
83 my $in_text = 0;
84 foreach my $line (split(/\n/, $draft)) {
85
86 if ($in_text) {
87 if ($line =~ /^\t/) {
88 push @{ $self->{htexts} }, substr($line, 1);
89 } else {
90 push @{ $self->{dtexts} }, $line;
91 }
92 next;
93 }
94
95 if ($line =~ m/^%VECTOR\t(.+)$/) {
96 my @fields = split(/\t/, $1);
97 for my $i ( 0 .. ($#fields - 1) ) {
98 $self->{kwords}->{ $fields[ $i ] } = $fields[ $i + 1 ];
99 $i++;
100 }
101 next;
102 } elsif ($line =~ m/^%/) {
103 # What is this? comment?
104 #warn "$line\n";
105 next;
106 } elsif ($line =~ m/^$/) {
107 $in_text = 1;
108 next;
109 } elsif ($line =~ m/^(.+)=(.+)$/) {
110 $self->{attrs}->{ $1 } = $2;
111 next;
112 }
113
114 warn "draft ignored: $line\n";
115 }
116 }
117
118 $self ? return $self : return undef;
119 }
120
121
122 =head2 add_attr
123
124 Add an attribute.
125
126 $doc->add_attr( name => 'value' );
127
128 Delete attribute using
129
130 $doc->add_attr( name => undef );
131
132 =cut
133
134 sub add_attr {
135 my $self = shift;
136 my $attrs = {@_};
137
138 while (my ($name, $value) = each %{ $attrs }) {
139 if (! defined($value)) {
140 delete( $self->{attrs}->{ $self->_s($name) } );
141 } else {
142 $self->{attrs}->{ $self->_s($name) } = $self->_s($value);
143 }
144 }
145
146 return 1;
147 }
148
149
150 =head2 add_text
151
152 Add a sentence of text.
153
154 $doc->add_text('this is example text to display');
155
156 =cut
157
158 sub add_text {
159 my $self = shift;
160 my $text = shift;
161 return unless defined($text);
162
163 push @{ $self->{dtexts} }, $self->_s($text);
164 }
165
166
167 =head2 add_hidden_text
168
169 Add a hidden sentence.
170
171 $doc->add_hidden_text('this is example text just for search');
172
173 =cut
174
175 sub add_hidden_text {
176 my $self = shift;
177 my $text = shift;
178 return unless defined($text);
179
180 push @{ $self->{htexts} }, $self->_s($text);
181 }
182
183
184 =head2 id
185
186 Get the ID number of document. If the object has never been registred, C<-1> is returned.
187
188 print $doc->id;
189
190 =cut
191
192 sub id {
193 my $self = shift;
194 return $self->{id};
195 }
196
197
198 =head2 attr_names
199
200 Returns array with attribute names from document object.
201
202 my @attrs = $doc->attr_names;
203
204 =cut
205
206 sub attr_names {
207 my $self = shift;
208 croak "attr_names return array, not scalar" if (! wantarray);
209 return sort keys %{ $self->{attrs} };
210 }
211
212
213 =head2 attr
214
215 Returns value of an attribute.
216
217 my $value = $doc->attr( 'attribute' );
218
219 =cut
220
221 sub attr {
222 my $self = shift;
223 my $name = shift;
224
225 return $self->{'attrs'}->{ $name };
226 }
227
228
229 =head2 texts
230
231 Returns array with text sentences.
232
233 my @texts = $doc->texts;
234
235 =cut
236
237 sub texts {
238 my $self = shift;
239 confess "texts return array, not scalar" if (! wantarray);
240 return @{ $self->{dtexts} };
241 }
242
243
244 =head2 cat_texts
245
246 Return whole text as single scalar.
247
248 my $text = $doc->cat_texts;
249
250 =cut
251
252 sub cat_texts {
253 my $self = shift;
254 return join(' ',@{ $self->{dtexts} });
255 }
256
257
258 =head2 dump_draft
259
260 Dump draft data from document object.
261
262 print $doc->dump_draft;
263
264 =cut
265
266 sub dump_draft {
267 my $self = shift;
268 my $draft;
269
270 foreach my $attr_name (sort keys %{ $self->{attrs} }) {
271 $draft .= $attr_name . '=' . $self->{attrs}->{$attr_name} . "\n";
272 }
273
274 if ($self->{kwords}) {
275 $draft .= '%%VECTOR';
276 while (my ($key, $value) = each %{ $self->{kwords} }) {
277 $draft .= "\t$key\t$value";
278 }
279 $draft .= "\n";
280 }
281
282 $draft .= "\n";
283
284 $draft .= join("\n", @{ $self->{dtexts} }) . "\n" if ($self->{dtexts});
285 $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n" if ($self->{htexts});
286
287 return $draft;
288 }
289
290
291 =head2 delete
292
293 Empty document object
294
295 $doc->delete;
296
297 This function is addition to original Ruby API, and since it was included in C wrappers it's here as a
298 convinience. Document objects which go out of scope will be destroyed
299 automatically.
300
301 =cut
302
303 sub delete {
304 my $self = shift;
305
306 foreach my $data (qw/attrs dtexts stexts kwords/) {
307 delete($self->{$data});
308 }
309
310 $self->{id} = -1;
311
312 return 1;
313 }
314
315
316
317 package Search::Estraier::Condition;
318
319 use Carp qw/confess croak/;
320
321 use Search::Estraier;
322 our @ISA = qw/Search::Estraier/;
323
324 =head1 Search::Estraier::Condition
325
326 =head2 new
327
328 my $cond = new Search::HyperEstraier::Condition;
329
330 =cut
331
332 sub new {
333 my $class = shift;
334 my $self = {};
335 bless($self, $class);
336
337 $self->{max} = -1;
338 $self->{options} = 0;
339
340 $self ? return $self : return undef;
341 }
342
343
344 =head2 set_phrase
345
346 $cond->set_phrase('search phrase');
347
348 =cut
349
350 sub set_phrase {
351 my $self = shift;
352 $self->{phrase} = $self->_s( shift );
353 }
354
355
356 =head2 add_attr
357
358 $cond->add_attr('@URI STRINC /~dpavlin/');
359
360 =cut
361
362 sub add_attr {
363 my $self = shift;
364 my $attr = shift || return;
365 push @{ $self->{attrs} }, $self->_s( $attr );
366 }
367
368
369 =head2 set_order
370
371 $cond->set_order('@mdate NUMD');
372
373 =cut
374
375 sub set_order {
376 my $self = shift;
377 $self->{order} = shift;
378 }
379
380
381 =head2 set_max
382
383 $cond->set_max(42);
384
385 =cut
386
387 sub set_max {
388 my $self = shift;
389 my $max = shift;
390 croak "set_max needs number, not '$max'" unless ($max =~ m/^\d+$/);
391 $self->{max} = $max;
392 }
393
394
395 =head2 set_options
396
397 $cond->set_options( SURE => 1 );
398
399 =cut
400
401 my $options = {
402 # check N-gram keys skipping by three
403 SURE => 1 << 0,
404 # check N-gram keys skipping by two
405 USUAL => 1 << 1,
406 # without TF-IDF tuning
407 FAST => 1 << 2,
408 # with the simplified phrase
409 AGITO => 1 << 3,
410 # check every N-gram key
411 NOIDF => 1 << 4,
412 # check N-gram keys skipping by one
413 SIMPLE => 1 << 10,
414 };
415
416 sub set_options {
417 my $self = shift;
418 my $option = shift;
419 confess "unknown option" unless ($options->{$option});
420 $self->{options} ||= $options->{$option};
421 }
422
423
424 =head2 phrase
425
426 Return search phrase.
427
428 print $cond->phrase;
429
430 =cut
431
432 sub phrase {
433 my $self = shift;
434 return $self->{phrase};
435 }
436
437
438 =head2 order
439
440 Return search result order.
441
442 print $cond->order;
443
444 =cut
445
446 sub order {
447 my $self = shift;
448 return $self->{order};
449 }
450
451
452 =head2 attrs
453
454 Return search result attrs.
455
456 my @cond_attrs = $cond->attrs;
457
458 =cut
459
460 sub attrs {
461 my $self = shift;
462 #croak "attrs return array, not scalar" if (! wantarray);
463 return @{ $self->{attrs} };
464 }
465
466
467 =head2 max
468
469 Return maximum number of results.
470
471 print $cond->max;
472
473 C<-1> is returned for unitialized value, C<0> is unlimited.
474
475 =cut
476
477 sub max {
478 my $self = shift;
479 return $self->{max};
480 }
481
482
483 =head2 options
484
485 Return options for this condition.
486
487 print $cond->options;
488
489 Options are returned in numerical form.
490
491 =cut
492
493 sub options {
494 my $self = shift;
495 return $self->{options};
496 }
497
498
499 package Search::Estraier::ResultDocument;
500
501 use Carp qw/croak/;
502
503 #use Search::Estraier;
504 #our @ISA = qw/Search::Estraier/;
505
506 =head1 Search::Estraier::ResultDocument
507
508 =head2 new
509
510 my $rdoc = new Search::HyperEstraier::ResultDocument(
511 uri => 'http://localhost/document/uri/42',
512 attrs => {
513 foo => 1,
514 bar => 2,
515 },
516 snippet => 'this is a text of snippet'
517 keywords => 'this\tare\tkeywords'
518 );
519
520 =cut
521
522 sub new {
523 my $class = shift;
524 my $self = {@_};
525 bless($self, $class);
526
527 foreach my $f (qw/uri attrs snippet keywords/) {
528 croak "missing $f for ResultDocument" unless defined($self->{$f});
529 }
530
531 $self ? return $self : return undef;
532 }
533
534
535 =head2 uri
536
537 Return URI of result document
538
539 print $rdoc->uri;
540
541 =cut
542
543 sub uri {
544 my $self = shift;
545 return $self->{uri};
546 }
547
548
549 =head2 attr_names
550
551 Returns array with attribute names from result document object.
552
553 my @attrs = $rdoc->attr_names;
554
555 =cut
556
557 sub attr_names {
558 my $self = shift;
559 croak "attr_names return array, not scalar" if (! wantarray);
560 return sort keys %{ $self->{attrs} };
561 }
562
563
564 =head2 attr
565
566 Returns value of an attribute.
567
568 my $value = $rdoc->attr( 'attribute' );
569
570 =cut
571
572 sub attr {
573 my $self = shift;
574 my $name = shift || return;
575 return $self->{attrs}->{ $name };
576 }
577
578
579 =head2 snippet
580
581 Return snippet from result document
582
583 print $rdoc->snippet;
584
585 =cut
586
587 sub snippet {
588 my $self = shift;
589 return $self->{snippet};
590 }
591
592
593 =head2 keywords
594
595 Return keywords from result document
596
597 print $rdoc->keywords;
598
599 =cut
600
601 sub keywords {
602 my $self = shift;
603 return $self->{keywords};
604 }
605
606
607 package Search::Estraier::NodeResult;
608
609 use Carp qw/croak/;
610
611 #use Search::Estraier;
612 #our @ISA = qw/Search::Estraier/;
613
614 =head1 Search::Estraier::NodeResult
615
616 =head2 new
617
618 my $res = new Search::HyperEstraier::NodeResult(
619 docs => @array_of_rdocs,
620 hits => %hash_with_hints,
621 );
622
623 =cut
624
625 sub new {
626 my $class = shift;
627 my $self = {@_};
628 bless($self, $class);
629
630 foreach my $f (qw/docs hints/) {
631 croak "missing $f for ResultDocument" unless defined($self->{$f});
632 }
633
634 $self ? return $self : return undef;
635 }
636
637
638 =head2 doc_num
639
640 Return number of documents
641
642 print $res->doc_num;
643
644 =cut
645
646 sub doc_num {
647 my $self = shift;
648 return $#{$self->{docs}};
649 }
650
651
652 =head2 get_doc
653
654 Return single document
655
656 my $doc = $res->get_doc( 42 );
657
658 Returns undef if document doesn't exist.
659
660 =cut
661
662 sub get_doc {
663 my $self = shift;
664 my $num = shift;
665 croak "expect number as argument, not '$num'" unless ($num =~ m/^\d+$/);
666 return undef if ($num < 0 || $num > $self->{docs});
667 return $self->{docs}->[$num];
668 }
669
670
671 =head2 hint
672
673 Return specific hint from results.
674
675 print $rec->hint( 'VERSION' );
676
677 Possible hints are: C<VERSION>, C<NODE>, C<HIT>, C<HINT#n>, C<DOCNUM>, C<WORDNUM>,
678 C<TIME>, C<LINK#n>, C<VIEW>.
679
680 =cut
681
682 sub hint {
683 my $self = shift;
684 my $key = shift || return;
685 return $self->{hints}->{$key};
686 }
687
688
689 package Search::Estraier::Node;
690
691 use Carp qw/carp croak confess/;
692 use URI;
693 use MIME::Base64;
694 use IO::Socket::INET;
695
696 =head1 Search::Estraier::Node
697
698 =head2 new
699
700 my $node = new Search::HyperEstraier::Node;
701
702 =cut
703
704 sub new {
705 my $class = shift;
706 my $self = {
707 pxport => -1,
708 timeout => 0, # this used to be -1
709 dnum => -1,
710 wnum => -1,
711 size => -1.0,
712 wwidth => 480,
713 hwidth => 96,
714 awidth => 96,
715 status => -1,
716 };
717 bless($self, $class);
718
719 if (@_) {
720 $self->{debug} = shift;
721 warn "## Node debug on\n";
722 }
723
724 $self ? return $self : return undef;
725 }
726
727
728 =head2 set_url
729
730 Specify URL to node server
731
732 $node->set_url('http://localhost:1978');
733
734 =cut
735
736 sub set_url {
737 my $self = shift;
738 $self->{url} = shift;
739 }
740
741
742 =head2 set_proxy
743
744 Specify proxy server to connect to node server
745
746 $node->set_proxy('proxy.example.com', 8080);
747
748 =cut
749
750 sub set_proxy {
751 my $self = shift;
752 my ($host,$port) = @_;
753 croak "proxy port must be number, not '$port'" unless ($port =~ m/^\d+$/);
754 $self->{pxhost} = $host;
755 $self->{pxport} = $port;
756 }
757
758
759 =head2 set_timeout
760
761 Specify timeout of connection in seconds
762
763 $node->set_timeout( 15 );
764
765 =cut
766
767 sub set_timeout {
768 my $self = shift;
769 my $sec = shift;
770 croak "timeout must be number, not '$sec'" unless ($sec =~ m/^\d+$/);
771 $self->{timeout} = $sec;
772 }
773
774
775 =head2 set_auth
776
777 Specify name and password for authentication to node server.
778
779 $node->set_auth('clint','eastwood');
780
781 =cut
782
783 sub set_auth {
784 my $self = shift;
785 my ($login,$passwd) = @_;
786 my $basic_auth = encode_base64( "$login:$passwd" );
787 chomp($basic_auth);
788 $self->{auth} = $basic_auth;
789 }
790
791
792 =head2 status
793
794 Return status code of last request.
795
796 print $node->status;
797
798 C<-1> means connection failure.
799
800 =cut
801
802 sub status {
803 my $self = shift;
804 return $self->{status};
805 }
806
807
808 =head2 put_doc
809
810 Add a document
811
812 $node->put_doc( $document_draft ) or die "can't add document";
813
814 Return true on success or false on failture.
815
816 =cut
817
818 sub put_doc {
819 my $self = shift;
820 my $doc = shift || return;
821 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
822 $self->shuttle_url( $self->{url} . '/put_doc',
823 'text/x-estraier-draft',
824 $doc->dump_draft,
825 undef
826 ) == 200;
827 }
828
829
830 =head2 out_doc
831
832 Remove a document
833
834 $node->out_doc( document_id ) or "can't remove document";
835
836 Return true on success or false on failture.
837
838 =cut
839
840 sub out_doc {
841 my $self = shift;
842 my $id = shift || return;
843 return unless ($self->{url});
844 croak "id must be number, not '$id'" unless ($id =~ m/^\d+$/);
845 $self->shuttle_url( $self->{url} . '/out_doc',
846 'application/x-www-form-urlencoded',
847 "id=$id",
848 undef
849 ) == 200;
850 }
851
852
853 =head2 out_doc_by_uri
854
855 Remove a registrated document using it's uri
856
857 $node->out_doc_by_uri( 'file:///document/uri/42' ) or "can't remove document";
858
859 Return true on success or false on failture.
860
861 =cut
862
863 sub out_doc_by_uri {
864 my $self = shift;
865 my $uri = shift || return;
866 return unless ($self->{url});
867 $self->shuttle_url( $self->{url} . '/out_doc',
868 'application/x-www-form-urlencoded',
869 "uri=$uri",
870 undef
871 ) == 200;
872 }
873
874
875 =head2 edit_doc
876
877 Edit attributes of a document
878
879 $node->edit_doc( $document_draft ) or die "can't edit document";
880
881 Return true on success or false on failture.
882
883 =cut
884
885 sub edit_doc {
886 my $self = shift;
887 my $doc = shift || return;
888 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
889 $self->shuttle_url( $self->{url} . '/edit_doc',
890 'text/x-estraier-draft',
891 $doc->dump_draft,
892 undef
893 ) == 200;
894 }
895
896
897 =head2 get_doc
898
899 Retreive document
900
901 my $doc = $node->get_doc( document_id ) or die "can't get document";
902
903 Return true on success or false on failture.
904
905 =cut
906
907 sub get_doc {
908 my $self = shift;
909 my $id = shift || return;
910 return $self->_fetch_doc( id => $id );
911 }
912
913
914 =head2 get_doc_by_uri
915
916 Retreive document
917
918 my $doc = $node->get_doc_by_uri( 'file:///document/uri/42' ) or die "can't get document";
919
920 Return true on success or false on failture.
921
922 =cut
923
924 sub get_doc_by_uri {
925 my $self = shift;
926 my $uri = shift || return;
927 return $self->_fetch_doc( uri => $uri );
928 }
929
930
931 =head2 etch_doc
932
933 Exctract document keywords
934
935 my $keywords = $node->etch_doc( document_id ) or die "can't etch document";
936
937 =cut
938
939 sub erch_doc {
940 my $self = shift;
941 my $id = shift || return;
942 return $self->_fetch_doc( id => $id, etch => 1 );
943 }
944
945 =head2 etch_doc_by_uri
946
947 Retreive document
948
949 my $keywords = $node->etch_doc_by_uri( 'file:///document/uri/42' ) or die "can't etch document";
950
951 Return true on success or false on failture.
952
953 =cut
954
955 sub etch_doc_by_uri {
956 my $self = shift;
957 my $uri = shift || return;
958 return $self->_fetch_doc( uri => $uri, etch => 1 );
959 }
960
961
962 =head2 uri_to_id
963
964 Get ID of document specified by URI
965
966 my $id = $node->uri_to_id( 'file:///document/uri/42' );
967
968 =cut
969
970 sub uri_to_id {
971 my $self = shift;
972 my $uri = shift || return;
973 return $self->_fetch_doc( uri => $uri, path => '/uri_to_id', chomp_resbody => 1 );
974 }
975
976
977 =head2 _fetch_doc
978
979 Private function used for implementing of C<get_doc>, C<get_doc_by_uri>,
980 C<etch_doc>, C<etch_doc_by_uri>.
981
982 # this will decode received draft into Search::Estraier::Document object
983 my $doc = $node->_fetch_doc( id => 42 );
984 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42' );
985
986 # to extract keywords, add etch
987 my $doc = $node->_fetch_doc( id => 42, etch => 1 );
988 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', etch => 1 );
989
990 # more general form which allows implementation of
991 # uri_to_id
992 my $id = $node->_fetch_doc(
993 uri => 'file:///document/uri/42',
994 path => '/uri_to_id',
995 chomp_resbody => 1
996 );
997
998 =cut
999
1000 sub _fetch_doc {
1001 my $self = shift;
1002 my $a = {@_};
1003 return unless ( ($a->{id} || $a->{uri}) && $self->{url} );
1004
1005 my ($arg, $resbody);
1006
1007 my $path = $a->{path} || '/get_doc';
1008 $path = '/etch_doc' if ($a->{etch});
1009
1010 if ($a->{id}) {
1011 croak "id must be numberm not '$a->{id}'" unless ($a->{id} =~ m/^\d+$/);
1012 $arg = 'id=' . $a->{id};
1013 } elsif ($a->{uri}) {
1014 $arg = 'uri=' . $a->{uri};
1015 } else {
1016 confess "unhandled argument. Need id or uri.";
1017 }
1018
1019 my $rv = $self->shuttle_url( $self->{url} . $path,
1020 'application/x-www-form-urlencoded',
1021 $arg,
1022 \$resbody,
1023 );
1024
1025 return if ($rv != 200);
1026
1027 if ($a->{etch}) {
1028 $self->{kwords} = {};
1029 return +{} unless ($resbody);
1030 foreach my $l (split(/\n/, $resbody)) {
1031 my ($k,$v) = split(/\t/, $l, 2);
1032 $self->{kwords}->{$k} = $v if ($v);
1033 }
1034 return $self->{kwords};
1035 } elsif ($a->{chomp_resbody}) {
1036 return unless (defined($resbody));
1037 chomp($resbody);
1038 return $resbody;
1039 } else {
1040 return new Search::Estraier::Document($resbody);
1041 }
1042 }
1043
1044
1045 =head2 name
1046
1047 my $node_name = $node->name;
1048
1049 =cut
1050
1051 sub name {
1052 my $self = shift;
1053 $self->set_info unless ($self->{name});
1054 return $self->{name};
1055 }
1056
1057
1058 =head2 label
1059
1060 my $node_label = $node->label;
1061
1062 =cut
1063
1064 sub label {
1065 my $self = shift;
1066 $self->set_info unless ($self->{label});
1067 return $self->{label};
1068 }
1069
1070
1071 =head2 doc_num
1072
1073 my $documents_in_node = $node->doc_num;
1074
1075 =cut
1076
1077 sub doc_num {
1078 my $self = shift;
1079 $self->set_info if ($self->{dnum} < 0);
1080 return $self->{dnum};
1081 }
1082
1083
1084 =head2 word_num
1085
1086 my $words_in_node = $node->word_num;
1087
1088 =cut
1089
1090 sub word_num {
1091 my $self = shift;
1092 $self->set_info if ($self->{wnum} < 0);
1093 return $self->{wnum};
1094 }
1095
1096
1097 =head2 size
1098
1099 my $node_size = $node->size;
1100
1101 =cut
1102
1103 sub size {
1104 my $self = shift;
1105 $self->set_info if ($self->{size} < 0);
1106 return $self->{size};
1107 }
1108
1109
1110
1111 =head2 shuttle_url
1112
1113 This is method which uses C<IO::Socket::INET> to communicate with Hyper Estraier node
1114 master.
1115
1116 my $rv = shuttle_url( $url, $content_type, \$req_body, \$resbody );
1117
1118 C<$resheads> and C<$resbody> booleans controll if response headers and/or response
1119 body will be saved within object.
1120
1121 =cut
1122
1123 sub shuttle_url {
1124 my $self = shift;
1125
1126 my ($url, $content_type, $reqbody, $resbody) = @_;
1127
1128 $self->{status} = -1;
1129
1130 warn "## $url\n" if ($self->{debug});
1131
1132 $url = new URI($url);
1133 if (
1134 !$url || !$url->scheme || !$url->scheme eq 'http' ||
1135 !$url->host || !$url->port || $url->port < 1
1136 ) {
1137 carp "can't parse $url\n";
1138 return -1;
1139 }
1140
1141 my ($host,$port,$query) = ($url->host, $url->port, $url->path);
1142
1143 if ($self->{pxhost}) {
1144 ($host,$port) = ($self->{pxhost}, $self->{pxport});
1145 $query = "http://$host:$port/$query";
1146 }
1147
1148 $query .= '?' . $url->query if ($url->query && ! $reqbody);
1149
1150 my $headers;
1151
1152 if ($reqbody) {
1153 $headers .= "POST $query HTTP/1.0\r\n";
1154 } else {
1155 $headers .= "GET $query HTTP/1.0\r\n";
1156 }
1157
1158 $headers .= "Host: " . $url->host . ":" . $url->port . "\r\n";
1159 $headers .= "Connection: close\r\n";
1160 $headers .= "User-Agent: Search-Estraier/$Search::Estraier::VERSION\r\n";
1161 $headers .= "Content-Type: $content_type\r\n";
1162 $headers .= "Authorization: Basic $self->{auth}\r\n";
1163 my $len = 0;
1164 {
1165 use bytes;
1166 $len = length($reqbody) if ($reqbody);
1167 }
1168 $headers .= "Content-Length: $len\r\n";
1169 $headers .= "\r\n";
1170
1171 my $sock = IO::Socket::INET->new(
1172 PeerAddr => $host,
1173 PeerPort => $port,
1174 Proto => 'tcp',
1175 Timeout => $self->{timeout} || 90,
1176 );
1177
1178 if (! $sock) {
1179 carp "can't open socket to $host:$port";
1180 return -1;
1181 }
1182
1183 warn $headers if ($self->{debug});
1184
1185 print $sock $headers or
1186 carp "can't send headers to network:\n$headers\n" and return -1;
1187
1188 if ($reqbody) {
1189 warn "$reqbody\n" if ($self->{debug});
1190 print $sock $reqbody or
1191 carp "can't send request body to network:\n$$reqbody\n" and return -1;
1192 }
1193
1194 my $line = <$sock>;
1195 chomp($line);
1196 my ($schema, $res_status, undef) = split(/ */, $line, 3);
1197 return if ($schema !~ /^HTTP/ || ! $res_status);
1198
1199 $self->{status} = $res_status;
1200 warn "## response status: $res_status\n" if ($self->{debug});
1201
1202 # skip rest of headers
1203 $line = <$sock>;
1204 while ($line) {
1205 $line = <$sock>;
1206 $line =~ s/[\r\n]+$//;
1207 warn "## ", $line || 'NULL', " ##\n" if ($self->{debug});
1208 };
1209
1210 # read body
1211 $len = 0;
1212 do {
1213 $len = read($sock, my $buf, 8192);
1214 $$resbody .= $buf if ($resbody);
1215 } while ($len);
1216
1217 warn "## response body:\n$$resbody\n" if ($resbody && $self->{debug});
1218
1219 return $self->{status};
1220 }
1221
1222
1223 =head2 set_info
1224
1225 Set information for node
1226
1227 $node->set_info;
1228
1229 =cut
1230
1231 sub set_info {
1232 my $self = shift;
1233
1234 $self->{status} = -1;
1235 return unless ($self->{url});
1236
1237 my $resbody;
1238 my $rv = $self->shuttle_url( $self->{url} . '/inform',
1239 'text/plain',
1240 undef,
1241 \$resbody,
1242 );
1243
1244 return if ($rv != 200 || !$resbody);
1245
1246 chomp($resbody);
1247
1248 ( $self->{name}, $self->{label}, $self->{dnum}, $self->{wnum}, $self->{size} ) =
1249 split(/\t/, $resbody, 5);
1250
1251 }
1252
1253 ###
1254
1255 =head1 EXPORT
1256
1257 Nothing.
1258
1259 =head1 SEE ALSO
1260
1261 L<http://hyperestraier.sourceforge.net/>
1262
1263 Hyper Estraier Ruby interface on which this module is based.
1264
1265 =head1 AUTHOR
1266
1267 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
1268
1269
1270 =head1 COPYRIGHT AND LICENSE
1271
1272 Copyright (C) 2005-2006 by Dobrica Pavlinusic
1273
1274 This library is free software; you can redistribute it and/or modify
1275 it under the GPL v2 or later.
1276
1277 =cut
1278
1279 1;

  ViewVC Help
Powered by ViewVC 1.1.26