/[Search-Estraier]/trunk/Estraier.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/Estraier.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 41 - (show annotations)
Thu Jan 5 23:32:31 2006 UTC (18 years, 2 months ago) by dpavlin
File size: 16313 byte(s)
out_doc, out_doc_by_uri
1 package Search::Estraier;
2
3 use 5.008;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.00';
8
9 =head1 NAME
10
11 Search::Estraier - pure perl module to use Hyper Estraier search engine
12
13 =head1 SYNOPSIS
14
15 use Search::Estraier;
16 my $est = new Search::Estraier();
17
18 =head1 DESCRIPTION
19
20 This module is implementation of node API of Hyper Estraier. Since it's
21 perl-only module with dependencies only on standard perl modules, it will
22 run on all platforms on which perl runs. It doesn't require compilation
23 or Hyper Estraier development files on target machine.
24
25 It is implemented as multiple packages which closly resamble Ruby
26 implementation. It also includes methods to manage nodes.
27
28 =cut
29
30 =head2 _s
31
32 Remove multiple whitespaces from string, as well as whitespaces at beginning or end
33
34 my $text = $self->_s(" this is a text ");
35 $text = 'this is a text';
36
37 =cut
38
39 sub _s {
40 my $text = $_[1] || return;
41 $text =~ s/\s\s+/ /gs;
42 $text =~ s/^\s+//;
43 $text =~ s/\s+$//;
44 return $text;
45 }
46
47 package Search::Estraier::Document;
48
49 use Carp qw/croak confess/;
50
51 use Search::Estraier;
52 our @ISA = qw/Search::Estraier/;
53
54 =head1 Search::Estraier::Document
55
56 This class implements Document which is collection of attributes
57 (key=value), vectors (also key value) display text and hidden text.
58
59 =head2 new
60
61 Create new document, empty or from draft.
62
63 my $doc = new Search::HyperEstraier::Document;
64 my $doc2 = new Search::HyperEstraier::Document( $draft );
65
66 =cut
67
68 sub new {
69 my $class = shift;
70 my $self = {};
71 bless($self, $class);
72
73 $self->{id} = -1;
74
75 my $draft = shift;
76
77 if ($draft) {
78 my $in_text = 0;
79 foreach my $line (split(/\n/, $draft)) {
80
81 if ($in_text) {
82 if ($line =~ /^\t/) {
83 push @{ $self->{htexts} }, substr($line, 1);
84 } else {
85 push @{ $self->{dtexts} }, $line;
86 }
87 next;
88 }
89
90 if ($line =~ m/^%VECTOR\t(.+)$/) {
91 my @fields = split(/\t/, $1);
92 for my $i ( 0 .. ($#fields - 1) ) {
93 $self->{kwords}->{ $fields[ $i ] } = $fields[ $i + 1 ];
94 $i++;
95 }
96 next;
97 } elsif ($line =~ m/^%/) {
98 # What is this? comment?
99 #warn "$line\n";
100 next;
101 } elsif ($line =~ m/^$/) {
102 $in_text = 1;
103 next;
104 } elsif ($line =~ m/^(.+)=(.+)$/) {
105 $self->{attrs}->{ $1 } = $2;
106 next;
107 }
108
109 warn "draft ignored: $line\n";
110 }
111 }
112
113 $self ? return $self : return undef;
114 }
115
116
117 =head2 add_attr
118
119 Add an attribute.
120
121 $doc->add_attr( name => 'value' );
122
123 Delete attribute using
124
125 $doc->add_attr( name => undef );
126
127 =cut
128
129 sub add_attr {
130 my $self = shift;
131 my $attrs = {@_};
132
133 while (my ($name, $value) = each %{ $attrs }) {
134 if (! defined($value)) {
135 delete( $self->{attrs}->{ $self->_s($name) } );
136 } else {
137 $self->{attrs}->{ $self->_s($name) } = $self->_s($value);
138 }
139 }
140
141 return 1;
142 }
143
144
145 =head2 add_text
146
147 Add a sentence of text.
148
149 $doc->add_text('this is example text to display');
150
151 =cut
152
153 sub add_text {
154 my $self = shift;
155 my $text = shift;
156 return unless defined($text);
157
158 push @{ $self->{dtexts} }, $self->_s($text);
159 }
160
161
162 =head2 add_hidden_text
163
164 Add a hidden sentence.
165
166 $doc->add_hidden_text('this is example text just for search');
167
168 =cut
169
170 sub add_hidden_text {
171 my $self = shift;
172 my $text = shift;
173 return unless defined($text);
174
175 push @{ $self->{htexts} }, $self->_s($text);
176 }
177
178 =head2 id
179
180 Get the ID number of document. If the object has never been registred, C<-1> is returned.
181
182 print $doc->id;
183
184 =cut
185
186 sub id {
187 my $self = shift;
188 return $self->{id};
189 }
190
191 =head2 attr_names
192
193 Returns array with attribute names from document object.
194
195 my @attrs = $doc->attr_names;
196
197 =cut
198
199 sub attr_names {
200 my $self = shift;
201 croak "attr_names return array, not scalar" if (! wantarray);
202 return sort keys %{ $self->{attrs} };
203 }
204
205
206 =head2 attr
207
208 Returns value of an attribute.
209
210 my $value = $doc->attr( 'attribute' );
211
212 =cut
213
214 sub attr {
215 my $self = shift;
216 my $name = shift;
217
218 return $self->{'attrs'}->{ $name };
219 }
220
221
222 =head2 texts
223
224 Returns array with text sentences.
225
226 my @texts = $doc->texts;
227
228 =cut
229
230 sub texts {
231 my $self = shift;
232 confess "texts return array, not scalar" if (! wantarray);
233 return @{ $self->{dtexts} };
234 }
235
236 =head2 cat_texts
237
238 Return whole text as single scalar.
239
240 my $text = $doc->cat_texts;
241
242 =cut
243
244 sub cat_texts {
245 my $self = shift;
246 return join(' ',@{ $self->{dtexts} });
247 }
248
249 =head2 dump_draft
250
251 Dump draft data from document object.
252
253 print $doc->dump_draft;
254
255 =cut
256
257 sub dump_draft {
258 my $self = shift;
259 my $draft;
260
261 foreach my $attr_name (sort keys %{ $self->{attrs} }) {
262 $draft .= $attr_name . '=' . $self->{attrs}->{$attr_name} . "\n";
263 }
264
265 if ($self->{kwords}) {
266 $draft .= '%%VECTOR';
267 while (my ($key, $value) = each %{ $self->{kwords} }) {
268 $draft .= "\t$key\t$value";
269 }
270 $draft .= "\n";
271 }
272
273 $draft .= "\n";
274
275 $draft .= join("\n", @{ $self->{dtexts} }) . "\n" if ($self->{dtexts});
276 $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n" if ($self->{htexts});
277
278 return $draft;
279 }
280
281 =head2 delete
282
283 Empty document object
284
285 $doc->delete;
286
287 This function is addition to original Ruby API, and since it was included in C wrappers it's here as a
288 convinience. Document objects which go out of scope will be destroyed
289 automatically.
290
291 =cut
292
293 sub delete {
294 my $self = shift;
295
296 foreach my $data (qw/attrs dtexts stexts kwords/) {
297 delete($self->{$data});
298 }
299
300 $self->{id} = -1;
301
302 return 1;
303 }
304
305
306
307 package Search::Estraier::Condition;
308
309 use Carp qw/confess croak/;
310
311 use Search::Estraier;
312 our @ISA = qw/Search::Estraier/;
313
314 =head1 Search::Estraier::Condition
315
316 =head2 new
317
318 my $cond = new Search::HyperEstraier::Condition;
319
320 =cut
321
322 sub new {
323 my $class = shift;
324 my $self = {};
325 bless($self, $class);
326
327 $self->{max} = -1;
328 $self->{options} = 0;
329
330 $self ? return $self : return undef;
331 }
332
333 =head2 set_phrase
334
335 $cond->set_phrase('search phrase');
336
337 =cut
338
339 sub set_phrase {
340 my $self = shift;
341 $self->{phrase} = $self->_s( shift );
342 }
343
344 =head2 add_attr
345
346 $cond->add_attr('@URI STRINC /~dpavlin/');
347
348 =cut
349
350 sub add_attr {
351 my $self = shift;
352 my $attr = shift || return;
353 push @{ $self->{attrs} }, $self->_s( $attr );
354 }
355
356 =head2 set_order
357
358 $cond->set_order('@mdate NUMD');
359
360 =cut
361
362 sub set_order {
363 my $self = shift;
364 $self->{order} = shift;
365 }
366
367 =head2 set_max
368
369 $cond->set_max(42);
370
371 =cut
372
373 sub set_max {
374 my $self = shift;
375 my $max = shift;
376 croak "set_max needs number" unless ($max =~ m/^\d+$/);
377 $self->{max} = $max;
378 }
379
380 =head2 set_options
381
382 $cond->set_options( SURE => 1 );
383
384 =cut
385
386 my $options = {
387 # check N-gram keys skipping by three
388 SURE => 1 << 0,
389 # check N-gram keys skipping by two
390 USUAL => 1 << 1,
391 # without TF-IDF tuning
392 FAST => 1 << 2,
393 # with the simplified phrase
394 AGITO => 1 << 3,
395 # check every N-gram key
396 NOIDF => 1 << 4,
397 # check N-gram keys skipping by one
398 SIMPLE => 1 << 10,
399 };
400
401 sub set_options {
402 my $self = shift;
403 my $option = shift;
404 confess "unknown option" unless ($options->{$option});
405 $self->{options} ||= $options->{$option};
406 }
407
408 =head2 phrase
409
410 Return search phrase.
411
412 print $cond->phrase;
413
414 =cut
415
416 sub phrase {
417 my $self = shift;
418 return $self->{phrase};
419 }
420
421 =head2 order
422
423 Return search result order.
424
425 print $cond->order;
426
427 =cut
428
429 sub order {
430 my $self = shift;
431 return $self->{order};
432 }
433
434 =head2 attrs
435
436 Return search result attrs.
437
438 my @cond_attrs = $cond->attrs;
439
440 =cut
441
442 sub attrs {
443 my $self = shift;
444 #croak "attrs return array, not scalar" if (! wantarray);
445 return @{ $self->{attrs} };
446 }
447
448 =head2 max
449
450 Return maximum number of results.
451
452 print $cond->max;
453
454 C<-1> is returned for unitialized value, C<0> is unlimited.
455
456 =cut
457
458 sub max {
459 my $self = shift;
460 return $self->{max};
461 }
462
463 =head2 options
464
465 Return options for this condition.
466
467 print $cond->options;
468
469 Options are returned in numerical form.
470
471 =cut
472
473 sub options {
474 my $self = shift;
475 return $self->{options};
476 }
477
478
479 package Search::Estraier::ResultDocument;
480
481 use Carp qw/croak/;
482
483 #use Search::Estraier;
484 #our @ISA = qw/Search::Estraier/;
485
486 =head1 Search::Estraier::ResultDocument
487
488 =head2 new
489
490 my $rdoc = new Search::HyperEstraier::ResultDocument(
491 uri => 'http://localhost/document/uri/42',
492 attrs => {
493 foo => 1,
494 bar => 2,
495 },
496 snippet => 'this is a text of snippet'
497 keywords => 'this\tare\tkeywords'
498 );
499
500 =cut
501
502 sub new {
503 my $class = shift;
504 my $self = {@_};
505 bless($self, $class);
506
507 foreach my $f (qw/uri attrs snippet keywords/) {
508 croak "missing $f for ResultDocument" unless defined($self->{$f});
509 }
510
511 $self ? return $self : return undef;
512 }
513
514 =head2 uri
515
516 Return URI of result document
517
518 print $rdoc->uri;
519
520 =cut
521
522 sub uri {
523 my $self = shift;
524 return $self->{uri};
525 }
526
527
528 =head2 attr_names
529
530 Returns array with attribute names from result document object.
531
532 my @attrs = $rdoc->attr_names;
533
534 =cut
535
536 sub attr_names {
537 my $self = shift;
538 croak "attr_names return array, not scalar" if (! wantarray);
539 return sort keys %{ $self->{attrs} };
540 }
541
542 =head2 attr
543
544 Returns value of an attribute.
545
546 my $value = $rdoc->attr( 'attribute' );
547
548 =cut
549
550 sub attr {
551 my $self = shift;
552 my $name = shift || return;
553 return $self->{attrs}->{ $name };
554 }
555
556 =head2 snippet
557
558 Return snippet from result document
559
560 print $rdoc->snippet;
561
562 =cut
563
564 sub snippet {
565 my $self = shift;
566 return $self->{snippet};
567 }
568
569 =head2 keywords
570
571 Return keywords from result document
572
573 print $rdoc->keywords;
574
575 =cut
576
577 sub keywords {
578 my $self = shift;
579 return $self->{keywords};
580 }
581
582
583 package Search::Estraier::NodeResult;
584
585 use Carp qw/croak/;
586
587 #use Search::Estraier;
588 #our @ISA = qw/Search::Estraier/;
589
590 =head1 Search::Estraier::NodeResult
591
592 =head2 new
593
594 my $res = new Search::HyperEstraier::NodeResult(
595 docs => @array_of_rdocs,
596 hits => %hash_with_hints,
597 );
598
599 =cut
600
601 sub new {
602 my $class = shift;
603 my $self = {@_};
604 bless($self, $class);
605
606 foreach my $f (qw/docs hints/) {
607 croak "missing $f for ResultDocument" unless defined($self->{$f});
608 }
609
610 $self ? return $self : return undef;
611 }
612
613 =head2 doc_num
614
615 Return number of documents
616
617 print $res->doc_num;
618
619 =cut
620
621 sub doc_num {
622 my $self = shift;
623 return $#{$self->{docs}};
624 }
625
626 =head2 get_doc
627
628 Return single document
629
630 my $doc = $res->get_doc( 42 );
631
632 Returns undef if document doesn't exist.
633
634 =cut
635
636 sub get_doc {
637 my $self = shift;
638 my $num = shift;
639 croak "expect number as argument" unless ($num =~ m/^\d+$/);
640 return undef if ($num < 0 || $num > $self->{docs});
641 return $self->{docs}->[$num];
642 }
643
644 =head2 hint
645
646 Return specific hint from results.
647
648 print $rec->hint( 'VERSION' );
649
650 Possible hints are: C<VERSION>, C<NODE>, C<HIT>, C<HINT#n>, C<DOCNUM>, C<WORDNUM>,
651 C<TIME>, C<LINK#n>, C<VIEW>.
652
653 =cut
654
655 sub hint {
656 my $self = shift;
657 my $key = shift || return;
658 return $self->{hints}->{$key};
659 }
660
661
662 package Search::Estraier::Node;
663
664 use Carp qw/carp croak/;
665 use URI;
666 use MIME::Base64;
667 use IO::Socket::INET;
668
669 =head1 Search::Estraier::Node
670
671 =head2 new
672
673 my $node = new Search::HyperEstraier::Node;
674
675 =cut
676
677 sub new {
678 my $class = shift;
679 my $self = {
680 pxport => -1,
681 timeout => 0, # this used to be -1
682 dnum => -1,
683 wnum => -1,
684 size => -1.0,
685 wwidth => 480,
686 hwidth => 96,
687 awidth => 96,
688 status => -1,
689 };
690 bless($self, $class);
691
692 if (@_) {
693 $self->{debug} = shift;
694 warn "## Node debug on\n";
695 }
696
697 $self ? return $self : return undef;
698 }
699
700 =head2 set_url
701
702 Specify URL to node server
703
704 $node->set_url('http://localhost:1978');
705
706 =cut
707
708 sub set_url {
709 my $self = shift;
710 $self->{url} = shift;
711 }
712
713 =head2 set_proxy
714
715 Specify proxy server to connect to node server
716
717 $node->set_proxy('proxy.example.com', 8080);
718
719 =cut
720
721 sub set_proxy {
722 my $self = shift;
723 my ($host,$port) = @_;
724 croak "proxy port must be number" unless ($port =~ m/^\d+$/);
725 $self->{pxhost} = $host;
726 $self->{pxport} = $port;
727 }
728
729 =head2 set_timeout
730
731 Specify timeout of connection in seconds
732
733 $node->set_timeout( 15 );
734
735 =cut
736
737 sub set_timeout {
738 my $self = shift;
739 my $sec = shift;
740 croak "timeout must be number" unless ($sec =~ m/^\d+$/);
741 $self->{timeout} = $sec;
742 }
743
744 =head2 set_auth
745
746 Specify name and password for authentication to node server.
747
748 $node->set_auth('clint','eastwood');
749
750 =cut
751
752 sub set_auth {
753 my $self = shift;
754 my ($login,$passwd) = @_;
755 my $basic_auth = encode_base64( "$login:$passwd" );
756 chomp($basic_auth);
757 $self->{auth} = $basic_auth;
758 }
759
760 =head2 status
761
762 Return status code of last request.
763
764 print $node->status;
765
766 C<-1> means connection failure.
767
768 =cut
769
770 sub status {
771 my $self = shift;
772 return $self->{status};
773 }
774
775 =head2 put_doc
776
777 Add a document
778
779 $node->put_doc( $document_draft ) or die "can't add document";
780
781 Return true on success or false on failture.
782
783 =cut
784
785 sub put_doc {
786 my $self = shift;
787 my $doc = shift || return;
788 return unless ($self->{url});
789 $self->shuttle_url( $self->{url} . '/put_doc',
790 'text/x-estraier-draft',
791 $doc->dump_draft,
792 undef
793 ) == 200;
794 }
795
796
797 =head2 out_doc
798
799 Remove a document
800
801 $node->out_doc( document_id ) or "can't remove document";
802
803 Return true on success or false on failture.
804
805 =cut
806
807 sub out_doc {
808 my $self = shift;
809 my $id = shift || return;
810 return unless ($self->{url});
811 croak "id must be number" unless ($id =~ m/^\d+$/);
812 $self->shuttle_url( $self->{url} . '/out_doc',
813 'application/x-www-form-urlencoded',
814 "id=$id",
815 undef
816 ) == 200;
817 }
818
819
820 =head2 out_doc_by_uri
821
822 Remove a registrated document using it's uri
823
824 $node->out_doc_by_uri( 'file:///document_url' ) or "can't remove document";
825
826 Return true on success or false on failture.
827
828 =cut
829
830 sub out_doc_by_uri {
831 my $self = shift;
832 my $uri = shift || return;
833 return unless ($self->{url});
834 $self->shuttle_url( $self->{url} . '/out_doc',
835 'application/x-www-form-urlencoded',
836 "uri=$uri",
837 undef
838 ) == 200;
839 }
840
841 =head2 shuttle_url
842
843 This is method which uses C<IO::Socket::INET> to communicate with Hyper Estraier node
844 master.
845
846 my $rv = shuttle_url( $url, $content_type, \$req_body, \$resbody );
847
848 C<$resheads> and C<$resbody> booleans controll if response headers and/or response
849 body will be saved within object.
850
851 =cut
852
853 sub shuttle_url {
854 my $self = shift;
855
856 my ($url, $content_type, $reqbody, $resbody) = @_;
857
858 $self->{status} = -1;
859
860 warn "## $url\n" if ($self->{debug});
861
862 $url = new URI($url);
863 if (
864 !$url || !$url->scheme || !$url->scheme eq 'http' ||
865 !$url->host || !$url->port || $url->port < 1
866 ) {
867 carp "can't parse $url\n";
868 return -1;
869 }
870
871 my ($host,$port,$query) = ($url->host, $url->port, $url->path);
872
873 if ($self->{pxhost}) {
874 ($host,$port) = ($self->{pxhost}, $self->{pxport});
875 $query = "http://$host:$port/$query";
876 }
877
878 $query .= '?' . $url->query if ($url->query && ! $reqbody);
879
880 my $headers;
881
882 if ($reqbody) {
883 $headers .= "POST $query HTTP/1.0\r\n";
884 } else {
885 $headers .= "GET $query HTTP/1.0\r\n";
886 }
887
888 $headers .= "Host: " . $url->host . ":" . $url->port . "\r\n";
889 $headers .= "Connection: close\r\n";
890 $headers .= "User-Agent: Search-Estraier/$Search::Estraier::VERSION\r\n";
891 $headers .= "Content-Type: $content_type\r\n";
892 $headers .= "Authorization: Basic $self->{auth}\r\n";
893 my $len = 0;
894 {
895 use bytes;
896 $len = length($reqbody) if ($reqbody);
897 }
898 $headers .= "Content-Length: $len\r\n";
899 $headers .= "\r\n";
900
901 my $sock = IO::Socket::INET->new(
902 PeerAddr => $host,
903 PeerPort => $port,
904 Proto => 'tcp',
905 Timeout => $self->{timeout} || 90,
906 );
907
908 if (! $sock) {
909 carp "can't open socket to $host:$port";
910 return -1;
911 }
912
913 warn $headers if ($self->{debug});
914
915 print $sock $headers or
916 carp "can't send headers to network:\n$headers\n" and return -1;
917
918 if ($reqbody) {
919 warn "$reqbody\n" if ($self->{debug});
920 print $sock $reqbody or
921 carp "can't send request body to network:\n$$reqbody\n" and return -1;
922 }
923
924 my $line = <$sock>;
925 chomp($line);
926 my ($schema, $res_status, undef) = split(/ */, $line, 3);
927 return if ($schema !~ /^HTTP/ || ! $res_status);
928
929 $self->{status} = $res_status;
930 warn "## response status: $res_status\n" if ($self->{debug});
931
932 # skip rest of headers
933 $line = <$sock>;
934 while ($line) {
935 $line = <$sock>;
936 $line =~ s/[\r\n]+$//;
937 warn "## ", $line || 'NULL', " ##\n" if ($self->{debug});
938 };
939
940 # read body
941 $len = 0;
942 do {
943 $len = read($sock, my $buf, 8192);
944 $$resbody .= $buf if ($resbody);
945 } while ($len);
946
947 warn "## response body:\n$$resbody\n" if ($resbody && $self->{debug});
948
949 return $self->{status};
950 }
951
952 ###
953
954 =head1 EXPORT
955
956 Nothing.
957
958 =head1 SEE ALSO
959
960 L<http://hyperestraier.sourceforge.net/>
961
962 Hyper Estraier Ruby interface on which this module is based.
963
964 =head1 AUTHOR
965
966 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
967
968
969 =head1 COPYRIGHT AND LICENSE
970
971 Copyright (C) 2005-2006 by Dobrica Pavlinusic
972
973 This library is free software; you can redistribute it and/or modify
974 it under the GPL v2 or later.
975
976 =cut
977
978 1;

  ViewVC Help
Powered by ViewVC 1.1.26