/[Search-Estraier]/trunk/lib/Search/Estraier.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/Search/Estraier.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 37 - (hide annotations)
Thu Jan 5 22:16:21 2006 UTC (18 years, 2 months ago) by dpavlin
Original Path: trunk/Estraier.pm
File size: 14607 byte(s)
much better error messages
1 dpavlin 2 package Search::Estraier;
2    
3     use 5.008;
4     use strict;
5     use warnings;
6    
7     our $VERSION = '0.00';
8    
9     =head1 NAME
10    
11     Search::Estraier - pure perl module to use Hyper Estraier search engine
12    
13     =head1 SYNOPSIS
14    
15     use Search::Estraier;
16     my $est = new Search::Estraier();
17    
18     =head1 DESCRIPTION
19    
20     This module is implementation of node API of Hyper Estraier. Since it's
21     perl-only module with dependencies only on standard perl modules, it will
22     run on all platforms on which perl runs. It doesn't require compilation
23     or Hyper Estraier development files on target machine.
24    
25     It is implemented as multiple packages which closely resemble Ruby
26     implementation. It also includes methods to manage nodes.
27    
28     =cut
29    
30 dpavlin 15 =head2 _s
31    
32     Remove multiple whitespaces from string, as well as whitespaces at beginning or end
33    
34     my $text = $self->_s(" this is a text ");
35     $text = 'this is a text';
36    
37     =cut
38    
# Normalise whitespace in a string. Called as a method, so the text is the
# second element of @_ (the invocant is ignored).
#
# Runs of two or more whitespace characters collapse to a single space and
# leading/trailing whitespace is stripped.
#
# Returns the cleaned string, or nothing (undef in scalar context, empty
# list in list context) when the argument is undef or the empty string.
sub _s {
    my $text = $_[1];

    # Test definedness/length rather than truth, so that the perfectly
    # valid value "0" is not thrown away.
    return unless defined $text && length $text;

    $text =~ s/\s\s+/ /gs;
    $text =~ s/^\s+//;
    $text =~ s/\s+$//;
    return $text;
}
46    
47 dpavlin 2 package Search::Estraier::Document;
48    
49 dpavlin 9 use Carp qw/croak confess/;
50 dpavlin 7
51 dpavlin 15 use Search::Estraier;
52     our @ISA = qw/Search::Estraier/;
53    
54 dpavlin 2 =head1 Search::Estraier::Document
55    
56 dpavlin 14 This class implements Document which is collection of attributes
57     (key=value), vectors (also key value) display text and hidden text.
58    
59 dpavlin 2 =head2 new
60    
61 dpavlin 14 Create new document, empty or from draft.
62    
63 dpavlin 2 my $doc = new Search::Estraier::Document;
64 dpavlin 14 my $doc2 = new Search::Estraier::Document( $draft );
65 dpavlin 2
66     =cut
67    
# Construct a document object, optionally populated from a draft string.
#
# Draft layout, as parsed below:
#   name=value            one attribute per line
#   %VECTOR\tk\tv\tk\tv   keyword vector as tab separated key/value pairs
#   %...                  any other % line is skipped
#   <empty line>          ends the header; everything after it is text
#   text line             visible text (stored in dtexts)
#   \ttext line           hidden text (stored in htexts, tab removed)
#
# Returns the new blessed object; its id is -1 until it is registered.
sub new {
    my $class = shift;
    my $draft = shift;

    my $self = bless {}, $class;
    $self->{id} = -1;

    return $self unless $draft;

    my $in_text = 0;
    foreach my $line (split(/\n/, $draft)) {

        if ($in_text) {
            if ($line =~ /^\t/) {
                push @{ $self->{htexts} }, substr($line, 1);
            } else {
                push @{ $self->{dtexts} }, $line;
            }
            next;
        }

        if ($line =~ m/^%VECTOR\t(.+)$/) {
            my @fields = split(/\t/, $1);
            # Walk the list two at a time so only real keys become keys;
            # a trailing key without a value is dropped.
            for (my $i = 0; $i < $#fields; $i += 2) {
                $self->{kwords}->{ $fields[$i] } = $fields[$i + 1];
            }
        } elsif ($line =~ m/^%/) {
            # some other control line - not interpreted here
        } elsif ($line =~ m/^$/) {
            $in_text = 1;
        } elsif ($line =~ m/^([^=]+)=(.+)$/) {
            # Split on the FIRST '=' so values may contain '=' themselves
            # (e.g. URIs with a query string).
            $self->{attrs}->{ $1 } = $2;
        } else {
            warn "draft ignored: $line\n";
        }
    }

    return $self;
}
115    
116 dpavlin 4
117 dpavlin 2 =head2 add_attr
118    
119 dpavlin 6 Add an attribute.
120    
121 dpavlin 2 $doc->add_attr( name => 'value' );
122    
123 dpavlin 9 Delete attribute using
124 dpavlin 5
125     $doc->add_attr( name => undef );
126    
127 dpavlin 2 =cut
128    
# Set or remove document attributes from a list of name => value pairs.
# A pair whose value is undef removes that attribute; names and values
# are whitespace-normalised with _s() first. Always returns 1.
sub add_attr {
    my $self = shift;
    my $pairs = {@_};

    foreach my $raw_name (keys %{ $pairs }) {
        my $name  = $self->_s($raw_name);
        my $value = $pairs->{$raw_name};

        if (defined($value)) {
            $self->{attrs}->{ $name } = $self->_s($value);
            next;
        }

        delete( $self->{attrs}->{ $name } );
    }

    return 1;
}
143    
144 dpavlin 5
145     =head2 add_text
146    
147 dpavlin 6 Add a sentence of text.
148    
149 dpavlin 5 $doc->add_text('this is example text to display');
150    
151     =cut
152    
# Append one sentence of visible text (whitespace-normalised by _s) to the
# document. Does nothing when no text is passed.
sub add_text {
    my ($self, $text) = @_;
    return if (! defined($text));

    # _s() is deliberately called in list context: when it returns nothing,
    # nothing is pushed.
    return push @{ $self->{dtexts} }, $self->_s($text);
}
160    
161    
162     =head2 add_hidden_text
163    
164 dpavlin 6 Add a hidden sentence.
165    
166 dpavlin 5 $doc->add_hidden_text('this is example text just for search');
167    
168     =cut
169    
# Append one hidden sentence (whitespace-normalised by _s) to the document.
# Does nothing when no text is passed.
sub add_hidden_text {
    my ($self, $text) = @_;
    return if (! defined($text));

    # _s() is deliberately called in list context: when it returns nothing,
    # nothing is pushed.
    return push @{ $self->{htexts} }, $self->_s($text);
}
177    
178 dpavlin 6 =head2 id
179    
180     Get the ID number of document. If the object has never been registered, C<-1> is returned.
181    
182     print $doc->id;
183    
184     =cut
185    
# Accessor: the document ID stored in the object.
sub id {
    my ($self) = @_;
    return $self->{id};
}
190    
191 dpavlin 7 =head2 attr_names
192    
193 dpavlin 9 Returns array with attribute names from document object.
194 dpavlin 7
195     my @attrs = $doc->attr_names;
196    
197     =cut
198    
# Sorted list of the document's attribute names.
# Croaks unless called in list context.
sub attr_names {
    my ($self) = @_;
    wantarray or croak "attr_names return array, not scalar";
    my @names = keys %{ $self->{attrs} };
    return sort @names;
}
204    
205 dpavlin 8
206     =head2 attr
207    
208 dpavlin 9 Returns value of an attribute.
209 dpavlin 8
210     my $value = $doc->attr( 'attribute' );
211    
212     =cut
213    
# Value of the named attribute (undef when it is not set).
sub attr {
    my ($self, $name) = @_;
    my $attrs = $self->{'attrs'};
    return $attrs->{ $name };
}
220    
221 dpavlin 9
222     =head2 texts
223    
224     Returns array with text sentences.
225    
226     my @texts = $doc->texts;
227    
228     =cut
229    
# List of the visible text sentences, in the order they were added.
# Confesses unless called in list context.
sub texts {
    my $self = shift;
    confess "texts return array, not scalar" if (! wantarray);
    # A document without any text has no dtexts slot at all; fall back to
    # an empty list instead of dereferencing undef (fatal under strict).
    return @{ $self->{dtexts} || [] };
}
235    
236 dpavlin 12 =head2 cat_texts
237    
238     Return whole text as single scalar.
239    
240     my $text = $doc->cat_texts;
241    
242     =cut
243    
# All visible text sentences joined with single spaces into one scalar.
# A document without text yields the empty string.
sub cat_texts {
    my $self = shift;
    # dtexts does not exist until text is added; avoid dereferencing undef
    # (fatal under strict).
    return join(' ', @{ $self->{dtexts} || [] });
}
248    
249 dpavlin 5 =head2 dump_draft
250    
251 dpavlin 13 Dump draft data from document object.
252    
253 dpavlin 5 print $doc->dump_draft;
254    
255     =cut
256    
# Serialise the document into draft format:
#
#   name=value lines (sorted by name)
#   optional %VECTOR line with tab separated key/value pairs (sorted by key)
#   empty separator line
#   visible text, one sentence per line
#   hidden text, one sentence per line, each prefixed with a tab
#
# Returns the draft as a single string.
sub dump_draft {
    my $self = shift;
    my $draft = '';

    my $attrs = $self->{attrs} || {};
    foreach my $attr_name (sort keys %{ $attrs }) {
        $draft .= $attr_name . '=' . $attrs->{$attr_name} . "\n";
    }

    if ($self->{kwords}) {
        # Single '%': this is a plain string, not a printf format, and the
        # draft parser in new() looks for "%VECTOR".
        $draft .= '%VECTOR';
        foreach my $key (sort keys %{ $self->{kwords} }) {
            $draft .= "\t$key\t" . $self->{kwords}->{$key};
        }
        $draft .= "\n";
    }

    # empty line separates attributes from text
    $draft .= "\n";

    # Emit one line per sentence, so missing or empty lists add nothing
    # (no stray blank line, no lone tab line).
    $draft .= "$_\n"   foreach @{ $self->{dtexts} || [] };
    $draft .= "\t$_\n" foreach @{ $self->{htexts} || [] };

    return $draft;
}
280    
281 dpavlin 4 =head2 delete
282 dpavlin 2
283 dpavlin 4 Empty document object
284 dpavlin 2
285 dpavlin 4 $doc->delete;
286    
287 dpavlin 15 This function is an addition to the original Ruby API, and since it was included in C wrappers it's here as a
288     convenience. Document objects which go out of scope will be destroyed
289     automatically.
290    
291 dpavlin 4 =cut
292    
# Empty the document object: drop attributes, visible text, hidden text
# and keywords, and reset the ID to -1. Always returns 1.
sub delete {
    my $self = shift;

    # htexts (not "stexts") is the slot add_hidden_text() and new() fill.
    foreach my $data (qw/attrs dtexts htexts kwords/) {
        delete($self->{$data});
    }

    $self->{id} = -1;

    return 1;
}
304    
305    
306    
307 dpavlin 15 package Search::Estraier::Condition;
308 dpavlin 4
309 dpavlin 16 use Carp qw/confess croak/;
310    
311 dpavlin 15 use Search::Estraier;
312     our @ISA = qw/Search::Estraier/;
313 dpavlin 4
314 dpavlin 16 =head1 Search::Estraier::Condition
315    
316     =head2 new
317    
318     my $cond = new Search::Estraier::Condition;
319    
320     =cut
321    
# Construct a search condition with no result limit set (max is -1) and
# no options selected (options is 0).
sub new {
    my ($class) = @_;

    my $self = bless {
        max     => -1,
        options => 0,
    }, $class;

    return $self;
}
332    
333     =head2 set_phrase
334    
335     $cond->set_phrase('search phrase');
336    
337     =cut
338    
# Store the search phrase, whitespace-normalised by _s().
sub set_phrase {
    my ($self, $phrase) = @_;
    return $self->{phrase} = $self->_s($phrase);
}
343    
344     =head2 add_attr
345    
346     $cond->add_attr('@URI STRINC /~dpavlin/');
347    
348     =cut
349    
# Append one attribute search expression (whitespace-normalised by _s)
# to the condition. A false expression is ignored.
sub add_attr {
    my ($self, $attr) = @_;
    return unless $attr;
    return push @{ $self->{attrs} }, $self->_s($attr);
}
355    
356     =head2 set_order
357    
358     $cond->set_order('@mdate NUMD');
359    
360     =cut
361    
# Store the result ordering expression exactly as given.
sub set_order {
    my ($self, $order) = @_;
    return $self->{order} = $order;
}
366    
367     =head2 set_max
368    
369     $cond->set_max(42);
370    
371     =cut
372    
# Store the maximum number of results; croaks unless the value consists
# of digits only.
sub set_max {
    my ($self, $max) = @_;

    if ($max !~ m/^\d+$/) {
        croak "set_max needs number";
    }

    return $self->{max} = $max;
}
379    
380     =head2 set_options
381    
382     $cond->set_options( SURE => 1 );
383    
384     =cut
385    
# Search option flags by name. Each is a distinct bit, so several options
# can be combined in one number. Descriptions follow the Hyper Estraier
# search condition options.
my $options = {
    # check every N-gram key
    SURE => 1 << 0,
    # check N-gram keys skipping by one
    USUAL => 1 << 1,
    # check N-gram keys skipping by two
    FAST => 1 << 2,
    # check N-gram keys skipping by three
    AGITO => 1 << 3,
    # without TF-IDF tuning
    NOIDF => 1 << 4,
    # with the simplified phrase
    SIMPLE => 1 << 10,
};

# Switch on one search option, given by name (any further arguments are
# ignored). Calling it again with another name adds that option to those
# already set. Confesses on an unknown name; returns the combined numeric
# option value.
sub set_options {
    my $self = shift;
    my $option = shift;
    confess "unknown option" unless (defined $option && $options->{$option});
    # Bitwise OR accumulates flags; a logical "||=" would keep only the
    # first option ever set.
    $self->{options} |= $options->{$option};
}
407    
408 dpavlin 18 =head2 phrase
409    
410     Return search phrase.
411    
412     print $cond->phrase;
413    
414     =cut
415    
# Accessor: the stored search phrase.
sub phrase {
    my ($self) = @_;
    return $self->{phrase};
}
420    
421 dpavlin 19 =head2 order
422 dpavlin 18
423 dpavlin 19 Return search result order.
424    
425     print $cond->order;
426    
427     =cut
428    
# Accessor: the stored result ordering expression.
sub order {
    my ($self) = @_;
    return $self->{order};
}
433    
434     =head2 attrs
435    
436     Return search result attrs.
437    
438     my @cond_attrs = $cond->attrs;
439    
440     =cut
441    
# List of attribute search expressions added with add_attr(), in the
# order they were added (empty list when none were added).
sub attrs {
    my $self = shift;
    #croak "attrs return array, not scalar" if (! wantarray);
    # The attrs slot only exists after the first add_attr(); do not
    # dereference undef (fatal under strict).
    return @{ $self->{attrs} || [] };
}
447    
448     =head2 max
449    
450     Return maximum number of results.
451    
452     print $cond->max;
453    
454     C<-1> is returned for uninitialized value, C<0> is unlimited.
455    
456     =cut
457    
# Accessor: the stored maximum number of results.
sub max {
    my ($self) = @_;
    return $self->{max};
}
462    
463     =head2 options
464    
465     Return options for this condition.
466    
467     print $cond->options;
468    
469     Options are returned in numerical form.
470    
471     =cut
472    
# Accessor: the options of this condition in numeric form.
sub options {
    my ($self) = @_;
    return $self->{options};
}
477    
478    
479 dpavlin 20 package Search::Estraier::ResultDocument;
480    
481 dpavlin 24 use Carp qw/croak/;
482 dpavlin 20
483 dpavlin 24 #use Search::Estraier;
484     #our @ISA = qw/Search::Estraier/;
485 dpavlin 20
486     =head1 Search::Estraier::ResultDocument
487    
488     =head2 new
489    
490 dpavlin 23 my $rdoc = new Search::Estraier::ResultDocument(
491 dpavlin 20 uri => 'http://localhost/document/uri/42',
492     attrs => {
493     foo => 1,
494     bar => 2,
495     },
496     snippet => 'this is a text of snippet',
497     keywords => 'this\tare\tkeywords'
498     );
499    
500     =cut
501    
# Construct a result document from key => value pairs.
# uri, attrs, snippet and keywords must all be defined; croaks naming the
# first one that is missing.
sub new {
    my ($class, @args) = @_;
    my $self = bless {@args}, $class;

    for my $required (qw/uri attrs snippet keywords/) {
        next if defined($self->{$required});
        croak "missing $required for ResultDocument";
    }

    return $self;
}
513    
514 dpavlin 23 =head2 uri
515 dpavlin 20
516 dpavlin 23 Return URI of result document
517 dpavlin 20
518 dpavlin 23 print $rdoc->uri;
519    
520     =cut
521    
# Accessor: URI of the result document.
sub uri {
    my ($self) = @_;
    return $self->{uri};
}
526    
527    
528     =head2 attr_names
529    
530     Returns array with attribute names from result document object.
531    
532     my @attrs = $rdoc->attr_names;
533    
534     =cut
535    
# Sorted list of the result document's attribute names.
# Croaks unless called in list context.
sub attr_names {
    my ($self) = @_;
    wantarray or croak "attr_names return array, not scalar";
    my @names = keys %{ $self->{attrs} };
    return sort @names;
}
541    
542     =head2 attr
543    
544     Returns value of an attribute.
545    
546     my $value = $rdoc->attr( 'attribute' );
547    
548     =cut
549    
# Value of the named attribute; returns nothing when no (or a false)
# name is given.
sub attr {
    my ($self, $name) = @_;
    return unless $name;
    return $self->{attrs}->{ $name };
}
555    
556     =head2 snippet
557    
558     Return snippet from result document
559    
560     print $rdoc->snippet;
561    
562     =cut
563    
# Accessor: snippet text of the result document.
sub snippet {
    my ($self) = @_;
    return $self->{snippet};
}
568    
569     =head2 keywords
570    
571     Return keywords from result document
572    
573     print $rdoc->keywords;
574    
575     =cut
576    
# Accessor: keywords of the result document, as stored.
sub keywords {
    my ($self) = @_;
    return $self->{keywords};
}
581    
582    
583 dpavlin 25 package Search::Estraier::NodeResult;
584    
585     use Carp qw/croak/;
586    
587     #use Search::Estraier;
588     #our @ISA = qw/Search::Estraier/;
589    
590     =head1 Search::Estraier::NodeResult
591    
592     =head2 new
593    
594     my $res = new Search::Estraier::NodeResult(
595     docs => \@array_of_rdocs,
596     hints => \%hash_with_hints,
597     );
598    
599     =cut
600    
# Construct a node result from key => value pairs.
# Both docs (array reference of result documents) and hints (hash
# reference) must be defined; croaks naming the first one that is missing.
sub new {
    my $class = shift;
    my $self = {@_};
    bless($self, $class);

    foreach my $f (qw/docs hints/) {
        # Name the right class in the message (it used to say
        # ResultDocument, which sent people looking in the wrong place).
        croak "missing $f for NodeResult" unless defined($self->{$f});
    }

    return $self;
}
612    
613     =head2 doc_num
614    
615     Return number of documents
616    
617     print $res->doc_num;
618    
619     =cut
620    
# Number of documents in this result.
sub doc_num {
    my $self = shift;
    # Count the elements; $#{...} would be the last index (count - 1).
    return scalar @{ $self->{docs} || [] };
}
625    
626     =head2 get_doc
627    
628     Return single document
629    
630     my $doc = $res->get_doc( 42 );
631    
632     Returns undef if document doesn't exist.
633    
634     =cut
635    
# Return the result document at zero-based position $num, or undef when
# there is no document at that position. Croaks unless $num is a
# non-negative integer.
sub get_doc {
    my $self = shift;
    my $num = shift;
    croak "expect number as argument" unless (defined $num && $num =~ m/^\d+$/);

    my $docs = $self->{docs} || [];
    # Compare against the last valid index, not against the array
    # reference itself.
    return undef if ($num > $#{ $docs });
    return $docs->[$num];
}
643    
644     =head2 hint
645    
646     Return specific hint from results.
647    
648     print $rec->hint( 'VERSION' );
649    
650     Possible hints are: C<VERSION>, C<NODE>, C<HIT>, C<HINT#n>, C<DOCNUM>, C<WORDNUM>,
651     C<TIME>, C<LINK#n>, C<VIEW>.
652    
653     =cut
654    
# Value of the named hint; returns nothing when no (or a false) key is
# given.
sub hint {
    my ($self, $key) = @_;
    return unless $key;
    return $self->{hints}->{$key};
}
660    
661    
662 dpavlin 27 package Search::Estraier::Node;
663    
664 dpavlin 37 use Carp qw/carp croak/;
665 dpavlin 33 use URI;
666 dpavlin 36 use MIME::Base64;
667 dpavlin 33 use IO::Socket::INET;
668 dpavlin 29
669 dpavlin 27 =head1 Search::Estraier::Node
670    
671     =head2 new
672    
673     my $node = new Search::Estraier::Node;
674    
675     =cut
676    
# Construct a node object with its initial settings: proxy port, document
# and word counts, size and status all start at -1 / -1.0, timeout at 0,
# and the three width settings at 480 / 96 / 96.
sub new {
    my ($class) = @_;

    my %defaults = (
        pxport  => -1,
        timeout => 0,	# this used to be -1
        dnum    => -1,
        wnum    => -1,
        size    => -1.0,
        wwidth  => 480,
        hwidth  => 96,
        awidth  => 96,
        status  => -1,
    );

    return bless { %defaults }, $class;
}
694    
695 dpavlin 29 =head2 set_url
696    
697     Specify URL to node server
698    
699     $node->set_url('http://localhost:1978');
700    
701     =cut
702    
# Store the URL of the node server exactly as given.
sub set_url {
    my ($self, $url) = @_;
    return $self->{url} = $url;
}
707    
708     =head2 set_proxy
709    
710     Specify proxy server to connect to node server
711    
712     $node->set_proxy('proxy.example.com', 8080);
713    
714     =cut
715    
# Store proxy host and port; croaks (before storing anything) unless the
# port consists of digits only.
sub set_proxy {
    my ($self, $host, $port) = @_;

    if ($port !~ m/^\d+$/) {
        croak "proxy port must be number";
    }

    $self->{pxhost} = $host;
    return $self->{pxport} = $port;
}
723    
724 dpavlin 30 =head2 set_timeout
725    
726     Specify timeout of connection in seconds
727    
728     $node->set_timeout( 15 );
729    
730     =cut
731    
# Store the connection timeout in seconds; croaks unless the value
# consists of digits only.
sub set_timeout {
    my ($self, $sec) = @_;

    if ($sec !~ m/^\d+$/) {
        croak "timeout must be number";
    }

    return $self->{timeout} = $sec;
}
738    
739 dpavlin 31 =head2 set_auth
740    
741     Specify name and password for authentication to node server.
742    
743     $node->set_auth('clint','eastwood');
744    
745     =cut
746    
# Remember login and password as the Base64 encoded "login:password"
# token that shuttle_url() sends in the "Authorization: Basic" header.
sub set_auth {
    my $self = shift;
    my ($login,$passwd) = @_;
    # The empty second argument disables encode_base64's line ending and
    # 76 column wrapping; a newline inside the token would break the HTTP
    # header it is placed in.
    $self->{auth} = encode_base64( "$login:$passwd", '' );
}
752    
753 dpavlin 32 =head2 status
754    
755     Return status code of last request.
756    
757     print $res->status;
758    
759     C<-1> means connection failure.
760    
761     =cut
762    
# Accessor: the status code stored in the object.
sub status {
    my ($self) = @_;
    return $self->{status};
}
767    
768 dpavlin 33 =head2 shuttle_url
769 dpavlin 32
770 dpavlin 33 This is method which uses C<IO::Socket::INET> to communicate with Hyper Estraier node
771     master.
772 dpavlin 2
773 dpavlin 33 my $rv = $node->shuttle_url( $url, $content_type, \$req_body, \$resbody );
774 dpavlin 2
775 dpavlin 33 C<\$req_body> and C<\$resbody> are scalar references. If a request body is given
776     it is sent with a C<POST> request (otherwise C<GET> is used); if C<\$resbody> is given, the response body is appended to it.
777 dpavlin 2
778     =cut
779    
# Perform one HTTP/1.0 request against a node server (directly or through
# the configured proxy) over a plain TCP socket.
#
# Arguments:
#   $url          - absolute http:// URL to request
#   $content_type - value for the Content-Type header (skipped when undef)
#   $reqbody      - reference to the request body; when given the request
#                   is a POST, otherwise a GET (with the URL's query string)
#   $resbody      - optional reference to a scalar; the response body is
#                   appended to it
#
# Returns the HTTP status code of the response (also stored in
# $self->{status}), or -1 when the URL is unusable, the connection fails
# or the response does not start with a valid status line.
sub shuttle_url {
    my $self = shift;

    my ($url, $content_type, $reqbody, $resbody) = @_;

    my $uri = URI->new($url);
    if (
        !$uri || !$uri->scheme || $uri->scheme ne 'http' ||
        !$uri->host || !$uri->port || $uri->port < 1
    ) {
        carp "can't parse $url\n";
        return -1;
    }

    my ($host, $port, $query) = ($uri->host, $uri->port, $uri->path);

    if ($self->{pxhost}) {
        # Through a proxy the request line carries the absolute URL of the
        # target server, while the socket goes to the proxy. The path
        # already starts with '/'.
        $query = 'http://' . $uri->host . ':' . $uri->port . $query;
        ($host, $port) = ($self->{pxhost}, $self->{pxport});
    }

    $query .= '?' . $uri->query if ($uri->query && ! $reqbody);

    # The body is documented as a scalar reference; a plain string is
    # accepted as well.
    my $body;
    if ($reqbody) {
        $body = ref($reqbody) ? $$reqbody : $reqbody;
        $body = '' unless defined $body;
    }

    my $len = 0;
    {
        # Content-Length counts bytes, not characters
        use bytes;
        $len = length($body) if (defined $body);
    }

    my $headers = ($reqbody ? 'POST' : 'GET') . " $query HTTP/1.0\r\n";
    # method calls do not interpolate inside strings, so concatenate
    $headers .= 'Host: ' . $uri->host . ':' . $uri->port . "\r\n";
    $headers .= "Connection: close\r\n";
    $headers .= "User-Agent: Search-Estraier/$Search::Estraier::VERSION\r\n";
    $headers .= "Content-Type: $content_type\r\n" if (defined $content_type);
    if (defined $self->{auth}) {
        # Base64 tokens never contain whitespace; strip any that the
        # encoder added so it cannot end the header block early.
        (my $auth = $self->{auth}) =~ s/\s+//g;
        $headers .= "Authorization: Basic $auth\r\n";
    }
    $headers .= "Content-Length: $len\r\n";
    $headers .= "\r\n";

    my $sock = IO::Socket::INET->new(
        PeerAddr => $host,
        PeerPort => $port,
        Proto    => 'tcp',
        Timeout  => $self->{timeout} || 90,
    );

    if (! $sock) {
        carp "can't open socket to $host:$port";
        return -1;
    }

    if (! print $sock $headers) {
        carp "can't send headers to network:\n$headers\n";
        close($sock);
        return -1;
    }

    if (defined $body && ! print $sock $body) {
        carp "can't send request body to network:\n$body\n";
        close($sock);
        return -1;
    }

    # status line, e.g. "HTTP/1.0 200 OK"
    my $line = <$sock>;
    $line = '' unless defined $line;
    $line =~ s/\r?\n\z//;
    # split on runs of spaces (an "/ */" pattern also matches the empty
    # string and would split the line into single characters)
    my ($schema, $res_status) = split(/ +/, $line, 3);
    if (! defined $schema || $schema !~ /^HTTP/ || ! $res_status) {
        close($sock);
        return -1;
    }

    $self->{status} = $res_status;

    # Skip the rest of the headers. Lines end in CRLF, so strip both
    # characters before testing for the empty separator line.
    while (defined($line = <$sock>)) {
        $line =~ s/\r?\n\z//;
        last if ($line eq '');
    }

    # read body
    while (1) {
        my $got = read($sock, my $buf, 8192);
        last unless ($got);
        $$resbody .= $buf if ($resbody);
    }

    close($sock);

    return $res_status;
}
870    
871     ###
872    
873     =head1 EXPORT
874    
875     Nothing.
876    
877     =head1 SEE ALSO
878    
879     L<http://hyperestraier.sourceforge.net/>
880    
881     Hyper Estraier Ruby interface on which this module is based.
882    
883     =head1 AUTHOR
884    
885     Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
886    
887    
888     =head1 COPYRIGHT AND LICENSE
889    
890 dpavlin 15 Copyright (C) 2005-2006 by Dobrica Pavlinusic
891 dpavlin 2
892     This library is free software; you can redistribute it and/or modify
893     it under the GPL v2 or later.
894    
895     =cut
896    
897     1;

  ViewVC Help
Powered by ViewVC 1.1.26