/[Search-Estraier]/trunk/Estraier.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/Estraier.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 93 - (show annotations)
Sat Jan 28 16:43:45 2006 UTC (18 years, 2 months ago) by dpavlin
File size: 29562 byte(s)
Hyper Estraier 1.0.6 doesn't like attributes with no value (undef in perl),
so we skip them in dump_draft
1 package Search::Estraier;
2
3 use 5.008;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.04_1';
8
9 =head1 NAME
10
11 Search::Estraier - pure perl module to use Hyper Estraier search engine
12
13 =head1 SYNOPSIS
14
15 =head2 Simple indexer
16
17 use Search::Estraier;
18
19 # create and configure node
20 my $node = new Search::Estraier::Node;
21 $node->set_url("http://localhost:1978/node/test");
22 $node->set_auth("admin","admin");
23
24 # create document
25 my $doc = new Search::Estraier::Document;
26
27 # add attributes
28 $doc->add_attr('@uri', "http://estraier.gov/example.txt");
29 $doc->add_attr('@title', "Over the Rainbow");
30
31 # add body text to document
32 $doc->add_text("Somewhere over the rainbow. Way up high.");
33 $doc->add_text("There's a land that I heard of once in a lullaby.");
34
35 die "error: ", $node->status,"\n" unless ($node->put_doc($doc));
36
37 =head2 Simple searcher
38
39 use Search::Estraier;
40
41 # create and configure node
42 my $node = new Search::Estraier::Node;
43 $node->set_url("http://localhost:1978/node/test");
44 $node->set_auth("admin","admin");
45
46 # create condition
47 my $cond = new Search::Estraier::Condition;
48
49 # set search phrase
50 $cond->set_phrase("rainbow AND lullaby");
51
52 my $nres = $node->search($cond, 0);
53 if (defined($nres)) {
54 # for each document in results
55 for my $i ( 0 ... $nres->doc_num - 1 ) {
56 # get result document
57 my $rdoc = $nres->get_doc($i);
58 # display attribte
59 print "URI: ", $rdoc->attr('@uri'),"\n";
60 print "Title: ", $rdoc->attr('@title'),"\n";
61 print $rdoc->snippet,"\n";
62 }
63 } else {
64 die "error: ", $node->status,"\n";
65 }
66
67 =head1 DESCRIPTION
68
69 This module is implementation of node API of Hyper Estraier. Since it's
70 perl-only module with dependencies only on standard perl modules, it will
71 run on all platforms on which perl runs. It doesn't require compilation
72 or Hyper Estraier development files on target machine.
73
74 It is implemented as multiple packages which closly resamble Ruby
75 implementation. It also includes methods to manage nodes.
76
77 There are few examples in C<scripts> directory of this distribution.
78
79 =cut
80
81 =head1 Inheritable common methods
82
83 This methods should really move somewhere else.
84
85 =head2 _s
86
87 Remove multiple whitespaces from string, as well as whitespaces at beginning or end
88
89 my $text = $self->_s(" this is a text ");
90 $text = 'this is a text';
91
92 =cut
93
94 sub _s {
95 my $text = $_[1] || return;
96 $text =~ s/\s\s+/ /gs;
97 $text =~ s/^\s+//;
98 $text =~ s/\s+$//;
99 return $text;
100 }
101
102 package Search::Estraier::Document;
103
104 use Carp qw/croak confess/;
105
106 use Search::Estraier;
107 our @ISA = qw/Search::Estraier/;
108
109 =head1 Search::Estraier::Document
110
111 This class implements Document which is collection of attributes
112 (key=value), vectors (also key value) display text and hidden text.
113
114
115 =head2 new
116
117 Create new document, empty or from draft.
118
119 my $doc = new Search::HyperEstraier::Document;
120 my $doc2 = new Search::HyperEstraier::Document( $draft );
121
122 =cut
123
124 sub new {
125 my $class = shift;
126 my $self = {};
127 bless($self, $class);
128
129 $self->{id} = -1;
130
131 my $draft = shift;
132
133 if ($draft) {
134 my $in_text = 0;
135 foreach my $line (split(/\n/, $draft)) {
136
137 if ($in_text) {
138 if ($line =~ /^\t/) {
139 push @{ $self->{htexts} }, substr($line, 1);
140 } else {
141 push @{ $self->{dtexts} }, $line;
142 }
143 next;
144 }
145
146 if ($line =~ m/^%VECTOR\t(.+)$/) {
147 my @fields = split(/\t/, $1);
148 for my $i ( 0 .. ($#fields - 1) ) {
149 $self->{kwords}->{ $fields[ $i ] } = $fields[ $i + 1 ];
150 $i++;
151 }
152 next;
153 } elsif ($line =~ m/^%/) {
154 # What is this? comment?
155 #warn "$line\n";
156 next;
157 } elsif ($line =~ m/^$/) {
158 $in_text = 1;
159 next;
160 } elsif ($line =~ m/^(.+)=(.*)$/) {
161 $self->{attrs}->{ $1 } = $2;
162 next;
163 }
164
165 warn "draft ignored: '$line'\n";
166 }
167 }
168
169 $self ? return $self : return undef;
170 }
171
172
173 =head2 add_attr
174
175 Add an attribute.
176
177 $doc->add_attr( name => 'value' );
178
179 Delete attribute using
180
181 $doc->add_attr( name => undef );
182
183 =cut
184
185 sub add_attr {
186 my $self = shift;
187 my $attrs = {@_};
188
189 while (my ($name, $value) = each %{ $attrs }) {
190 if (! defined($value)) {
191 delete( $self->{attrs}->{ $self->_s($name) } );
192 } else {
193 $self->{attrs}->{ $self->_s($name) } = $self->_s($value);
194 }
195 }
196
197 return 1;
198 }
199
200
201 =head2 add_text
202
203 Add a sentence of text.
204
205 $doc->add_text('this is example text to display');
206
207 =cut
208
209 sub add_text {
210 my $self = shift;
211 my $text = shift;
212 return unless defined($text);
213
214 push @{ $self->{dtexts} }, $self->_s($text);
215 }
216
217
218 =head2 add_hidden_text
219
220 Add a hidden sentence.
221
222 $doc->add_hidden_text('this is example text just for search');
223
224 =cut
225
226 sub add_hidden_text {
227 my $self = shift;
228 my $text = shift;
229 return unless defined($text);
230
231 push @{ $self->{htexts} }, $self->_s($text);
232 }
233
234
235 =head2 id
236
237 Get the ID number of document. If the object has never been registred, C<-1> is returned.
238
239 print $doc->id;
240
241 =cut
242
243 sub id {
244 my $self = shift;
245 return $self->{id};
246 }
247
248
249 =head2 attr_names
250
251 Returns array with attribute names from document object.
252
253 my @attrs = $doc->attr_names;
254
255 =cut
256
257 sub attr_names {
258 my $self = shift;
259 return unless ($self->{attrs});
260 #croak "attr_names return array, not scalar" if (! wantarray);
261 return sort keys %{ $self->{attrs} };
262 }
263
264
265 =head2 attr
266
267 Returns value of an attribute.
268
269 my $value = $doc->attr( 'attribute' );
270
271 =cut
272
273 sub attr {
274 my $self = shift;
275 my $name = shift;
276 return unless (defined($name) && $self->{attrs});
277 return $self->{attrs}->{ $name };
278 }
279
280
281 =head2 texts
282
283 Returns array with text sentences.
284
285 my @texts = $doc->texts;
286
287 =cut
288
289 sub texts {
290 my $self = shift;
291 #confess "texts return array, not scalar" if (! wantarray);
292 return @{ $self->{dtexts} } if ($self->{dtexts});
293 }
294
295
296 =head2 cat_texts
297
298 Return whole text as single scalar.
299
300 my $text = $doc->cat_texts;
301
302 =cut
303
304 sub cat_texts {
305 my $self = shift;
306 return join(' ',@{ $self->{dtexts} }) if ($self->{dtexts});
307 }
308
309
310 =head2 dump_draft
311
312 Dump draft data from document object.
313
314 print $doc->dump_draft;
315
316 =cut
317
318 sub dump_draft {
319 my $self = shift;
320 my $draft;
321
322 foreach my $attr_name (sort keys %{ $self->{attrs} }) {
323 next unless(my $v = $self->{attrs}->{$attr_name});
324 $draft .= $attr_name . '=' . $v . "\n";
325 }
326
327 if ($self->{kwords}) {
328 $draft .= '%%VECTOR';
329 while (my ($key, $value) = each %{ $self->{kwords} }) {
330 $draft .= "\t$key\t$value";
331 }
332 $draft .= "\n";
333 }
334
335 $draft .= "\n";
336
337 $draft .= join("\n", @{ $self->{dtexts} }) . "\n" if ($self->{dtexts});
338 $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n" if ($self->{htexts});
339
340 return $draft;
341 }
342
343
344 =head2 delete
345
346 Empty document object
347
348 $doc->delete;
349
350 This function is addition to original Ruby API, and since it was included in C wrappers it's here as a
351 convinience. Document objects which go out of scope will be destroyed
352 automatically.
353
354 =cut
355
356 sub delete {
357 my $self = shift;
358
359 foreach my $data (qw/attrs dtexts stexts kwords/) {
360 delete($self->{$data});
361 }
362
363 $self->{id} = -1;
364
365 return 1;
366 }
367
368
369
370 package Search::Estraier::Condition;
371
372 use Carp qw/confess croak/;
373
374 use Search::Estraier;
375 our @ISA = qw/Search::Estraier/;
376
377 =head1 Search::Estraier::Condition
378
379 =head2 new
380
381 my $cond = new Search::HyperEstraier::Condition;
382
383 =cut
384
385 sub new {
386 my $class = shift;
387 my $self = {};
388 bless($self, $class);
389
390 $self->{max} = -1;
391 $self->{options} = 0;
392
393 $self ? return $self : return undef;
394 }
395
396
397 =head2 set_phrase
398
399 $cond->set_phrase('search phrase');
400
401 =cut
402
403 sub set_phrase {
404 my $self = shift;
405 $self->{phrase} = $self->_s( shift );
406 }
407
408
409 =head2 add_attr
410
411 $cond->add_attr('@URI STRINC /~dpavlin/');
412
413 =cut
414
415 sub add_attr {
416 my $self = shift;
417 my $attr = shift || return;
418 push @{ $self->{attrs} }, $self->_s( $attr );
419 }
420
421
422 =head2 set_order
423
424 $cond->set_order('@mdate NUMD');
425
426 =cut
427
428 sub set_order {
429 my $self = shift;
430 $self->{order} = shift;
431 }
432
433
434 =head2 set_max
435
436 $cond->set_max(42);
437
438 =cut
439
440 sub set_max {
441 my $self = shift;
442 my $max = shift;
443 croak "set_max needs number, not '$max'" unless ($max =~ m/^\d+$/);
444 $self->{max} = $max;
445 }
446
447
448 =head2 set_options
449
450 $cond->set_options( SURE => 1 );
451
452 =cut
453
454 my $options = {
455 # check N-gram keys skipping by three
456 SURE => 1 << 0,
457 # check N-gram keys skipping by two
458 USUAL => 1 << 1,
459 # without TF-IDF tuning
460 FAST => 1 << 2,
461 # with the simplified phrase
462 AGITO => 1 << 3,
463 # check every N-gram key
464 NOIDF => 1 << 4,
465 # check N-gram keys skipping by one
466 SIMPLE => 1 << 10,
467 };
468
469 sub set_options {
470 my $self = shift;
471 my $option = shift;
472 confess "unknown option" unless ($options->{$option});
473 $self->{options} ||= $options->{$option};
474 }
475
476
477 =head2 phrase
478
479 Return search phrase.
480
481 print $cond->phrase;
482
483 =cut
484
485 sub phrase {
486 my $self = shift;
487 return $self->{phrase};
488 }
489
490
491 =head2 order
492
493 Return search result order.
494
495 print $cond->order;
496
497 =cut
498
499 sub order {
500 my $self = shift;
501 return $self->{order};
502 }
503
504
505 =head2 attrs
506
507 Return search result attrs.
508
509 my @cond_attrs = $cond->attrs;
510
511 =cut
512
513 sub attrs {
514 my $self = shift;
515 #croak "attrs return array, not scalar" if (! wantarray);
516 return @{ $self->{attrs} } if ($self->{attrs});
517 }
518
519
520 =head2 max
521
522 Return maximum number of results.
523
524 print $cond->max;
525
526 C<-1> is returned for unitialized value, C<0> is unlimited.
527
528 =cut
529
530 sub max {
531 my $self = shift;
532 return $self->{max};
533 }
534
535
536 =head2 options
537
538 Return options for this condition.
539
540 print $cond->options;
541
542 Options are returned in numerical form.
543
544 =cut
545
546 sub options {
547 my $self = shift;
548 return $self->{options};
549 }
550
551
552 package Search::Estraier::ResultDocument;
553
554 use Carp qw/croak/;
555
556 #use Search::Estraier;
557 #our @ISA = qw/Search::Estraier/;
558
559 =head1 Search::Estraier::ResultDocument
560
561 =head2 new
562
563 my $rdoc = new Search::HyperEstraier::ResultDocument(
564 uri => 'http://localhost/document/uri/42',
565 attrs => {
566 foo => 1,
567 bar => 2,
568 },
569 snippet => 'this is a text of snippet'
570 keywords => 'this\tare\tkeywords'
571 );
572
573 =cut
574
575 sub new {
576 my $class = shift;
577 my $self = {@_};
578 bless($self, $class);
579
580 croak "missing uri for ResultDocument" unless defined($self->{uri});
581
582 $self ? return $self : return undef;
583 }
584
585
586 =head2 uri
587
588 Return URI of result document
589
590 print $rdoc->uri;
591
592 =cut
593
594 sub uri {
595 my $self = shift;
596 return $self->{uri};
597 }
598
599
600 =head2 attr_names
601
602 Returns array with attribute names from result document object.
603
604 my @attrs = $rdoc->attr_names;
605
606 =cut
607
608 sub attr_names {
609 my $self = shift;
610 croak "attr_names return array, not scalar" if (! wantarray);
611 return sort keys %{ $self->{attrs} };
612 }
613
614
615 =head2 attr
616
617 Returns value of an attribute.
618
619 my $value = $rdoc->attr( 'attribute' );
620
621 =cut
622
623 sub attr {
624 my $self = shift;
625 my $name = shift || return;
626 return $self->{attrs}->{ $name };
627 }
628
629
630 =head2 snippet
631
632 Return snippet from result document
633
634 print $rdoc->snippet;
635
636 =cut
637
638 sub snippet {
639 my $self = shift;
640 return $self->{snippet};
641 }
642
643
644 =head2 keywords
645
646 Return keywords from result document
647
648 print $rdoc->keywords;
649
650 =cut
651
652 sub keywords {
653 my $self = shift;
654 return $self->{keywords};
655 }
656
657
658 package Search::Estraier::NodeResult;
659
660 use Carp qw/croak/;
661
662 #use Search::Estraier;
663 #our @ISA = qw/Search::Estraier/;
664
665 =head1 Search::Estraier::NodeResult
666
667 =head2 new
668
669 my $res = new Search::HyperEstraier::NodeResult(
670 docs => @array_of_rdocs,
671 hits => %hash_with_hints,
672 );
673
674 =cut
675
676 sub new {
677 my $class = shift;
678 my $self = {@_};
679 bless($self, $class);
680
681 foreach my $f (qw/docs hints/) {
682 croak "missing $f for ResultDocument" unless defined($self->{$f});
683 }
684
685 $self ? return $self : return undef;
686 }
687
688
689 =head2 doc_num
690
691 Return number of documents
692
693 print $res->doc_num;
694
695 =cut
696
697 sub doc_num {
698 my $self = shift;
699 return $#{$self->{docs}} + 1;
700 }
701
702
703 =head2 get_doc
704
705 Return single document
706
707 my $doc = $res->get_doc( 42 );
708
709 Returns undef if document doesn't exist.
710
711 =cut
712
713 sub get_doc {
714 my $self = shift;
715 my $num = shift;
716 croak "expect number as argument, not '$num'" unless ($num =~ m/^\d+$/);
717 return undef if ($num < 0 || $num > $self->{docs});
718 return $self->{docs}->[$num];
719 }
720
721
722 =head2 hint
723
724 Return specific hint from results.
725
726 print $rec->hint( 'VERSION' );
727
728 Possible hints are: C<VERSION>, C<NODE>, C<HIT>, C<HINT#n>, C<DOCNUM>, C<WORDNUM>,
729 C<TIME>, C<LINK#n>, C<VIEW>.
730
731 =cut
732
733 sub hint {
734 my $self = shift;
735 my $key = shift || return;
736 return $self->{hints}->{$key};
737 }
738
739 =head2 hints
740
741 More perlish version of C<hint>. This one returns hash.
742
743 my %hints = $rec->hints;
744
745 =cut
746
747 sub hints {
748 my $self = shift;
749 return $self->{hints};
750 }
751
752 package Search::Estraier::Node;
753
754 use Carp qw/carp croak confess/;
755 use URI;
756 use MIME::Base64;
757 use IO::Socket::INET;
758 use URI::Escape qw/uri_escape/;
759
760 =head1 Search::Estraier::Node
761
762 =head2 new
763
764 my $node = new Search::HyperEstraier::Node;
765
766 or optionally with C<url> as parametar
767
768 my $node = new Search::HyperEstraier::Node( 'http://localhost:1978/node/test' );
769
770 or in more verbose form
771
772 my $node = new Search::HyperEstraier::Node(
773 url => 'http://localhost:1978/node/test',
774 debug => 1,
775 croak_on_error => 1
776 );
777
778 with following arguments:
779
780 =over 4
781
782 =item url
783
784 URL to node
785
786 =item debug
787
788 dumps a B<lot> of debugging output
789
790 =item croak_on_error
791
792 very helpful during development. It will croak on all errors instead of
793 silently returning C<-1> (which is convention of Hyper Estraier API in other
794 languages).
795
796 =back
797
798 =cut
799
800 sub new {
801 my $class = shift;
802 my $self = {
803 pxport => -1,
804 timeout => 0, # this used to be -1
805 dnum => -1,
806 wnum => -1,
807 size => -1.0,
808 wwidth => 480,
809 hwidth => 96,
810 awidth => 96,
811 status => -1,
812 };
813 bless($self, $class);
814
815 if ($#_ == 0) {
816 $self->{url} = shift;
817 } else {
818 my $args = {@_};
819
820 %$self = ( %$self, @_ );
821
822 warn "## Node debug on\n" if ($self->{debug});
823 }
824
825 $self ? return $self : return undef;
826 }
827
828
829 =head2 set_url
830
831 Specify URL to node server
832
833 $node->set_url('http://localhost:1978');
834
835 =cut
836
837 sub set_url {
838 my $self = shift;
839 $self->{url} = shift;
840 }
841
842
843 =head2 set_proxy
844
845 Specify proxy server to connect to node server
846
847 $node->set_proxy('proxy.example.com', 8080);
848
849 =cut
850
851 sub set_proxy {
852 my $self = shift;
853 my ($host,$port) = @_;
854 croak "proxy port must be number, not '$port'" unless ($port =~ m/^\d+$/);
855 $self->{pxhost} = $host;
856 $self->{pxport} = $port;
857 }
858
859
860 =head2 set_timeout
861
862 Specify timeout of connection in seconds
863
864 $node->set_timeout( 15 );
865
866 =cut
867
868 sub set_timeout {
869 my $self = shift;
870 my $sec = shift;
871 croak "timeout must be number, not '$sec'" unless ($sec =~ m/^\d+$/);
872 $self->{timeout} = $sec;
873 }
874
875
876 =head2 set_auth
877
878 Specify name and password for authentication to node server.
879
880 $node->set_auth('clint','eastwood');
881
882 =cut
883
884 sub set_auth {
885 my $self = shift;
886 my ($login,$passwd) = @_;
887 my $basic_auth = encode_base64( "$login:$passwd" );
888 chomp($basic_auth);
889 $self->{auth} = $basic_auth;
890 }
891
892
893 =head2 status
894
895 Return status code of last request.
896
897 print $node->status;
898
899 C<-1> means connection failure.
900
901 =cut
902
903 sub status {
904 my $self = shift;
905 return $self->{status};
906 }
907
908
909 =head2 put_doc
910
911 Add a document
912
913 $node->put_doc( $document_draft ) or die "can't add document";
914
915 Return true on success or false on failture.
916
917 =cut
918
919 sub put_doc {
920 my $self = shift;
921 my $doc = shift || return;
922 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
923 $self->shuttle_url( $self->{url} . '/put_doc',
924 'text/x-estraier-draft',
925 $doc->dump_draft,
926 undef
927 ) == 200;
928 }
929
930
931 =head2 out_doc
932
933 Remove a document
934
935 $node->out_doc( document_id ) or "can't remove document";
936
937 Return true on success or false on failture.
938
939 =cut
940
941 sub out_doc {
942 my $self = shift;
943 my $id = shift || return;
944 return unless ($self->{url});
945 croak "id must be number, not '$id'" unless ($id =~ m/^\d+$/);
946 $self->shuttle_url( $self->{url} . '/out_doc',
947 'application/x-www-form-urlencoded',
948 "id=$id",
949 undef
950 ) == 200;
951 }
952
953
954 =head2 out_doc_by_uri
955
956 Remove a registrated document using it's uri
957
958 $node->out_doc_by_uri( 'file:///document/uri/42' ) or "can't remove document";
959
960 Return true on success or false on failture.
961
962 =cut
963
964 sub out_doc_by_uri {
965 my $self = shift;
966 my $uri = shift || return;
967 return unless ($self->{url});
968 $self->shuttle_url( $self->{url} . '/out_doc',
969 'application/x-www-form-urlencoded',
970 "uri=" . uri_escape($uri),
971 undef
972 ) == 200;
973 }
974
975
976 =head2 edit_doc
977
978 Edit attributes of a document
979
980 $node->edit_doc( $document_draft ) or die "can't edit document";
981
982 Return true on success or false on failture.
983
984 =cut
985
986 sub edit_doc {
987 my $self = shift;
988 my $doc = shift || return;
989 return unless ($self->{url} && $doc->isa('Search::Estraier::Document'));
990 $self->shuttle_url( $self->{url} . '/edit_doc',
991 'text/x-estraier-draft',
992 $doc->dump_draft,
993 undef
994 ) == 200;
995 }
996
997
998 =head2 get_doc
999
1000 Retreive document
1001
1002 my $doc = $node->get_doc( document_id ) or die "can't get document";
1003
1004 Return true on success or false on failture.
1005
1006 =cut
1007
1008 sub get_doc {
1009 my $self = shift;
1010 my $id = shift || return;
1011 return $self->_fetch_doc( id => $id );
1012 }
1013
1014
1015 =head2 get_doc_by_uri
1016
1017 Retreive document
1018
1019 my $doc = $node->get_doc_by_uri( 'file:///document/uri/42' ) or die "can't get document";
1020
1021 Return true on success or false on failture.
1022
1023 =cut
1024
1025 sub get_doc_by_uri {
1026 my $self = shift;
1027 my $uri = shift || return;
1028 return $self->_fetch_doc( uri => $uri );
1029 }
1030
1031
1032 =head2 get_doc_attr
1033
1034 Retrieve the value of an atribute from object
1035
1036 my $val = $node->get_doc_attr( document_id, 'attribute_name' ) or
1037 die "can't get document attribute";
1038
1039 =cut
1040
1041 sub get_doc_attr {
1042 my $self = shift;
1043 my ($id,$name) = @_;
1044 return unless ($id && $name);
1045 return $self->_fetch_doc( id => $id, attr => $name );
1046 }
1047
1048
1049 =head2 get_doc_attr_by_uri
1050
1051 Retrieve the value of an atribute from object
1052
1053 my $val = $node->get_doc_attr_by_uri( document_id, 'attribute_name' ) or
1054 die "can't get document attribute";
1055
1056 =cut
1057
1058 sub get_doc_attr_by_uri {
1059 my $self = shift;
1060 my ($uri,$name) = @_;
1061 return unless ($uri && $name);
1062 return $self->_fetch_doc( uri => $uri, attr => $name );
1063 }
1064
1065
1066 =head2 etch_doc
1067
1068 Exctract document keywords
1069
1070 my $keywords = $node->etch_doc( document_id ) or die "can't etch document";
1071
1072 =cut
1073
1074 sub etch_doc {
1075 my $self = shift;
1076 my $id = shift || return;
1077 return $self->_fetch_doc( id => $id, etch => 1 );
1078 }
1079
1080 =head2 etch_doc_by_uri
1081
1082 Retreive document
1083
1084 my $keywords = $node->etch_doc_by_uri( 'file:///document/uri/42' ) or die "can't etch document";
1085
1086 Return true on success or false on failture.
1087
1088 =cut
1089
1090 sub etch_doc_by_uri {
1091 my $self = shift;
1092 my $uri = shift || return;
1093 return $self->_fetch_doc( uri => $uri, etch => 1 );
1094 }
1095
1096
1097 =head2 uri_to_id
1098
1099 Get ID of document specified by URI
1100
1101 my $id = $node->uri_to_id( 'file:///document/uri/42' );
1102
1103 =cut
1104
1105 sub uri_to_id {
1106 my $self = shift;
1107 my $uri = shift || return;
1108 return $self->_fetch_doc( uri => $uri, path => '/uri_to_id', chomp_resbody => 1 );
1109 }
1110
1111
1112 =head2 _fetch_doc
1113
1114 Private function used for implementing of C<get_doc>, C<get_doc_by_uri>,
1115 C<etch_doc>, C<etch_doc_by_uri>.
1116
1117 # this will decode received draft into Search::Estraier::Document object
1118 my $doc = $node->_fetch_doc( id => 42 );
1119 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42' );
1120
1121 # to extract keywords, add etch
1122 my $doc = $node->_fetch_doc( id => 42, etch => 1 );
1123 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', etch => 1 );
1124
1125 # to get document attrubute add attr
1126 my $doc = $node->_fetch_doc( id => 42, attr => '@mdate' );
1127 my $doc = $node->_fetch_doc( uri => 'file:///document/uri/42', attr => '@mdate' );
1128
1129 # more general form which allows implementation of
1130 # uri_to_id
1131 my $id = $node->_fetch_doc(
1132 uri => 'file:///document/uri/42',
1133 path => '/uri_to_id',
1134 chomp_resbody => 1
1135 );
1136
1137 =cut
1138
1139 sub _fetch_doc {
1140 my $self = shift;
1141 my $a = {@_};
1142 return unless ( ($a->{id} || $a->{uri}) && $self->{url} );
1143
1144 my ($arg, $resbody);
1145
1146 my $path = $a->{path} || '/get_doc';
1147 $path = '/etch_doc' if ($a->{etch});
1148
1149 if ($a->{id}) {
1150 croak "id must be numberm not '$a->{id}'" unless ($a->{id} =~ m/^\d+$/);
1151 $arg = 'id=' . $a->{id};
1152 } elsif ($a->{uri}) {
1153 $arg = 'uri=' . uri_escape($a->{uri});
1154 } else {
1155 confess "unhandled argument. Need id or uri.";
1156 }
1157
1158 if ($a->{attr}) {
1159 $path = '/get_doc_attr';
1160 $arg .= '&attr=' . uri_escape($a->{attr});
1161 $a->{chomp_resbody} = 1;
1162 }
1163
1164 my $rv = $self->shuttle_url( $self->{url} . $path,
1165 'application/x-www-form-urlencoded',
1166 $arg,
1167 \$resbody,
1168 );
1169
1170 return if ($rv != 200);
1171
1172 if ($a->{etch}) {
1173 $self->{kwords} = {};
1174 return +{} unless ($resbody);
1175 foreach my $l (split(/\n/, $resbody)) {
1176 my ($k,$v) = split(/\t/, $l, 2);
1177 $self->{kwords}->{$k} = $v if ($v);
1178 }
1179 return $self->{kwords};
1180 } elsif ($a->{chomp_resbody}) {
1181 return unless (defined($resbody));
1182 chomp($resbody);
1183 return $resbody;
1184 } else {
1185 return new Search::Estraier::Document($resbody);
1186 }
1187 }
1188
1189
1190 =head2 name
1191
1192 my $node_name = $node->name;
1193
1194 =cut
1195
1196 sub name {
1197 my $self = shift;
1198 $self->_set_info unless ($self->{name});
1199 return $self->{name};
1200 }
1201
1202
1203 =head2 label
1204
1205 my $node_label = $node->label;
1206
1207 =cut
1208
1209 sub label {
1210 my $self = shift;
1211 $self->_set_info unless ($self->{label});
1212 return $self->{label};
1213 }
1214
1215
1216 =head2 doc_num
1217
1218 my $documents_in_node = $node->doc_num;
1219
1220 =cut
1221
1222 sub doc_num {
1223 my $self = shift;
1224 $self->_set_info if ($self->{dnum} < 0);
1225 return $self->{dnum};
1226 }
1227
1228
1229 =head2 word_num
1230
1231 my $words_in_node = $node->word_num;
1232
1233 =cut
1234
1235 sub word_num {
1236 my $self = shift;
1237 $self->_set_info if ($self->{wnum} < 0);
1238 return $self->{wnum};
1239 }
1240
1241
1242 =head2 size
1243
1244 my $node_size = $node->size;
1245
1246 =cut
1247
1248 sub size {
1249 my $self = shift;
1250 $self->_set_info if ($self->{size} < 0);
1251 return $self->{size};
1252 }
1253
1254
1255 =head2 search
1256
1257 Search documents which match condition
1258
1259 my $nres = $node->search( $cond, $depth );
1260
1261 C<$cond> is C<Search::Estraier::Condition> object, while <$depth> specifies
1262 depth for meta search.
1263
1264 Function results C<Search::Estraier::NodeResult> object.
1265
1266 =cut
1267
1268 sub search {
1269 my $self = shift;
1270 my ($cond, $depth) = @_;
1271 return unless ($cond && defined($depth) && $self->{url});
1272 croak "cond mush be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1273 croak "depth needs number, not '$depth'" unless ($depth =~ m/^\d+$/);
1274
1275 my $resbody;
1276
1277 my $rv = $self->shuttle_url( $self->{url} . '/search',
1278 'application/x-www-form-urlencoded',
1279 $self->cond_to_query( $cond, $depth ),
1280 \$resbody,
1281 );
1282 return if ($rv != 200);
1283
1284 my (@docs, $hints);
1285
1286 my @lines = split(/\n/, $resbody);
1287 return unless (@lines);
1288
1289 my $border = $lines[0];
1290 my $isend = 0;
1291 my $lnum = 1;
1292
1293 while ( $lnum <= $#lines ) {
1294 my $line = $lines[$lnum];
1295 $lnum++;
1296
1297 #warn "## $line\n";
1298 if ($line && $line =~ m/^\Q$border\E(:END)*$/) {
1299 $isend = $1;
1300 last;
1301 }
1302
1303 if ($line =~ /\t/) {
1304 my ($k,$v) = split(/\t/, $line, 2);
1305 $hints->{$k} = $v;
1306 }
1307 }
1308
1309 my $snum = $lnum;
1310
1311 while( ! $isend && $lnum <= $#lines ) {
1312 my $line = $lines[$lnum];
1313 #warn "# $lnum: $line\n";
1314 $lnum++;
1315
1316 if ($line && $line =~ m/^\Q$border\E/) {
1317 if ($lnum > $snum) {
1318 my $rdattrs;
1319 my $rdvector;
1320 my $rdsnippet;
1321
1322 my $rlnum = $snum;
1323 while ($rlnum < $lnum - 1 ) {
1324 #my $rdline = $self->_s($lines[$rlnum]);
1325 my $rdline = $lines[$rlnum];
1326 $rlnum++;
1327 last unless ($rdline);
1328 if ($rdline =~ /^%/) {
1329 $rdvector = $1 if ($rdline =~ /^%VECTOR\t(.+)$/);
1330 } elsif($rdline =~ /=/) {
1331 $rdattrs->{$1} = $2 if ($rdline =~ /^(.+)=(.+)$/);
1332 } else {
1333 confess "invalid format of response";
1334 }
1335 }
1336 while($rlnum < $lnum - 1) {
1337 my $rdline = $lines[$rlnum];
1338 $rlnum++;
1339 $rdsnippet .= "$rdline\n";
1340 }
1341 #warn Dumper($rdvector, $rdattrs, $rdsnippet);
1342 if (my $rduri = $rdattrs->{'@uri'}) {
1343 push @docs, new Search::Estraier::ResultDocument(
1344 uri => $rduri,
1345 attrs => $rdattrs,
1346 snippet => $rdsnippet,
1347 keywords => $rdvector,
1348 );
1349 }
1350 }
1351 $snum = $lnum;
1352 #warn "### $line\n";
1353 $isend = 1 if ($line =~ /:END$/);
1354 }
1355
1356 }
1357
1358 if (! $isend) {
1359 warn "received result doesn't have :END\n$resbody";
1360 return;
1361 }
1362
1363 #warn Dumper(\@docs, $hints);
1364
1365 return new Search::Estraier::NodeResult( docs => \@docs, hints => $hints );
1366 }
1367
1368
1369 =head2 cond_to_query
1370
1371 Return URI encoded string generated from Search::Estraier::Condition
1372
1373 my $args = $node->cond_to_query( $cond, $depth );
1374
1375 =cut
1376
1377 sub cond_to_query {
1378 my $self = shift;
1379
1380 my $cond = shift || return;
1381 croak "condition must be Search::Estraier::Condition, not '$cond->isa'" unless ($cond->isa('Search::Estraier::Condition'));
1382 my $depth = shift;
1383
1384 my @args;
1385
1386 if (my $phrase = $cond->phrase) {
1387 push @args, 'phrase=' . uri_escape($phrase);
1388 }
1389
1390 if (my @attrs = $cond->attrs) {
1391 for my $i ( 0 .. $#attrs ) {
1392 push @args,'attr' . ($i+1) . '=' . uri_escape( $attrs[$i] ) if ($attrs[$i]);
1393 }
1394 }
1395
1396 if (my $order = $cond->order) {
1397 push @args, 'order=' . uri_escape($order);
1398 }
1399
1400 if (my $max = $cond->max) {
1401 push @args, 'max=' . $max;
1402 } else {
1403 push @args, 'max=' . (1 << 30);
1404 }
1405
1406 if (my $options = $cond->options) {
1407 push @args, 'options=' . $options;
1408 }
1409
1410 push @args, 'depth=' . $depth if ($depth);
1411 push @args, 'wwidth=' . $self->{wwidth};
1412 push @args, 'hwidth=' . $self->{hwidth};
1413 push @args, 'awidth=' . $self->{awidth};
1414
1415 return join('&', @args);
1416 }
1417
1418
1419 =head2 shuttle_url
1420
1421 This is method which uses C<LWP::UserAgent> to communicate with Hyper Estraier node
1422 master.
1423
1424 my $rv = shuttle_url( $url, $content_type, $req_body, \$resbody );
1425
1426 C<$resheads> and C<$resbody> booleans controll if response headers and/or response
1427 body will be saved within object.
1428
1429 =cut
1430
1431 use LWP::UserAgent;
1432
1433 sub shuttle_url {
1434 my $self = shift;
1435
1436 my ($url, $content_type, $reqbody, $resbody) = @_;
1437
1438 $self->{status} = -1;
1439
1440 warn "## $url\n" if ($self->{debug});
1441
1442 $url = new URI($url);
1443 if (
1444 !$url || !$url->scheme || !$url->scheme eq 'http' ||
1445 !$url->host || !$url->port || $url->port < 1
1446 ) {
1447 carp "can't parse $url\n";
1448 return -1;
1449 }
1450
1451 my $ua = LWP::UserAgent->new;
1452 $ua->agent( "Search-Estraier/$Search::Estraier::VERSION" );
1453
1454 my $req;
1455 if ($reqbody) {
1456 $req = HTTP::Request->new(POST => $url);
1457 } else {
1458 $req = HTTP::Request->new(GET => $url);
1459 }
1460
1461 $req->headers->header( 'Host' => $url->host . ":" . $url->port );
1462 $req->headers->header( 'Connection', 'close' );
1463 $req->headers->header( 'Authorization', 'Basic ' . $self->{auth} ) if ($self->{auth});
1464 $req->content_type( $content_type );
1465
1466 warn $req->headers->as_string,"\n" if ($self->{debug});
1467
1468 if ($reqbody) {
1469 warn "$reqbody\n" if ($self->{debug});
1470 $req->content( $reqbody );
1471 }
1472
1473 my $res = $ua->request($req) || croak "can't make request to $url: $!";
1474
1475 warn "## response status: ",$res->status_line,"\n" if ($self->{debug});
1476
1477 ($self->{status}, $self->{status_message}) = split(/\s+/, $res->status_line, 2);
1478
1479 if (! $res->is_success) {
1480 if ($self->{croak_on_error}) {
1481 croak("can't get $url: ",$res->status_line);
1482 } else {
1483 return -1;
1484 }
1485 }
1486
1487 $$resbody .= $res->content;
1488
1489 warn "## response body:\n$$resbody\n" if ($resbody && $self->{debug});
1490
1491 return $self->{status};
1492 }
1493
1494
1495 =head2 set_snippet_width
1496
1497 Set width of snippets in results
1498
1499 $node->set_snippet_width( $wwidth, $hwidth, $awidth );
1500
1501 C<$wwidth> specifies whole width of snippet. It's C<480> by default. If it's C<0> snippet
1502 is not sent with results. If it is negative, whole document text is sent instead of snippet.
1503
1504 C<$hwidth> specified width of strings from beginning of string. Default
1505 value is C<96>. Negative or zero value keep previous value.
1506
1507 C<$awidth> specifies width of strings around each highlighted word. It's C<96> by default.
1508 If negative of zero value is provided previous value is kept unchanged.
1509
1510 =cut
1511
1512 sub set_snippet_width {
1513 my $self = shift;
1514
1515 my ($wwidth, $hwidth, $awidth) = @_;
1516 $self->{wwidth} = $wwidth;
1517 $self->{hwidth} = $hwidth if ($hwidth >= 0);
1518 $self->{awidth} = $awidth if ($awidth >= 0);
1519 }
1520
1521
1522 =head2 set_user
1523
1524 Manage users of node
1525
1526 $node->set_user( 'name', $mode );
1527
1528 C<$mode> can be one of:
1529
1530 =over 4
1531
1532 =item 0
1533
1534 delete account
1535
1536 =item 1
1537
1538 set administrative right for user
1539
1540 =item 2
1541
1542 set user account as guest
1543
1544 =back
1545
1546 Return true on success, otherwise false.
1547
1548 =cut
1549
1550 sub set_user {
1551 my $self = shift;
1552 my ($name, $mode) = @_;
1553
1554 return unless ($self->{url});
1555 croak "mode must be number, not '$mode'" unless ($mode =~ m/^\d+$/);
1556
1557 $self->shuttle_url( $self->{url} . '/_set_user',
1558 'text/plain',
1559 'name=' . uri_escape($name) . '&mode=' . $mode,
1560 undef
1561 ) == 200;
1562 }
1563
1564
1565 =head2 set_link
1566
1567 Manage node links
1568
1569 $node->set_link('http://localhost:1978/node/another', 'another node label', $credit);
1570
1571 If C<$credit> is negative, link is removed.
1572
1573 =cut
1574
1575 sub set_link {
1576 my $self = shift;
1577 my ($url, $label, $credit) = @_;
1578
1579 return unless ($self->{url});
1580 croak "mode credit be number, not '$credit'" unless ($credit =~ m/^\d+$/);
1581
1582 my $reqbody = 'url=' . uri_escape($url) . '&label=' . uri_escape($label);
1583 $reqbody .= '&credit=' . $credit if ($credit > 0);
1584
1585 $self->shuttle_url( $self->{url} . '/_set_link',
1586 'application/x-www-form-urlencoded',
1587 $reqbody,
1588 undef
1589 ) == 200;
1590 }
1591
1592
1593 =head1 PRIVATE METHODS
1594
1595 You could call those directly, but you don't have to. I hope.
1596
1597 =head2 _set_info
1598
1599 Set information for node
1600
1601 $node->_set_info;
1602
1603 =cut
1604
1605 sub _set_info {
1606 my $self = shift;
1607
1608 $self->{status} = -1;
1609 return unless ($self->{url});
1610
1611 my $resbody;
1612 my $rv = $self->shuttle_url( $self->{url} . '/inform',
1613 'text/plain',
1614 undef,
1615 \$resbody,
1616 );
1617
1618 return if ($rv != 200 || !$resbody);
1619
1620 # it seems that response can have multiple line endings
1621 $resbody =~ s/[\r\n]+$//;
1622
1623 ( $self->{name}, $self->{label}, $self->{dnum}, $self->{wnum}, $self->{size} ) =
1624 split(/\t/, $resbody, 5);
1625
1626 }
1627
1628 ###
1629
1630 =head1 EXPORT
1631
1632 Nothing.
1633
1634 =head1 SEE ALSO
1635
1636 L<http://hyperestraier.sourceforge.net/>
1637
1638 Hyper Estraier Ruby interface on which this module is based.
1639
1640 =head1 AUTHOR
1641
1642 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
1643
1644
1645 =head1 COPYRIGHT AND LICENSE
1646
1647 Copyright (C) 2005-2006 by Dobrica Pavlinusic
1648
1649 This library is free software; you can redistribute it and/or modify
1650 it under the GPL v2 or later.
1651
1652 =cut
1653
1654 1;

  ViewVC Help
Powered by ViewVC 1.1.26