/[webpac2]/trunk/lib/WebPAC/Output/Estraier.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Output/Estraier.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 301 - (hide annotations)
Mon Dec 19 21:26:04 2005 UTC (18 years, 4 months ago) by dpavlin
File size: 8412 byte(s)
 r322@athlon:  dpavlin | 2005-12-19 22:27:06 +0100
 make run.pl moderatly chatty (along with other modules), added command line options
 (try perldoc run.pl) new target index (to reindex all) and run (to index
 first 100 records of each database)

1 dpavlin 1 package WebPAC::Output::Estraier;
2    
3     use warnings;
4     use strict;
5    
6 dpavlin 74 use base qw/WebPAC::Common/;
7    
8     use HyperEstraier;
9     use Text::Iconv;
10     use Data::Dumper;
11 dpavlin 211 use LWP;
12 dpavlin 210 use URI::Escape;
13 dpavlin 74
14 dpavlin 1 =head1 NAME
15    
16 dpavlin 74 WebPAC::Output::Estraier - Create Hyper Estraier full text index
17 dpavlin 1
18     =head1 VERSION
19    
20 dpavlin 291 Version 0.07
21 dpavlin 1
22     =cut
23    
24 dpavlin 291 our $VERSION = '0.07';
25 dpavlin 1
26     =head1 SYNOPSIS
27    
28 dpavlin 74 Create full text index using Hyper Estraier index from data with
29     type C<search>.
30 dpavlin 1
31 dpavlin 74 =head1 FUNCTIONS
32 dpavlin 1
33 dpavlin 74 =head2 new
34 dpavlin 1
35 dpavlin 74 Connect to Hyper Estraier index using HTTP
36 dpavlin 1
37 dpavlin 74 my $est = new WebPAC::Output::Estraier(
38 dpavlin 210 masterurl => 'http://localhost:1978/',
39 dpavlin 74 user => 'admin',
40     passwd => 'admin',
41     database => 'demo',
42     encoding => 'iso-8859-2',
43     );
44 dpavlin 1
45 dpavlin 74 Options are:
46 dpavlin 1
47 dpavlin 74 =over 4
48 dpavlin 1
49 dpavlin 210 =item masterurl
50 dpavlin 1
51 dpavlin 74 URI to C<estmaster> node
52    
53     =item user
54    
55     C<estmaster> user with sufficient rights
56    
57     =item passwd
58    
59     password for user
60    
61     =item database
62    
63     name of database from which data comes
64    
65     =item encoding
66    
67     character encoding of C<data_structure> if it's differenet than C<ISO-8859-2>
68     (and it probably is). This encoding will be converted to C<UTF-8> for
69     Hyper Estraier.
70    
71     =back
72    
73     Name of database will be used to form URI of documents in index.
74    
75 dpavlin 1 =cut
76    
77 dpavlin 74 sub new {
78     my $class = shift;
79     my $self = {@_};
80     bless($self, $class);
81    
82     my $log = $self->_get_logger;
83    
84 dpavlin 211 #$log->debug("self: ", sub { Dumper($self) });
85 dpavlin 210
86     foreach my $p (qw/masterurl user passwd database/) {
87 dpavlin 74 $log->logdie("need $p") unless ($self->{$p});
88     }
89    
90 dpavlin 210 my $url = $self->{masterurl} . '/node/' . $self->{database};
91     $self->{url} = $url;
92 dpavlin 74
93 dpavlin 301 $log->debug("opening index $self->{url}");
94 dpavlin 74
95 dpavlin 255 my $nodes = $self->master( action => 'nodelist' );
96 dpavlin 210
97 dpavlin 211 $log->debug("nodes found: $nodes");
98    
99     if ($nodes !~ m/^$self->{database}\t/sm) {
100 dpavlin 301 $log->warn("creating index $url");
101 dpavlin 255 $self->master(
102 dpavlin 210 action => 'nodeadd',
103     name => $self->{database},
104     label => "WebPAC $self->{database}",
105     ) || $log->logdie("can't create Hyper Estraier node $self->{database}");
106     }
107    
108     $self->{'db'} = HyperEstraier::Node->new($self->{url});
109     $self->{'db'}->set_auth($self->{'user'}, $self->{passwd});
110    
111 dpavlin 74 my $encoding = $self->{'encoding'} || 'ISO-8859-2';
112 dpavlin 301 $log->info("using index $self->{url} with encoding $encoding");
113 dpavlin 74
114 dpavlin 75 $self->{'iconv'} = new Text::Iconv($encoding, 'UTF-8') or
115 dpavlin 85 $log->logdie("can't create conversion from $encoding to UTF-8");
116 dpavlin 74
117     $self ? return $self : return undef;
118 dpavlin 1 }
119    
120 dpavlin 75
121 dpavlin 74 =head2 add
122 dpavlin 1
123 dpavlin 74 Adds one entry to database.
124    
125     $est->add(
126     id => 42,
127     ds => $ds,
128     type => 'display',
129     text => 'optional text from which snippet is created',
130     );
131    
132     This function will create entries in index using following URI format:
133    
134 dpavlin 212 C<file:///type/database%20name/000>
135 dpavlin 74
136     Each tag in C<data_structure> with specified C<type> will create one
137     attribute and corresponding hidden text (used for search).
138    
139 dpavlin 1 =cut
140    
141 dpavlin 74 sub add {
142     my $self = shift;
143    
144     my $args = {@_};
145    
146     my $log = $self->_get_logger;
147    
148     my $database = $self->{'database'} || $log->logconfess('no database in $self');
149     $log->logconfess('need db in object') unless ($self->{'db'});
150    
151     foreach my $p (qw/id ds type/) {
152     $log->logdie("need $p") unless ($args->{$p});
153     }
154    
155     my $type = $args->{'type'};
156 dpavlin 213 my $id = $args->{'id'};
157 dpavlin 74
158 dpavlin 213 my $uri = "file:///$type/$database/$id";
159 dpavlin 74 $log->debug("creating $uri");
160    
161     my $doc = HyperEstraier::Document->new;
162 dpavlin 75 $doc->add_attr('@uri', $self->{'iconv'}->convert($uri) );
163 dpavlin 74
164     $log->debug("ds = ", sub { Dumper($args->{'ds'}) } );
165    
166     # filter all tags which have type defined
167     my @tags = grep {
168 dpavlin 113 ref($args->{'ds'}->{$_}) eq 'HASH' && defined( $args->{'ds'}->{$_}->{$type} )
169 dpavlin 74 } keys %{ $args->{'ds'} };
170    
171     $log->debug("tags = ", join(",", @tags));
172    
173     return unless (@tags);
174    
175     foreach my $tag (@tags) {
176    
177     my $vals = join(" ", @{ $args->{'ds'}->{$tag}->{$type} });
178    
179     $log->logconfess("no values for $tag/$type") unless ($vals);
180    
181 dpavlin 75 $vals = $self->{'iconv'}->convert( $vals ) or
182 dpavlin 85 $log->logdie("can't convert '$vals' to UTF-8");
183 dpavlin 75
184     $doc->add_attr( $tag, $vals );
185     $doc->add_hidden_text( $vals );
186 dpavlin 74 }
187    
188     my $text = $args->{'text'};
189 dpavlin 75 if ( $text ) {
190     $text = $self->{'iconv'}->convert( $text ) or
191 dpavlin 85 $log->logdie("can't convert '$text' to UTF-8");
192 dpavlin 75 $doc->add_text( $text );
193     }
194 dpavlin 74
195     $log->debug("adding ", sub { $doc->dump_draft } );
196 dpavlin 212 $self->{'db'}->put_doc($doc) || $log->logdie("can't add document $uri to node " . $self->{url} . " status: " . $self->{db}->status());
197 dpavlin 74
198     return 1;
199 dpavlin 1 }
200    
201 dpavlin 255 #
202     # REST parametars validation data
203     #
204 dpavlin 210
205 dpavlin 255 my $estraier_rest = {
206     master => {
207     userdel => [ qw/name/ ],
208     nodelist => [],
209     nodeadd => [ qw/name label/ ],
210     nodedel => [ qw/name/ ],
211     },
212     node => {
213     _set_link => [ qw/url label credit/ ],
214     },
215     };
216    
217     =head2 master
218    
219 dpavlin 210 Issue administrative commands to C<estmaster> process and receive response
220     as array of lines
221    
222 dpavlin 255 my $nodelist = $est->master( action => 'nodelist' );
223 dpavlin 210
224     =cut
225    
226 dpavlin 255 sub master {
227     my $self = shift;
228 dpavlin 210
229 dpavlin 255 my $args = {@_};
230     my $log = $self->_get_logger;
231 dpavlin 246
232 dpavlin 255 my $action = $args->{action} || $log->logconfess("no action specified");
233 dpavlin 246
234 dpavlin 255 $log->logdie("action '$action' isn't supported") unless ($estraier_rest->{master}->{$action});
235 dpavlin 246
236 dpavlin 255 $log->debug("master action: $action");
237    
238     return $self->estcall(
239     validate => 'master',
240     rest_url => $self->{masterurl} . '/master?action=' . $action ,
241     action => $action,
242 dpavlin 284 %{ $args },
243 dpavlin 255 );
244     }
245    
246     =head2 add_link
247    
248     $est->add_link(
249     from => 'ps',
250     to => 'webpac2',
251     credit => 10000,
252     );
253    
254 dpavlin 246 =cut
255    
256 dpavlin 255 sub add_link {
257 dpavlin 238 my $self = shift;
258    
259 dpavlin 255 my $args = {@_};
260     my $log = $self->_get_logger;
261 dpavlin 238
262 dpavlin 255 my @labels = $self->master( action => 'nodelist' );
263 dpavlin 238
264 dpavlin 255 $log->debug("got labels: ", join("|", @labels));
265 dpavlin 238
266 dpavlin 255 @labels = grep(/^$args->{to}/, @labels);
267    
268     my (undef,$label) = split(/\t/, shift @labels);
269    
270     if (! $label) {
271     $log->warn("can't find label for $args->{to}, skipping link creaton");
272     return;
273     }
274    
275     $log->debug("using label $label for $args->{to}");
276    
277     return $self->estcall(
278     validate => 'node',
279     action => '_set_link',
280     rest_url => $self->{masterurl} . '/node/' . $args->{from} . '/_set_link' ,
281     url => $self->{masterurl} . '/node/' . $args->{to},
282     label => $label,
283     credit => $args->{credit},
284     );
285 dpavlin 238 }
286    
287 dpavlin 255 =head2 estcall
288    
289     Workhourse which does actual calls to Hyper Estraier
290    
291     $self->estcall(
292     rest_url => '/master?action=' . $action,
293     validate => 'master',
294     # ...
295     );
296    
297     C<rest_url> is relative URL to C<estmaster> and C<validate> is entry into
298     internal hash which will check if all parametars are available before
299     calling function.
300    
301     =cut
302    
303     sub estcall {
304 dpavlin 210 my $self = shift;
305     my $args = {@_};
306     my $log = $self->_get_logger;
307    
308 dpavlin 255 $log->debug("estcall: ",Dumper($args));
309 dpavlin 210
310 dpavlin 255 foreach my $p (qw/rest_url validate action/) {
311     $log->die("ectcall needs $p parametar") unless ($args->{$p});
312     }
313 dpavlin 210
314 dpavlin 255 my $url = $args->{rest_url};
315     my $del = '?';
316     $del = '&' if ($url =~ m#\?#);
317 dpavlin 210
318 dpavlin 255 my $url_args;
319 dpavlin 210
320 dpavlin 255 foreach my $arg (@{ $estraier_rest->{ $args->{validate} }->{ $args->{action} } }) {
321     $log->logdie("missing parametar $arg for action $args->{action}") unless ($args->{$arg});
322     $url_args .= $del . $arg . '=' . uri_escape( $args->{$arg} );
323     $del = '&';
324 dpavlin 210 }
325    
326 dpavlin 284 $url .= $url_args if ($url_args);
327 dpavlin 255
328 dpavlin 210 $log->debug("calling $url");
329    
330 dpavlin 238 my $res = $self->est_ua()->get($url);
331 dpavlin 210
332 dpavlin 211 if ($res->is_success) {
333     #$log->debug( $res->content );
334     return split(/\n/, $res->content) if wantarray;
335 dpavlin 255 return $res->content || 0E0;
336 dpavlin 211 } else {
337     $log->warn("unable to call $url: " . $res->status_line);
338 dpavlin 210 return;
339     }
340    
341     }
342    
343 dpavlin 255 =head2 est_ua
344    
345     This is helper function to create C<LWP::UserAgent> object with Super User
346     priviledges
347    
348     my $ua = $self->est_ua( user => 'admin', passwd => 'admin' );
349    
350     =cut
351    
352 dpavlin 256
353    
354 dpavlin 255 sub est_ua {
355     my $self = shift;
356    
357     return $self->{_master_ua} if ($self->{_master_ua});
358    
359 dpavlin 256 {
360     package AdminUserAgent;
361     use base qw/LWP::UserAgent/;
362     sub new {
363     my $self = LWP::UserAgent::new(@_);
364     $self->agent("webpac/$VERSION");
365     $self;
366     }
367     sub get_basic_credentials {
368     my($self, $realm, $uri) = @_;
369     return ($self->{user}, $self->{passwd});
370     }
371     sub set_basic_credentials {
372     my ($self, $user, $passwd) = @_;
373     $self->{user} = $user;
374     $self->{passwd} = $passwd;
375     }
376     };
377    
378     $self->{_master_ua} = AdminUserAgent->new( ) || sub {
379 dpavlin 255 my $log = $self->_get_logger;
380     $log->logdie("can't create LWP::UserAgent: $!");
381     };
382    
383 dpavlin 256 $self->{_master_ua}->set_basic_credentials($self->{user}, $self->{passwd});
384 dpavlin 255
385     return $self->{_master_ua};
386     }
387    
388 dpavlin 1 =head1 AUTHOR
389    
390     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
391    
392     =head1 COPYRIGHT & LICENSE
393    
394     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
395    
396     This program is free software; you can redistribute it and/or modify it
397     under the same terms as Perl itself.
398    
399     =cut
400    
401     1; # End of WebPAC::Output::Estraier

  ViewVC Help
Powered by ViewVC 1.1.26