/[webpac2]/trunk/lib/WebPAC/Output/EstraierNative.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Output/EstraierNative.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 887 - (show annotations)
Mon Sep 3 15:26:46 2007 UTC (16 years, 7 months ago) by dpavlin
File size: 6239 byte(s)
 r1322@llin:  dpavlin | 2007-09-03 16:44:01 +0200
 - replace Data::Dumper usage with Data::Dump
 - rewrite WebPAC::Store to use Class::Accessor

1 package WebPAC::Output::EstraierNative;
2
3 use warnings;
4 use strict;
5
6 use base qw/WebPAC::Common/;
7
8 use Estraier;
9 use Encode qw/from_to/;
10 use Data::Dump;
11 use LWP;
12 use URI::Escape;
13 use List::Util qw/first/;
14 use File::Path;
15
16 $Estraier::DEBUG = 1;
17
18
19 =head1 NAME
20
21 WebPAC::Output::EstraierNative - Create Hyper Estraier full text index using native bindings
22
23 =head1 VERSION
24
25 Version 0.2
26
27 =cut
28
29 our $VERSION = '0.2';
30
31 =head1 SYNOPSIS
32
33 Create full text index using Hyper Estraier index from data with
34 type C<search>.
35
36 =head1 FUNCTIONS
37
38 =head2 new
39
40 Connect to Hyper Estraier index using HTTP
41
42 my $est = new WebPAC::Output::Estraier(
43 path => 'casket/',
44 database => 'demo',
45 label => 'node label',
46 encoding => 'iso-8859-2',
47 clean => 1,
48 );
49
50 Options are:
51
52 =over 4
53
54 =item path
55
56 full or relative path to Hyper Estraier database
57
58 =item database
59
60 name of database from which data comes
61
62 =item label
63
64 label for node (optional)
65
66 =item encoding
67
68 character encoding of C<data_structure> if it's differenet than C<ISO-8859-2>
69 (and it probably is). This encoding will be converted to C<UTF-8> for
70 Hyper Estraier.
71
72 =back
73
74 Name of database will be used to form URI of documents in index.
75
76 =cut
77
78 sub new {
79 my $class = shift;
80 my $self = {@_};
81 bless($self, $class);
82
83 my $log = $self->_get_logger;
84
85 #$log->debug("self: ", sub { dump($self) });
86
87 foreach my $p (qw/path database/) {
88 $log->logdie("need $p") unless ($self->{$p});
89 }
90
91 $self->{encoding} ||= 'ISO-8859-2';
92
93 $self->{label} ||= "WebPAC $self->{database}";
94
95 my $path = 'casket';
96 $path =~ s!/+$!!;
97
98 $path .= '/' . $self->{database};
99
100 $self->{path} = $path;
101
102 $path .= '.tmp';
103 if (-e $path) {
104 rmtree($path) || $log->logdie("can't remove old temporary directory $path: $!");
105 }
106 mkpath($path) || $log->logdie("can't create new temporary directory $path: $!");
107
108 my $db = new Database();
109 unless($db->open($path, Database::DBWRITER | Database::DBCREAT)) {
110 $log->logdie("can't open $path: ", $db->err_msg($db->error()) );
111 }
112
113 $self->{db} = $db;
114
115 $log->info("using ", $self->{clean} ? "new " : "", "index $self->{path} '$self->{label}' with encoding $self->{encoding}");
116
117 $self ? return $self : return undef;
118 }
119
120
121 =head2 add
122
123 Adds one entry to database.
124
125 $est->add(
126 id => 42,
127 ds => $ds,
128 type => 'display',
129 text => 'optional text from which snippet is created',
130 );
131
132 This function will create entries in index using following URI format:
133
134 C<file:///type/database%20name/000>
135
136 Each tag in C<data_structure> with specified C<type> will create one
137 attribute and corresponding hidden text (used for search).
138
139 =cut
140
141 sub add {
142 my $self = shift;
143
144 my $args = {@_};
145
146 my $log = $self->_get_logger;
147
148 my $database = $self->{'database'} || $log->logconfess('no database in $self');
149 $log->logconfess('need db in object') unless ($self->{'db'});
150
151 foreach my $p (qw/id ds type/) {
152 $log->logdie("need $p") unless ($args->{$p});
153 }
154
155 my $type = $args->{'type'};
156 my $id = $args->{'id'};
157
158 my $uri = "file:///$type/$database/$id";
159 $log->debug("creating $uri");
160
161 my $doc = new Document();
162 $doc->add_attr('@uri', $self->convert($uri) );
163
164 # store type and database name
165 $doc->add_attr('_database', $database );
166 $doc->add_hidden_text('_database:' . $database);
167 $doc->add_attr('_type', $type );
168
169 $log->debug("ds = ", sub { dump($args->{'ds'}) } );
170
171 # filter all tags which have type defined
172 my @tags = grep {
173 ref($args->{'ds'}->{$_}) eq 'HASH' && defined( $args->{'ds'}->{$_}->{$type} )
174 } keys %{ $args->{'ds'} };
175
176 $log->debug("tags = ", join(",", @tags));
177
178 return unless (@tags);
179
180 foreach my $tag (@tags) {
181
182 $log->debug("$tag :: $type == ",dump( $args->{'ds'}->{$tag}->{$type} ) );
183
184 my $vals = join(" ", @{ $args->{'ds'}->{$tag}->{$type} });
185
186 next if (! $vals);
187
188 $vals = join(" ") if (ref($vals) eq 'ARRAY');
189
190 $vals = $self->convert( $vals ) or
191 $log->logdie("can't convert '$vals' to UTF-8");
192
193 $doc->add_attr( $tag, $vals );
194 $doc->add_hidden_text( $vals );
195 }
196
197 my $text = $args->{'text'};
198 if ( $text ) {
199 $text = $self->convert( $text ) or
200 $log->logdie("can't convert '$text' to UTF-8");
201 $doc->add_text( $text );
202 }
203
204 $log->debug("adding ", sub { $doc->dump_draft } );
205 $self->{'db'}->put_doc($doc, Database::PDCLEAN) || $log->warn("can't add document $uri with draft " . $doc->dump_draft . " to node " . $self->{path} . " status: " . $self->{db}->error());
206
207 return 1;
208 }
209
210 =head2 add_link
211
212 $est->add_link(
213 from => 'ps',
214 to => 'webpac2',
215 credit => 10000,
216 );
217
218 =cut
219
220 sub add_link {
221 my $self = shift;
222
223 my $args = {@_};
224 my $log = $self->_get_logger;
225
226 $log->warn("add_link is not implemented");
227 return;
228
229 foreach my $p (qw/from to credit/) {
230 $log->logdie("need $p") unless ($args->{$p});
231 }
232
233 my $node = first { $_->{name} eq $args->{to} } $self->master( action => 'nodelist' );
234
235 if (! $node) {
236 $log->warn("can't find node $args->{to}, skipping link creaton");
237 return;
238 }
239
240 my $label = $node->{label};
241
242 if (! $label) {
243 $log->warn("can't find label for $args->{to}, skipping link creaton");
244 return;
245 }
246
247 $log->debug("using label $label for $args->{to}");
248
249 return $self->{db}->set_link(
250 $self->{masterurl} . '/node/' . $args->{to},
251 $label,
252 $args->{credit},
253 );
254 }
255
256
257 =head2 finish
258
259 Close index and rename of to final path
260
261 $est->finish;
262
263 =cut
264
265 sub finish {
266 my $self = shift;
267
268 my $log = $self->_get_logger;
269 $log->info("closing Hyper Estraier index make it current...");
270
271 $self->{db}->close || $log->logdie("can't close index");
272
273 my $path = $self->{path} || $log->logdie("no path?");
274
275 if (-e $path) {
276 $log->warn("removing old $path");
277 rmtree($path) || $log->logdie("can't remove old temporary directory $path: $!");
278 }
279
280 rename $path . '.tmp', $path || $log->logdie("can't rename ${path}.tmp -> $path: $!");
281
282 }
283
284
285 =head2 convert
286
287 my $utf8_string = $self->convert('string in codepage');
288
289 =cut
290
291 sub convert {
292 my $self = shift;
293
294 my $text = shift || return;
295 from_to($text, $self->{encoding}, 'UTF-8');
296 return $text;
297 }
298
299 =head1 AUTHOR
300
301 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
302
303 =head1 COPYRIGHT & LICENSE
304
305 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
306
307 This program is free software; you can redistribute it and/or modify it
308 under the same terms as Perl itself.
309
310 =cut
311
312 1; # End of WebPAC::Output::Estraier

  ViewVC Help
Powered by ViewVC 1.1.26