/[webpac2]/trunk/lib/WebPAC/Output/KinoSearch.pm
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Output/KinoSearch.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 437 - (show annotations)
Sun Apr 30 19:06:09 2006 UTC (18 years ago) by dpavlin
File size: 4648 byte(s)
 r533@llin:  dpavlin | 2006-04-30 21:06:54 +0200
 fixed warnings

1 package WebPAC::Output::KinoSearch;
2
3 use warnings;
4 use strict;
5
6 use base qw/WebPAC::Common/;
7
8 use KinoSearch::InvIndexer;
9 use KinoSearch::Analysis::PolyAnalyzer;
10 use Encode qw/from_to/;
11 use Data::Dumper;
12
13 =head1 NAME
14
15 WebPAC::Output::KinoSearch - Create KinoSearch full text index
16
17 =head1 VERSION
18
19 Version 0.01
20
21 =cut
22
23 our $VERSION = '0.01';
24
25 =head1 SYNOPSIS
26
27 Create full text index using KinoSearch index from data with
28 type C<search>.
29
30 =head1 FUNCTIONS
31
32 =head2 new
33
34 Open KinoSearch index
35
36 my $est = WebPAC::Output::KinoSearch->new(
37 index_path => '/path/to/invindex',
38 fields => [ qw/name of all fields used/ ],
39 database => 'demo',
40 label => 'node label',
41 encoding => 'iso-8859-2',
42 clean => 1,
43 );
44
45 Options are:
46
47 =over 4
48
49 =item index_path
50
51 path to KinoSearch index to use
52
53 =item fields
54
55 name of all fields used in this index
56
57 =item database
58
59 name of database from which data comes
60
61 =item label
62
63 label for node (optional)
64
65 =item encoding
66
67 character encoding of C<data_structure> if it's different than C<ISO-8859-2>
68 (and it probably is). This encoding will be converted to C<UTF-8> for
69 index.
70
71 =back
72
73 =cut
74
# Constructor: validates the options, opens (or creates) the KinoSearch
# inverted index and declares every field listed in C<fields>.
#
# Named options (all stored in the object hash):
#   index_path - required, passed to KinoSearch::InvIndexer as C<invindex>
#   fields     - required, ARRAY ref of field names to declare in the index
#   database   - required, database name (used by add() to build URIs)
#   encoding   - optional, defaults to 'ISO-8859-2'
#   clean      - optional, passed to KinoSearch::InvIndexer as C<create>
#
# Dies through the logger (logdie) when a required option is missing or
# when C<fields> is not an ARRAY reference.  Returns the new object.
sub new {
    my $class = shift;
    my $self  = {@_};
    bless($self, $class);

    my $log = $self->_get_logger;

    #$log->debug("self: ", sub { Dumper($self) });

    foreach my $p (qw/index_path fields database/) {
        $log->logdie("need $p") unless ($self->{$p});
    }

    $log->logdie("fields is not ARRAY") unless (ref($self->{fields}) eq 'ARRAY');

    $self->{encoding} ||= 'ISO-8859-2';

    $log->info("using index $self->{index_path} with encoding $self->{encoding}");

    my $analyzer = KinoSearch::Analysis::PolyAnalyzer->new( language => 'en' );

    $self->{invindex} = KinoSearch::InvIndexer->new(
        invindex => $self->{index_path},
        create   => $self->{clean},
        analyzer => $analyzer,
    );

    # every field is both stored and indexed; term vectors are disabled
    foreach my $f (@{ $self->{fields} }) {
        $self->{invindex}->spec_field(
            name       => $f,
#           boost      => 10,
            stored     => 1,
            indexed    => 1,
            vectorized => 0,
        );
    }

    # $self is a blessed reference and therefore always true, so the old
    # "$self ? return $self : return undef" could never take its else branch.
    return $self;
}
114
115
116 =head2 add
117
118 Adds one entry to database.
119
120 $est->add(
121 id => 42,
122 ds => $ds,
123 type => 'display',
124 text => 'optional text from which snippet is created',
125 );
126
127 This function will create entries in index using following URI format:
128
129 C<file:///type/database%20name/000>
130
131 Each tag in C<data_structure> with specified C<type> will create one
132 attribute and corresponding hidden text (used for search).
133
134 =cut
135
# Helper for add(): convert $v to UTF-8 (via $self->convert) and store it
# in field $n of document $doc.  False values are skipped silently; a
# failure inside set_value is logged as a warning instead of aborting.
#
# This is the ONLY place where values are converted, so callers must pass
# text that is still in the source encoding.
sub add_value {
    my ($self, $log, $doc, $n, $v) = @_;

    return unless ($v);

    eval { $doc->set_value($n, $self->convert($v)) };
    $log->warn("can't insert: $n = $v") if ($@);

    return;
}

# Add one record to the index.
#
# Named arguments:
#   id   - required, record identifier (last component of the URI)
#   ds   - required, HASH ref: tag => { type => [ values ... ] }
#   type - required, which key of each tag's hash supplies the values
#   text - optional, stored in field 'bodytext'
#
# Dies through the logger when a required argument, the database name or
# the index handle is missing, or when new_doc() returns false.
#
# Returns 1 after the document has been handed to add_doc().  Returns
# nothing (and adds no document) when no tag in C<ds> has data for C<type>.
sub add {
    my $self = shift;

    my $args = {@_};

    my $log = $self->_get_logger;

    my $database = $self->{'database'} || $log->logconfess('no database in $self');
    $log->logconfess('need invindex in object') unless ($self->{'invindex'});

    foreach my $p (qw/id ds type/) {
        $log->logdie("need $p") unless ($args->{$p});
    }

    my $type = $args->{'type'};
    my $id   = $args->{'id'};
    my $ds   = $args->{'ds'};

    my $uri = "file:///$type/$database/$id";
    $log->debug("creating $uri");

    my $doc = $self->{invindex}->new_doc( $uri ) || $log->logdie("can't create new_doc( $uri )");

    add_value($self, $log, $doc, 'uri', $uri);

    $log->debug("ds = ", sub { Dumper($ds) } );

    # filter all tags which have type defined
    my @tags = grep {
        ref($ds->{$_}) eq 'HASH' && defined( $ds->{$_}->{$type} )
    } keys %{ $ds };

    $log->debug("tags = ", join(",", @tags));

    return unless (@tags);

    foreach my $tag (@tags) {

        my $vals = join(" ", @{ $ds->{$tag}->{$type} });

        next if (! $vals);

        # Do NOT convert here: add_value() already converts to UTF-8, and
        # converting twice double-encodes every non-ASCII character.
        add_value($self, $log, $doc, $tag, $vals);
    }

    if (my $text = $args->{'text'}) {
        add_value($self, $log, $doc, 'bodytext', $text);
    }

    #$log->debug("adding ", sub { $doc->dump_draft } );
    $self->{invindex}->add_doc($doc) || $log->warn("can't add document $uri");

    return 1;
}
199
200 =head2 finish
201
202 Close index
203
204 $index->finish;
205
206 =cut
207
# Log that the index is being finalized, then call finish() on the
# underlying indexer object and hand back whatever it returns.
sub finish {
    my ($self) = @_;

    my $logger = $self->_get_logger();
    $logger->info("finish index writing to disk");

    return $self->{invindex}->finish;
}
214
215 =head2 convert
216
217 my $utf8_string = $self->convert('string in codepage');
218
219 =cut
220
# Convert a string of octets from the object's configured encoding
# ($self->{encoding}) to UTF-8 and return the converted copy.
#
# Returns nothing (undef in scalar context, empty list in list context)
# when the input is undefined or an empty string.  The string "0" is a
# legitimate value and is converted like any other text.
sub convert {
    my ($self, $text) = @_;

    return unless (defined $text && length $text);

    # from_to() rewrites its first argument in place; $text is our own copy
    from_to($text, $self->{encoding}, 'UTF-8');

    return $text;
}
228
229 =head1 AUTHOR
230
231 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
232
233 =head1 COPYRIGHT & LICENSE
234
235 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
236
237 This program is free software; you can redistribute it and/or modify it
238 under the same terms as Perl itself.
239
240 =cut
241
242 1; # End of WebPAC::Output::KinoSearch

  ViewVC Help
Powered by ViewVC 1.1.26