9 |
use KinoSearch::Analysis::PolyAnalyzer; |
use KinoSearch::Analysis::PolyAnalyzer; |
10 |
use Encode qw/from_to/; |
use Encode qw/from_to/; |
11 |
use Data::Dumper; |
use Data::Dumper; |
12 |
|
use Storable; |
13 |
|
|
14 |
=head1 NAME |
=head1 NAME |
15 |
|
|
17 |
|
|
18 |
=head1 VERSION |
=head1 VERSION |
19 |
|
|
20 |
Version 0.01 |
Version 0.03 |
21 |
|
|
22 |
=cut |
=cut |
23 |
|
|
24 |
our $VERSION = '0.01'; |
our $VERSION = '0.03'; |
25 |
|
|
26 |
=head1 SYNOPSIS |
=head1 SYNOPSIS |
27 |
|
|
90 |
|
|
91 |
$self->{encoding} ||= 'ISO-8859-2'; |
$self->{encoding} ||= 'ISO-8859-2'; |
92 |
|
|
93 |
$log->info("using index $self->{index_path} with encoding $self->{encoding}"); |
$self->{index_path} .= '/' . $self->{database}; |
94 |
|
|
95 |
|
$self->{clean} = 1 if (! -e $self->{index_path} . '/segments'); |
96 |
|
|
97 |
|
$log->info("using", $self->{clean} ? ' new' : '', " index $self->{index_path} with encoding $self->{encoding}"); |
98 |
|
|
99 |
my $analyzer = KinoSearch::Analysis::PolyAnalyzer->new( language => 'en' ); |
my $analyzer = KinoSearch::Analysis::PolyAnalyzer->new( language => 'en' ); |
100 |
|
|
104 |
analyzer => $analyzer, |
analyzer => $analyzer, |
105 |
); |
); |
106 |
|
|
107 |
|
my $fields_path = $self->{index_path} . '/fields.storable'; |
108 |
|
$fields_path =~ s#//#/#g; |
109 |
|
if (-e $fields_path) { |
110 |
|
$self->{fields} = retrieve($fields_path) || |
111 |
|
$log->warn("can't open $fields_path: $!"); |
112 |
|
} else { |
113 |
|
$log->error("This will be dummy run since no fields statistics are found!"); |
114 |
|
$log->error("You will have to re-run indexing to get search results!"); |
115 |
|
$self->{dummy_run} = 1; |
116 |
|
} |
117 |
|
$self->{fields_path} = $fields_path; |
118 |
|
|
119 |
foreach my $f (@{ $self->{fields} }) { |
foreach my $f (@{ $self->{fields} }) { |
120 |
$self->{invindex}->spec_field( |
$self->{invindex}->spec_field( |
121 |
name => $f, |
name => $f, |
172 |
|
|
173 |
my $doc = $self->{invindex}->new_doc( $uri ) || $log->logdie("can't create new_doc( $uri )"); |
my $doc = $self->{invindex}->new_doc( $uri ) || $log->logdie("can't create new_doc( $uri )"); |
174 |
|
|
175 |
# Record one field value on a document that is being indexed.
#
# Arguments:
#   $self - indexer object; its value_usage and dummy_run slots are used
#   $log  - logger object providing a warn method
#   $doc  - document object providing a set_value method
#   $n    - field name
#   $v    - field value
#
# Undefined and empty values are ignored. Any other value increments the
# per-field counter in $self->{value_usage}. Unless $self->{dummy_run} is
# set, the value is passed through $self->convert and stored on $doc.
# A failure while converting or storing is logged as a warning and is
# never propagated to the caller.
sub _add_value($$$$$) {
	my ($self,$log,$doc,$n,$v) = @_;

	# test definedness and length rather than truth, so a value of "0"
	# is still counted and indexed
	return unless (defined $v && length $v);

	$self->{value_usage}->{$n}++;
	return if ($self->{dummy_run});

	# the eval block yields 1 only when set_value completed, so the
	# failure check does not depend on $@ surviving untouched
	my $stored = eval { $doc->set_value($n, $self->convert($v) ); 1 };
	$log->warn("can't insert: $n = $v") unless ($stored);
}
185 |
|
|
186 |
add_value($self,$log,$doc, 'uri', $uri); |
_add_value($self,$log,$doc, 'uri', $uri); |
187 |
|
|
188 |
$log->debug("ds = ", sub { Dumper($args->{'ds'}) } ); |
$log->debug("ds = ", sub { Dumper($args->{'ds'}) } ); |
189 |
|
|
205 |
$vals = $self->convert( $vals ) or |
$vals = $self->convert( $vals ) or |
206 |
$log->logdie("can't convert '$vals' to UTF-8"); |
$log->logdie("can't convert '$vals' to UTF-8"); |
207 |
|
|
208 |
add_value($self, $log, $doc, $tag, $vals ); |
_add_value($self, $log, $doc, $tag, $vals ); |
209 |
} |
} |
210 |
|
|
211 |
if (my $text = $args->{'text'}) { |
if (my $text = $args->{'text'}) { |
212 |
add_value($self, $log, $doc, 'bodytext', $text ); |
_add_value($self, $log, $doc, 'bodytext', $text ); |
213 |
} |
} |
214 |
|
|
215 |
#$log->debug("adding ", sub { $doc->dump_draft } ); |
#$log->debug("adding ", sub { $doc->dump_draft } ); |
229 |
# Flush the index to disk, then save the list of field names seen so far.
#
# Calls finish on $self->{invindex}, merges the field names kept in
# $self->{fields} (those from the last run) into $self->{value_usage}, and
# writes the resulting list of keys to $self->{fields_path} with Storable.
#
# Returns the result of store (true on success). A failed write is logged
# as a warning instead of being raised.
sub finish {
	my $self = shift;

	my $log = $self->_get_logger();

	$log->info("finish index writing to disk");
	$self->{invindex}->finish;

	$log->info("writing value usage file");

	# add fields from last run; anything that is not an array reference
	# contributes no names instead of aborting the write
	my $previous = ref( $self->{fields} ) eq 'ARRAY' ? $self->{fields} : [];
	$self->{value_usage}->{$_}++ for @{ $previous };

	my @fields = keys %{ $self->{value_usage} };

	# store() is called with explicit parentheses: written as
	# "store LIST || ..." the || applies to the path argument, so the
	# warning could never fire. The eval converts an exception raised by
	# store into the same warning.
	my $stored = eval { store( \@fields, $self->{fields_path} ) };
	$log->warn("can't write $self->{fields_path}: " . ($@ || $!)) unless ($stored);

	return $stored;
}
247 |
|
|
248 |
=head2 convert |
=head2 convert |