/[webpac2]/trunk/lib/WebPAC/Output/KinoSearch.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Output/KinoSearch.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 987 - (hide annotations)
Sun Nov 4 11:58:11 2007 UTC (16 years, 6 months ago) by dpavlin
File size: 3453 byte(s)
 r1513@llin:  dpavlin | 2007-11-04 12:58:09 +0100
 dump field usage counts in debug log

1 dpavlin 431 package WebPAC::Output::KinoSearch;
2    
3     use warnings;
4     use strict;
5    
6 dpavlin 914 use base qw/WebPAC::Common WebPAC::Output Class::Accessor/;
7     __PACKAGE__->mk_accessors(qw(
8     path
9     database
10 dpavlin 939 input
11 dpavlin 914 encoding
12     clean
13 dpavlin 431
14 dpavlin 914 index
15     ));
16    
17     use KinoSearch::Simple;
18     use File::Path;
19 dpavlin 949 use Encode qw/decode/;
20 dpavlin 887 use Data::Dump qw/dump/;
21 dpavlin 536 use Storable;
22 dpavlin 431
23     =head1 NAME
24    
25     WebPAC::Output::KinoSearch - Create KinoSearch full text index
26    
27     =head1 VERSION
28    
29 dpavlin 919 Version 0.05
30 dpavlin 431
31     =cut
32    
33 dpavlin 919 our $VERSION = '0.05';
34 dpavlin 431
35     =head1 SYNOPSIS
36    
37     Create full text index using KinoSearch index from data with
38     type C<search>.
39    
40     =head1 FUNCTIONS
41    
42     =head2 new
43    
44     Open KinoSearch index
45    
46 dpavlin 917 my $out = new WebPAC::Output::KinoSearch({
47 dpavlin 914 path => '/path/to/invindex',
48 dpavlin 431 database => 'demo',
49     encoding => 'iso-8859-2',
50     clean => 1,
51 dpavlin 914 });
52 dpavlin 431
53     Options are:
54    
55     =over 4
56    
57 dpavlin 914 =item path
58 dpavlin 431
59     path to KinoSearch index to use
60    
61     =item database
62    
63     name of database from which data comes
64    
65     =item encoding
66    
67     character encoding of C<data_structure> if it's differenet than C<ISO-8859-2>
68     (and it probably is). This encoding will be converted to C<UTF-8> for
69     index.
70    
71     =back
72    
73 dpavlin 917 =head2 init
74    
75     $out->init;
76    
77 dpavlin 431 =cut
78    
79 dpavlin 914 sub init {
80     my $self = shift;
81 dpavlin 431
82     my $log = $self->_get_logger;
83    
84 dpavlin 887 #$log->debug("self: ", sub { dump($self) });
85 dpavlin 431
86 dpavlin 914 foreach my $p (qw/path database/) {
87     $log->logdie("need $p") unless ($self->$p);
88 dpavlin 431 }
89    
90 dpavlin 914 # $log->logdie("fields is not ARRAY") unless (ref($self->{fields}) eq 'ARRAY');
91 dpavlin 431
92 dpavlin 914 $self->encoding( 'ISO-8859-2' ) unless $self->encoding;
93 dpavlin 431
94 dpavlin 939 ## FIXME we shouldn't re-create whole KinoSearch index every time!
95     $self->clean( 1 );
96    
97 dpavlin 914 if ( ! -e $self->path ) {
98     mkpath $self->path || $log->logdie("can't create ", $self->path,": $!");
99     $log->info("created ", $self->path);
100 dpavlin 919 } elsif ( $self->clean ) {
101     $log->info("removing existing ", $self->path);
102     rmtree $self->path || $log->logdie("can't remove ", $self->path,": $!");
103     mkpath $self->path || $log->logdie("can't create ", $self->path,": $!");
104 dpavlin 914 }
105 dpavlin 610
106 dpavlin 914 my $path = $self->path . '/' . $self->database;
107 dpavlin 431
108 dpavlin 914 $log->info("using index $path with encoding ", $self->encoding);
109 dpavlin 609
110 dpavlin 914 my $index = KinoSearch::Simple->new(
111     path => $path,
112     language => 'en',
113 dpavlin 431 );
114    
115 dpavlin 914 $log->logdie("can't open $path: $!") unless $index;
116 dpavlin 536
117 dpavlin 914 $self->index( $index );
118 dpavlin 431
119     }
120    
121    
122     =head2 add
123    
124 dpavlin 914 Adds one entry
125 dpavlin 431
126 dpavlin 917 $out->add( 42, $ds );
127 dpavlin 431
128     =cut
129    
130     sub add {
131     my $self = shift;
132    
133 dpavlin 914 my ( $id, $ds ) = @_;
134 dpavlin 431
135     my $log = $self->_get_logger;
136 dpavlin 914 $log->logdie("need id") unless defined $id;
137     $log->logdie("need ds") unless $ds;
138 dpavlin 431
139 dpavlin 914 my $hash = $self->ds_to_hash( $ds, 'search' ) || return;
140 dpavlin 431
141 dpavlin 939 $hash->{id} ||= $id;
142 dpavlin 919 $hash->{database} ||= $self->database;
143 dpavlin 939 $hash->{input} ||= $self->input;
144 dpavlin 431
145 dpavlin 924 foreach my $f ( keys %$hash ) {
146     if ( ref($hash->{$f}) eq 'ARRAY' ) {
147     $hash->{$f} = join(' <*> ', @{ $hash->{$f} });
148     }
149 dpavlin 949 # $hash->{$f} = decode( $self->encoding, $hash->{$f} );
150 dpavlin 987 $self->{field_count}->{$f}++;
151 dpavlin 924 }
152    
153 dpavlin 919 $log->debug("add( $id, ", sub { dump($ds) }," ) => ", sub { dump( $hash ) });
154    
155 dpavlin 914 $self->index->add_doc( $hash );
156 dpavlin 431
157 dpavlin 922 $self->{count}++;
158    
159 dpavlin 431 return 1;
160     }
161    
162 dpavlin 434 =head2 finish
163 dpavlin 431
164 dpavlin 434 Close index
165    
166 dpavlin 917 $out->finish;
167 dpavlin 434
168     =cut
169    
170     sub finish {
171     my $self = shift;
172    
173 dpavlin 536 my $log = $self->_get_logger();
174    
175 dpavlin 922 $log->info("indexed ", $self->{count}, " records");
176 dpavlin 536
177 dpavlin 987 $log->debug("field usage: ", dump( $self->{field_count} ));
178    
179 dpavlin 434 }
180    
181 dpavlin 431 =head1 AUTHOR
182    
183     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
184    
185     =head1 COPYRIGHT & LICENSE
186    
187 dpavlin 914 Copyright 2005-2007 Dobrica Pavlinusic, All Rights Reserved.
188 dpavlin 431
189     This program is free software; you can redistribute it and/or modify it
190     under the same terms as Perl itself.
191    
192     =cut
193    
194     1; # End of WebPAC::Output::Estraier

  ViewVC Help
Powered by ViewVC 1.1.26