/[webpac2]/trunk/lib/WebPAC/Output/KinoSearch.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Output/KinoSearch.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 432 - (hide annotations)
Mon Apr 17 16:01:02 2006 UTC (18 years, 1 month ago) by dpavlin
File size: 4378 byte(s)
 r523@llin:  dpavlin | 2006-04-17 17:18:35 +0200
 warn about missing fields when inserting into index

1 dpavlin 431 package WebPAC::Output::KinoSearch;
2    
3     use warnings;
4     use strict;
5    
6     use base qw/WebPAC::Common/;
7    
8     use KinoSearch::InvIndexer;
9     use KinoSearch::Analysis::PolyAnalyzer;
10     use Encode qw/from_to/;
11     use Data::Dumper;
12    
13     =head1 NAME
14    
15     WebPAC::Output::KinoSearch - Create KinoSearch full text index
16    
17     =head1 VERSION
18    
19     Version 0.01
20    
21     =cut
22    
23     our $VERSION = '0.01';
24    
25     =head1 SYNOPSIS
26    
27     Create full text index using KinoSearch index from data with
28     type C<search>.
29    
30     =head1 FUNCTIONS
31    
32     =head2 new
33    
34     Open KinoSearch index
35    
36     my $est = new WebPAC::Output::KinoSearch(
37     index_path => '/path/to/invindex',
38     fields => [ qw/name of all fields used/ ],
39     database => 'demo',
40     label => 'node label',
41     encoding => 'iso-8859-2',
42     clean => 1,
43     );
44    
45     Options are:
46    
47     =over 4
48    
49     =item index_path
50    
51     path to KinoSearch index to use
52    
53     =item fields
54    
55     name of all fields used in this index
56    
57     =item database
58    
59     name of database from which data comes
60    
61     =item label
62    
63     label for node (optional)
64    
65     =item encoding
66    
67     character encoding of C<data_structure> if it's different from C<ISO-8859-2>
68     (and it probably is). This encoding will be converted to C<UTF-8> for
69     index.
70    
71     =back
72    
73     =cut
74    
# Constructor: opens a KinoSearch inverted index and declares its fields.
#
# Arguments are key => value pairs and are stored verbatim in the object:
#   index_path - path handed to KinoSearch::InvIndexer as `invindex` (required)
#   fields     - ARRAY reference of field names to declare (required)
#   database   - database name (required)
#   encoding   - source encoding; defaults to 'ISO-8859-2' when false
#   clean      - handed to KinoSearch::InvIndexer as its `create` argument
#
# Calls $log->logdie when a required argument is missing or when `fields`
# is not an ARRAY reference. Returns the blessed object.
sub new {
    my $class = shift;
    my $self = {@_};
    bless($self, $class);

    my $log = $self->_get_logger;

    #$log->debug("self: ", sub { Dumper($self) });

    foreach my $p (qw/index_path fields database/) {
        $log->logdie("need $p") unless ($self->{$p});
    }

    $log->logdie("fields is not ARRAY") unless (ref($self->{fields}) eq 'ARRAY');

    $self->{encoding} ||= 'ISO-8859-2';

    $log->info("using index $self->{index_path} with encoding $self->{encoding}");

    my $analyzer = KinoSearch::Analysis::PolyAnalyzer->new( language => 'en' );

    $self->{invindex} = KinoSearch::InvIndexer->new(
        invindex => $self->{index_path},
        create   => $self->{clean},
        analyzer => $analyzer,
    );

    # every field is declared the same way: stored and indexed, not vectorized
    foreach my $f (@{ $self->{fields} }) {
        $self->{invindex}->spec_field(
            name       => $f,
            # boost    => 10,
            stored     => 1,
            indexed    => 1,
            vectorized => 0,
        );
    }

    # $self is a blessed reference and therefore always true, so the old
    # "$self ? return $self : return undef" had an unreachable branch.
    return $self;
}
114    
115    
116     =head2 add
117    
118     Adds one entry to database.
119    
120     $est->add(
121     id => 42,
122     ds => $ds,
123     type => 'display',
124     text => 'optional text from which snippet is created',
125     );
126    
127     This function will create entries in index using following URI format:
128    
129     C<file:///type/database%20name/000>
130    
131     Each tag in C<data_structure> with specified C<type> will create one
132     attribute and corresponding hidden text (used for search).
133    
134     =cut
135    
# Adds one document to the index.
#
# Arguments (key => value pairs):
#   id   - record identifier (required, must be true)
#   ds   - data structure hash; for each tag whose value is a HASH with a
#          defined entry under `type`, that entry is dereferenced as an
#          ARRAY, joined with spaces and stored under the tag name (required)
#   type - which entry of each tag to index (required)
#   text - optional text stored in the 'bodytext' field
#
# The document URI is "file:///$type/$database/$id" and is stored in the
# 'uri' field. Returns nothing (without adding the document) when no tag in
# `ds` has the requested type; returns 1 otherwise.
sub add {
    my $self = shift;

    my $args = {@_};

    my $log = $self->_get_logger;

    my $database = $self->{'database'} || $log->logconfess('no database in $self');
    $log->logconfess('need invindex in object') unless ($self->{'invindex'});

    foreach my $p (qw/id ds type/) {
        $log->logdie("need $p") unless ($args->{$p});
    }

    my $type = $args->{'type'};
    my $id = $args->{'id'};

    my $uri = "file:///$type/$database/$id";
    $log->debug("creating $uri");

    my $doc = $self->{invindex}->new_doc( $uri ) || $log->logdie("can't create new_doc( $uri )");

    # This must be an anonymous closure. A named sub nested here would bind
    # $doc, $self and $log from the FIRST call of add() only ("Variable will
    # not stay shared"), so later calls would write into the wrong document.
    # Values are converted to UTF-8 here, exactly once; a failing set_value
    # is only warned about.
    my $add_value = sub {
        my ($n, $v) = @_;
        eval { $doc->set_value($n, $self->convert($v) ) };
        $log->warn("can't insert: $n = $v") if ($@);
    };

    $add_value->('uri', $uri);

    $log->debug("ds = ", sub { Dumper($args->{'ds'}) } );

    # filter all tags which have type defined
    my @tags = grep {
        ref($args->{'ds'}->{$_}) eq 'HASH' && defined( $args->{'ds'}->{$_}->{$type} )
    } keys %{ $args->{'ds'} };

    $log->debug("tags = ", join(",", @tags));

    return unless (@tags);

    foreach my $tag (@tags) {

        my $vals = join(" ", @{ $args->{'ds'}->{$tag}->{$type} });

        next if (! $vals);

        # no convert() here: $add_value converts, and converting twice
        # would double-encode every non-ASCII character
        $add_value->( $tag, $vals );
    }

    if (my $text = $args->{'text'}) {
        $add_value->( 'bodytext', $text );
    }

    #$log->debug("adding ", sub { $doc->dump_draft } );
    $self->{invindex}->add_doc($doc) || $log->warn("can't add document $uri");

    return 1;
}
198    
199    
200     =head2 convert
201    
202     my $utf8_string = $self->convert('string in codepage');
203    
204     =cut
205    
# Re-encodes a string from the object's configured encoding to UTF-8.
# A false argument (undef, '' or '0') yields an empty return; otherwise
# the converted copy is returned and the caller's value is left untouched.
sub convert {
    my ($self, $text) = @_;

    return unless $text;

    # from_to rewrites its first argument in place, so it works on our copy
    from_to($text, $self->{encoding}, 'UTF-8');

    return $text;
}
213    
214     =head1 AUTHOR
215    
216     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
217    
218     =head1 COPYRIGHT & LICENSE
219    
220     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
221    
222     This program is free software; you can redistribute it and/or modify it
223     under the same terms as Perl itself.
224    
225     =cut
226    
227     1; # End of WebPAC::Output::KinoSearch

  ViewVC Help
Powered by ViewVC 1.1.26