/[webpac2]/trunk/lib/WebPAC/Output/SWISH.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Output/SWISH.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1247 - (hide annotations)
Wed Jul 22 12:53:07 2009 UTC (14 years, 10 months ago) by dpavlin
File size: 4573 byte(s)
hopefully fix utf-8 encoding when sending it to swish

- remove accented characters before indexing (FIXME: done anyway by swish?)
- convert data to JSON with utf8 encoding
- remove explicit encode
- use bytes to calculate length

1 dpavlin 1145 package WebPAC::Output::SWISH;
2    
3     use warnings;
4     use strict;
5    
6 dpavlin 1147 use lib 'lib';
7 dpavlin 1145
8 dpavlin 1147 use base qw/WebPAC::Common Class::Accessor/;
9     __PACKAGE__->mk_accessors(qw(
10     database
11 dpavlin 1156 input
12 dpavlin 1147 type
13    
14     index_path
15     ));
16    
17 dpavlin 1151 use File::Path qw/mkpath/;
18 dpavlin 1145 use Data::Dump qw/dump/;
19     use YAML;
20 dpavlin 1149 use JSON;
21 dpavlin 1247 #use Encode qw/encode encode_utf8 is_utf8/;
22     use Text::Unaccent::PurePerl qw/unac_string/;
23 dpavlin 1145
24 dpavlin 1147
25 dpavlin 1145 =head1 NAME
26    
27     WebPAC::Output::SWISH - Create swish-e full text index
28    
29     =cut
30    
31     our $VERSION = '0.01';
32    
33     =head1 SYNOPSIS
34    
35     Create full text index using swish-e indexer from data with
36     type C<search>.
37    
38     =head1 FUNCTIONS
39    
40     =head2 new
41    
42 dpavlin 1147 my $out = new WebPAC::Output::SWISH({
43 dpavlin 1145 database => 'demo',
44 dpavlin 1147 });
45 dpavlin 1145
46     Options are:
47    
48     =over 4
49    
50     =item database
51    
52     name of database from which data comes
53    
54     =back
55    
56     Name of database will be used to form URI of documents in index.
57    
58     =cut
59    
60     our $dir = 'var/swish';
61    
62 dpavlin 1147 sub init {
63     my $self = shift;
64 dpavlin 1145
65     my $log = $self->_get_logger;
66    
67 dpavlin 1147 my $database = $self->database || $log->logdie("need database");
68 dpavlin 1145
69 dpavlin 1151 mkpath $dir if ! -e $dir;
70 dpavlin 1145
71     my $path = "$dir/$database.conf";
72    
73     open(my $conf, '>', $path) || die "can't open $path: $!";
74    
75     print $conf <<"DEFAULT_SWISH_CONF";
76     # swish-e config file for $database
77    
78     IndexDir stdin
79    
80     # input file definition
81     DefaultContents XML*
82    
83     # indexed metatags
84     MetaNames xml swishdocpath
85    
86    
87     #XMLClassAttributes type
88     UndefinedMetaTags auto
89     UndefinedXMLAttributes auto
90    
91     IndexFile $dir/$database
92    
93     # Croatian ISO-8859-2 characters to unaccented equivalents
94 dpavlin 1147 #TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz
95 dpavlin 1145
96 dpavlin 1149 # store data into index
97     PropertyNames data
98 dpavlin 1145
99     # disable output
100     ParserWarnLevel 0
101     IndexReport 1
102    
103     DEFAULT_SWISH_CONF
104    
105     close($conf) || die "can't write config $path: $!";
106    
107 dpavlin 1147 $self->index_path( "$dir/$database" );
108 dpavlin 1145
109     my $swish = "swish-e -S prog -c $path";
110 dpavlin 1147 open( $self->{_swish_fh}, '|-', $swish ) || die "can't open pipe to $swish: $!";
111 dpavlin 1145
112 dpavlin 1147 $log->info( "created $path ", -s $path, " bytes for ", $self->index_path );
113    
114 dpavlin 1145 $self->{stats} = {};
115    
116     $self ? return $self : return undef;
117     }
118    
119     =head2
120    
121     my $path = $out->index_path;
122    
123     =head2 add
124    
125 dpavlin 1147 $out->add( 42, $ds );
126 dpavlin 1145
127     =cut
128    
129     my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
130     my $escape_re = join '|' => keys %escape;
131    
132     sub add {
133 dpavlin 1147 my ($self,$id,$ds) = @_;
134 dpavlin 1145
135 dpavlin 1165 die "need input" unless $self->input;
136    
137 dpavlin 1145 my $log = $self->_get_logger;
138 dpavlin 1147 $log->debug("id: $id ds = ",sub { dump($ds) });
139 dpavlin 1145
140 dpavlin 1147 my $database = $self->database || $log->logconfess('no database in $self');
141 dpavlin 1145
142 dpavlin 1156 my $uri = $self->database . '/' . $self->input . "/$id";
143 dpavlin 1145 $log->debug("creating $uri");
144    
145     # filter all tags which have type defined
146 dpavlin 1147 my $type = $self->type || 'search';
147 dpavlin 1145 my @tags = grep {
148 dpavlin 1147 ref($ds->{$_}) eq 'HASH' && defined( $ds->{$_}->{$type} )
149     } keys %{ $ds };
150 dpavlin 1145
151     $log->debug("tags = ", join(",", @tags));
152    
153     return unless (@tags);
154    
155 dpavlin 1161 my $xml = qq{<all>};
156 dpavlin 1167 my $data;
157 dpavlin 1145
158 dpavlin 1167 foreach ( 'database', 'input' ) {
159     $xml .= "<$_>" . $self->$_ . "</$_>";
160     $data->{$_} = $self->$_;
161     }
162 dpavlin 1156
163 dpavlin 1145 foreach my $tag (@tags) {
164    
165 dpavlin 1147 my $r = ref $ds->{$tag}->{$type};
166     die "tag $tag type $type not ARRAY but '$r' = ",dump( $ds->{$tag}->{$type} ) unless $r eq 'ARRAY';
167 dpavlin 1145
168 dpavlin 1147 my $vals = join(" ", @{ $ds->{$tag}->{$type} });
169    
170 dpavlin 1145 next if ! $vals;
171    
172     $vals =~ s/($escape_re)/$escape{$1}/gs;
173 dpavlin 1247 $data->{$tag} = $vals;
174     $vals = unac_string( $vals );
175    
176 dpavlin 1154 # BW & EW are our markers for tag boundry
177     $xml .= qq{<$tag><![CDATA[BW $vals EW]]></$tag>};
178 dpavlin 1247 # $xml .= qq{<!-- } . is_utf8( $vals ) . qq{!>};
179 dpavlin 1145
180     $self->{stats}->{attr}->{$tag}++;
181 dpavlin 1165 $self->{stats}->{input}->{ $self->input }->{$tag}++;
182 dpavlin 1149
183 dpavlin 1145 }
184    
185 dpavlin 1149 # serialize to JSON instead of YAML because we will loose whitespace
186 dpavlin 1247 $data = to_json($data, {utf8=>1});
187 dpavlin 1149 $xml .= qq{<data><![CDATA[$data]]></data>};
188    
189 dpavlin 1161 $xml .= qq{</all>\n};
190 dpavlin 1145
191 dpavlin 1247 # $xml = encode('utf-8', $xml);
192 dpavlin 1179
193 dpavlin 1247 use bytes;
194 dpavlin 1145 my $len = length($xml);
195    
196 dpavlin 1147 my $fh = $self->{_swish_fh} || die "_swish_fh missing";
197 dpavlin 1145
198 dpavlin 1147 print $fh "Path-Name: $uri\nContent-Length: $len\nDocument-Type: XML\n\n$xml" or
199     die "can't add $uri: $@\n$xml";
200 dpavlin 1145
201 dpavlin 1154 $log->debug( $xml );
202 dpavlin 1149
203 dpavlin 1145 return 1;
204     }
205    
206     =head2 finish
207    
208     Dump attributes used on disk
209    
210     =cut
211    
212     sub finish {
213     my $self = shift;
214     my $log = $self->_get_logger();
215    
216     my $path = $dir . '/' . $self->{database} . '.yaml';
217     YAML::DumpFile( $path, $self->{stats} );
218     $log->info("created $path ", -s $path, " bytes");
219     $log->debug( dump( $self->{stats} ) );
220 dpavlin 1147
221     close( $self->{_swish_fh} ) || die "can't close index ", $self->index_path, ": $!";
222 dpavlin 1145 }
223    
224     =head1 AUTHOR
225    
226     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
227    
228     =head1 COPYRIGHT & LICENSE
229    
230     Copyright 2004-2009 Dobrica Pavlinusic, All Rights Reserved.
231    
232     This program is free software; you can redistribute it and/or modify it
233     under the same terms as Perl itself.
234    
235     =cut
236    
237     1;

  ViewVC Help
Powered by ViewVC 1.1.26