/[webpac2]/trunk/lib/WebPAC/Output/SWISH.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Output/SWISH.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1179 - (show annotations)
Wed Apr 29 18:14:17 2009 UTC (15 years ago) by dpavlin
File size: 4399 byte(s)
 r1860@llin:  dpavlin | 2009-04-29 20:11:35 +0200
 encode outgoing xml for swish-e in utf-8

1 package WebPAC::Output::SWISH;
2
3 use warnings;
4 use strict;
5
6 use lib 'lib';
7
8 use base qw/WebPAC::Common Class::Accessor/;
9 __PACKAGE__->mk_accessors(qw(
10 database
11 input
12 type
13
14 index_path
15 ));
16
17 use File::Path qw/mkpath/;
18 use Data::Dump qw/dump/;
19 use YAML;
20 use JSON;
21 use Encode qw/encode/;
22
23
24 =head1 NAME
25
26 WebPAC::Output::SWISH - Create swish-e full text index
27
28 =cut
29
30 our $VERSION = '0.01';
31
32 =head1 SYNOPSIS
33
34 Create full text index using swish-e indexer from data with
35 type C<search>.
36
37 =head1 FUNCTIONS
38
39 =head2 new
40
41 my $out = new WebPAC::Output::SWISH({
42 database => 'demo',
43 });
44
45 Options are:
46
47 =over 4
48
49 =item database
50
51 name of database from which data comes
52
53 =back
54
55 Name of database will be used to form URI of documents in index.
56
57 =cut
58
59 our $dir = 'var/swish';
60
61 sub init {
62 my $self = shift;
63
64 my $log = $self->_get_logger;
65
66 my $database = $self->database || $log->logdie("need database");
67
68 mkpath $dir if ! -e $dir;
69
70 my $path = "$dir/$database.conf";
71
72 open(my $conf, '>', $path) || die "can't open $path: $!";
73
74 print $conf <<"DEFAULT_SWISH_CONF";
75 # swish-e config file for $database
76
77 IndexDir stdin
78
79 # input file definition
80 DefaultContents XML*
81
82 # indexed metatags
83 MetaNames xml swishdocpath
84
85
86 #XMLClassAttributes type
87 UndefinedMetaTags auto
88 UndefinedXMLAttributes auto
89
90 IndexFile $dir/$database
91
92 # Croatian ISO-8859-2 characters to unaccented equivalents
93 #TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz
94
95 # store data into index
96 PropertyNames data
97
98 # disable output
99 ParserWarnLevel 0
100 IndexReport 1
101
102 DEFAULT_SWISH_CONF
103
104 close($conf) || die "can't write config $path: $!";
105
106 $self->index_path( "$dir/$database" );
107
108 my $swish = "swish-e -S prog -c $path";
109 open( $self->{_swish_fh}, '|-', $swish ) || die "can't open pipe to $swish: $!";
110
111 $log->info( "created $path ", -s $path, " bytes for ", $self->index_path );
112
113 $self->{stats} = {};
114
115 $self ? return $self : return undef;
116 }
117
118 =head2
119
120 my $path = $out->index_path;
121
122 =head2 add
123
124 $out->add( 42, $ds );
125
126 =cut
127
128 my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
129 my $escape_re = join '|' => keys %escape;
130
131 sub add {
132 my ($self,$id,$ds) = @_;
133
134 die "need input" unless $self->input;
135
136 my $log = $self->_get_logger;
137 $log->debug("id: $id ds = ",sub { dump($ds) });
138
139 my $database = $self->database || $log->logconfess('no database in $self');
140
141 my $uri = $self->database . '/' . $self->input . "/$id";
142 $log->debug("creating $uri");
143
144 # filter all tags which have type defined
145 my $type = $self->type || 'search';
146 my @tags = grep {
147 ref($ds->{$_}) eq 'HASH' && defined( $ds->{$_}->{$type} )
148 } keys %{ $ds };
149
150 $log->debug("tags = ", join(",", @tags));
151
152 return unless (@tags);
153
154 my $xml = qq{<all>};
155 my $data;
156
157 foreach ( 'database', 'input' ) {
158 $xml .= "<$_>" . $self->$_ . "</$_>";
159 $data->{$_} = $self->$_;
160 }
161
162 foreach my $tag (@tags) {
163
164 my $r = ref $ds->{$tag}->{$type};
165 die "tag $tag type $type not ARRAY but '$r' = ",dump( $ds->{$tag}->{$type} ) unless $r eq 'ARRAY';
166
167 my $vals = join(" ", @{ $ds->{$tag}->{$type} });
168
169 next if ! $vals;
170
171 $vals =~ s/($escape_re)/$escape{$1}/gs;
172 # BW & EW are our markers for tag boundry
173 $xml .= qq{<$tag><![CDATA[BW $vals EW]]></$tag>};
174
175 $self->{stats}->{attr}->{$tag}++;
176 $self->{stats}->{input}->{ $self->input }->{$tag}++;
177
178 $data->{$tag} = $vals;
179 }
180
181 # serialize to JSON instead of YAML because we will loose whitespace
182 $data = to_json($data);
183 $xml .= qq{<data><![CDATA[$data]]></data>};
184
185 $xml .= qq{</all>\n};
186
187 $xml = encode('utf-8', $xml);
188
189 my $len = length($xml);
190
191 my $fh = $self->{_swish_fh} || die "_swish_fh missing";
192
193 print $fh "Path-Name: $uri\nContent-Length: $len\nDocument-Type: XML\n\n$xml" or
194 die "can't add $uri: $@\n$xml";
195
196 $log->debug( $xml );
197
198 return 1;
199 }
200
201 =head2 finish
202
203 Dump attributes used on disk
204
205 =cut
206
207 sub finish {
208 my $self = shift;
209 my $log = $self->_get_logger();
210
211 my $path = $dir . '/' . $self->{database} . '.yaml';
212 YAML::DumpFile( $path, $self->{stats} );
213 $log->info("created $path ", -s $path, " bytes");
214 $log->debug( dump( $self->{stats} ) );
215
216 close( $self->{_swish_fh} ) || die "can't close index ", $self->index_path, ": $!";
217 }
218
219 =head1 AUTHOR
220
221 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
222
223 =head1 COPYRIGHT & LICENSE
224
225 Copyright 2004-2009 Dobrica Pavlinusic, All Rights Reserved.
226
227 This program is free software; you can redistribute it and/or modify it
228 under the same terms as Perl itself.
229
230 =cut
231
232 1;

  ViewVC Help
Powered by ViewVC 1.1.26