/[webpac2]/trunk/lib/WebPAC/Output/SWISH.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Output/SWISH.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1179 - (hide annotations)
Wed Apr 29 18:14:17 2009 UTC (15 years ago) by dpavlin
File size: 4399 byte(s)
 r1860@llin:  dpavlin | 2009-04-29 20:11:35 +0200
 encode outgoing xml for swish-e in utf-8

1 dpavlin 1145 package WebPAC::Output::SWISH;
2    
3     use warnings;
4     use strict;
5    
6 dpavlin 1147 use lib 'lib';
7 dpavlin 1145
8 dpavlin 1147 use base qw/WebPAC::Common Class::Accessor/;
9     __PACKAGE__->mk_accessors(qw(
10     database
11 dpavlin 1156 input
12 dpavlin 1147 type
13    
14     index_path
15     ));
16    
17 dpavlin 1151 use File::Path qw/mkpath/;
18 dpavlin 1145 use Data::Dump qw/dump/;
19     use YAML;
20 dpavlin 1149 use JSON;
21 dpavlin 1179 use Encode qw/encode/;
22 dpavlin 1145
23 dpavlin 1147
24 dpavlin 1145 =head1 NAME
25    
26     WebPAC::Output::SWISH - Create swish-e full text index
27    
28     =cut
29    
30     our $VERSION = '0.01';
31    
32     =head1 SYNOPSIS
33    
34     Create full text index using swish-e indexer from data with
35     type C<search>.
36    
37     =head1 FUNCTIONS
38    
39     =head2 new
40    
41 dpavlin 1147 my $out = new WebPAC::Output::SWISH({
42 dpavlin 1145 database => 'demo',
43 dpavlin 1147 });
44 dpavlin 1145
45     Options are:
46    
47     =over 4
48    
49     =item database
50    
51     name of database from which data comes
52    
53     =back
54    
55     Name of database will be used to form URI of documents in index.
56    
57     =cut
58    
59     our $dir = 'var/swish';
60    
61 dpavlin 1147 sub init {
62     my $self = shift;
63 dpavlin 1145
64     my $log = $self->_get_logger;
65    
66 dpavlin 1147 my $database = $self->database || $log->logdie("need database");
67 dpavlin 1145
68 dpavlin 1151 mkpath $dir if ! -e $dir;
69 dpavlin 1145
70     my $path = "$dir/$database.conf";
71    
72     open(my $conf, '>', $path) || die "can't open $path: $!";
73    
74     print $conf <<"DEFAULT_SWISH_CONF";
75     # swish-e config file for $database
76    
77     IndexDir stdin
78    
79     # input file definition
80     DefaultContents XML*
81    
82     # indexed metatags
83     MetaNames xml swishdocpath
84    
85    
86     #XMLClassAttributes type
87     UndefinedMetaTags auto
88     UndefinedXMLAttributes auto
89    
90     IndexFile $dir/$database
91    
92     # Croatian ISO-8859-2 characters to unaccented equivalents
93 dpavlin 1147 #TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz
94 dpavlin 1145
95 dpavlin 1149 # store data into index
96     PropertyNames data
97 dpavlin 1145
98     # disable output
99     ParserWarnLevel 0
100     IndexReport 1
101    
102     DEFAULT_SWISH_CONF
103    
104     close($conf) || die "can't write config $path: $!";
105    
106 dpavlin 1147 $self->index_path( "$dir/$database" );
107 dpavlin 1145
108     my $swish = "swish-e -S prog -c $path";
109 dpavlin 1147 open( $self->{_swish_fh}, '|-', $swish ) || die "can't open pipe to $swish: $!";
110 dpavlin 1145
111 dpavlin 1147 $log->info( "created $path ", -s $path, " bytes for ", $self->index_path );
112    
113 dpavlin 1145 $self->{stats} = {};
114    
115     $self ? return $self : return undef;
116     }
117    
118     =head2
119    
120     my $path = $out->index_path;
121    
122     =head2 add
123    
124 dpavlin 1147 $out->add( 42, $ds );
125 dpavlin 1145
126     =cut
127    
128     my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
129     my $escape_re = join '|' => keys %escape;
130    
131     sub add {
132 dpavlin 1147 my ($self,$id,$ds) = @_;
133 dpavlin 1145
134 dpavlin 1165 die "need input" unless $self->input;
135    
136 dpavlin 1145 my $log = $self->_get_logger;
137 dpavlin 1147 $log->debug("id: $id ds = ",sub { dump($ds) });
138 dpavlin 1145
139 dpavlin 1147 my $database = $self->database || $log->logconfess('no database in $self');
140 dpavlin 1145
141 dpavlin 1156 my $uri = $self->database . '/' . $self->input . "/$id";
142 dpavlin 1145 $log->debug("creating $uri");
143    
144     # filter all tags which have type defined
145 dpavlin 1147 my $type = $self->type || 'search';
146 dpavlin 1145 my @tags = grep {
147 dpavlin 1147 ref($ds->{$_}) eq 'HASH' && defined( $ds->{$_}->{$type} )
148     } keys %{ $ds };
149 dpavlin 1145
150     $log->debug("tags = ", join(",", @tags));
151    
152     return unless (@tags);
153    
154 dpavlin 1161 my $xml = qq{<all>};
155 dpavlin 1167 my $data;
156 dpavlin 1145
157 dpavlin 1167 foreach ( 'database', 'input' ) {
158     $xml .= "<$_>" . $self->$_ . "</$_>";
159     $data->{$_} = $self->$_;
160     }
161 dpavlin 1156
162 dpavlin 1145 foreach my $tag (@tags) {
163    
164 dpavlin 1147 my $r = ref $ds->{$tag}->{$type};
165     die "tag $tag type $type not ARRAY but '$r' = ",dump( $ds->{$tag}->{$type} ) unless $r eq 'ARRAY';
166 dpavlin 1145
167 dpavlin 1147 my $vals = join(" ", @{ $ds->{$tag}->{$type} });
168    
169 dpavlin 1145 next if ! $vals;
170    
171     $vals =~ s/($escape_re)/$escape{$1}/gs;
172 dpavlin 1154 # BW & EW are our markers for tag boundry
173     $xml .= qq{<$tag><![CDATA[BW $vals EW]]></$tag>};
174 dpavlin 1145
175     $self->{stats}->{attr}->{$tag}++;
176 dpavlin 1165 $self->{stats}->{input}->{ $self->input }->{$tag}++;
177 dpavlin 1149
178     $data->{$tag} = $vals;
179 dpavlin 1145 }
180    
181 dpavlin 1149 # serialize to JSON instead of YAML because we will loose whitespace
182     $data = to_json($data);
183     $xml .= qq{<data><![CDATA[$data]]></data>};
184    
185 dpavlin 1161 $xml .= qq{</all>\n};
186 dpavlin 1145
187 dpavlin 1179 $xml = encode('utf-8', $xml);
188    
189 dpavlin 1145 my $len = length($xml);
190    
191 dpavlin 1147 my $fh = $self->{_swish_fh} || die "_swish_fh missing";
192 dpavlin 1145
193 dpavlin 1147 print $fh "Path-Name: $uri\nContent-Length: $len\nDocument-Type: XML\n\n$xml" or
194     die "can't add $uri: $@\n$xml";
195 dpavlin 1145
196 dpavlin 1154 $log->debug( $xml );
197 dpavlin 1149
198 dpavlin 1145 return 1;
199     }
200    
201     =head2 finish
202    
203     Dump attributes used on disk
204    
205     =cut
206    
207     sub finish {
208     my $self = shift;
209     my $log = $self->_get_logger();
210    
211     my $path = $dir . '/' . $self->{database} . '.yaml';
212     YAML::DumpFile( $path, $self->{stats} );
213     $log->info("created $path ", -s $path, " bytes");
214     $log->debug( dump( $self->{stats} ) );
215 dpavlin 1147
216     close( $self->{_swish_fh} ) || die "can't close index ", $self->index_path, ": $!";
217 dpavlin 1145 }
218    
219     =head1 AUTHOR
220    
221     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
222    
223     =head1 COPYRIGHT & LICENSE
224    
225     Copyright 2004-2009 Dobrica Pavlinusic, All Rights Reserved.
226    
227     This program is free software; you can redistribute it and/or modify it
228     under the same terms as Perl itself.
229    
230     =cut
231    
232     1;

  ViewVC Help
Powered by ViewVC 1.1.26