/[webpac2]/trunk/lib/WebPAC/Output/SWISH.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Output/SWISH.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1161 - (hide annotations)
Sat Apr 25 14:18:06 2009 UTC (15 years ago) by dpavlin
File size: 4216 byte(s)
rename top-level tag to all so we can have quieries
like (all=something) in swish-e

1 dpavlin 1145 package WebPAC::Output::SWISH;
2    
3     use warnings;
4     use strict;
5    
6 dpavlin 1147 use lib 'lib';
7 dpavlin 1145
8 dpavlin 1147 use base qw/WebPAC::Common Class::Accessor/;
9     __PACKAGE__->mk_accessors(qw(
10     database
11 dpavlin 1156 input
12 dpavlin 1147 type
13    
14     index_path
15     ));
16    
17 dpavlin 1151 use File::Path qw/mkpath/;
18 dpavlin 1145 use Data::Dump qw/dump/;
19     use YAML;
20 dpavlin 1149 use JSON;
21 dpavlin 1145
22 dpavlin 1147
23 dpavlin 1145 =head1 NAME
24    
25     WebPAC::Output::SWISH - Create swish-e full text index
26    
27     =cut
28    
29     our $VERSION = '0.01';
30    
31     =head1 SYNOPSIS
32    
33     Create full text index using swish-e indexer from data with
34     type C<search>.
35    
36     =head1 FUNCTIONS
37    
38     =head2 new
39    
40 dpavlin 1147 my $out = new WebPAC::Output::SWISH({
41 dpavlin 1145 database => 'demo',
42 dpavlin 1147 });
43 dpavlin 1145
44     Options are:
45    
46     =over 4
47    
48     =item database
49    
50     name of database from which data comes
51    
52     =back
53    
54     Name of database will be used to form URI of documents in index.
55    
56     =cut
57    
58     our $dir = 'var/swish';
59    
60 dpavlin 1147 sub init {
61     my $self = shift;
62 dpavlin 1145
63     my $log = $self->_get_logger;
64    
65 dpavlin 1147 my $database = $self->database || $log->logdie("need database");
66 dpavlin 1145
67 dpavlin 1151 mkpath $dir if ! -e $dir;
68 dpavlin 1145
69     my $path = "$dir/$database.conf";
70    
71     open(my $conf, '>', $path) || die "can't open $path: $!";
72    
73     print $conf <<"DEFAULT_SWISH_CONF";
74     # swish-e config file for $database
75    
76     IndexDir stdin
77    
78     # input file definition
79     DefaultContents XML*
80    
81     # indexed metatags
82     MetaNames xml swishdocpath
83    
84    
85     #XMLClassAttributes type
86     UndefinedMetaTags auto
87     UndefinedXMLAttributes auto
88    
89     IndexFile $dir/$database
90    
91     # Croatian ISO-8859-2 characters to unaccented equivalents
92 dpavlin 1147 #TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz
93 dpavlin 1145
94 dpavlin 1149 # store data into index
95     PropertyNames data
96 dpavlin 1145
97     # disable output
98     ParserWarnLevel 0
99     IndexReport 1
100    
101     DEFAULT_SWISH_CONF
102    
103     close($conf) || die "can't write config $path: $!";
104    
105 dpavlin 1147 $self->index_path( "$dir/$database" );
106 dpavlin 1145
107     my $swish = "swish-e -S prog -c $path";
108 dpavlin 1147 open( $self->{_swish_fh}, '|-', $swish ) || die "can't open pipe to $swish: $!";
109 dpavlin 1145
110 dpavlin 1147 $log->info( "created $path ", -s $path, " bytes for ", $self->index_path );
111    
112 dpavlin 1145 $self->{stats} = {};
113    
114     $self ? return $self : return undef;
115     }
116    
117     =head2
118    
119     my $path = $out->index_path;
120    
121     =head2 add
122    
123 dpavlin 1147 $out->add( 42, $ds );
124 dpavlin 1145
125     =cut
126    
127     my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
128     my $escape_re = join '|' => keys %escape;
129    
130     sub add {
131 dpavlin 1147 my ($self,$id,$ds) = @_;
132 dpavlin 1145
133     my $log = $self->_get_logger;
134 dpavlin 1147 $log->debug("id: $id ds = ",sub { dump($ds) });
135 dpavlin 1145
136 dpavlin 1147 my $database = $self->database || $log->logconfess('no database in $self');
137 dpavlin 1145
138 dpavlin 1156 my $uri = $self->database . '/' . $self->input . "/$id";
139 dpavlin 1145 $log->debug("creating $uri");
140    
141     # filter all tags which have type defined
142 dpavlin 1147 my $type = $self->type || 'search';
143 dpavlin 1145 my @tags = grep {
144 dpavlin 1147 ref($ds->{$_}) eq 'HASH' && defined( $ds->{$_}->{$type} )
145     } keys %{ $ds };
146 dpavlin 1145
147     $log->debug("tags = ", join(",", @tags));
148    
149     return unless (@tags);
150    
151 dpavlin 1161 my $xml = qq{<all>};
152 dpavlin 1145
153 dpavlin 1156 $xml .= "<$_>" . $self->$_ . "</$_>" foreach ( 'database', 'input' );
154    
155 dpavlin 1149 my $data;
156    
157 dpavlin 1145 foreach my $tag (@tags) {
158    
159 dpavlin 1147 my $r = ref $ds->{$tag}->{$type};
160     die "tag $tag type $type not ARRAY but '$r' = ",dump( $ds->{$tag}->{$type} ) unless $r eq 'ARRAY';
161 dpavlin 1145
162 dpavlin 1147 my $vals = join(" ", @{ $ds->{$tag}->{$type} });
163    
164 dpavlin 1145 next if ! $vals;
165    
166     $vals =~ s/($escape_re)/$escape{$1}/gs;
167 dpavlin 1154 # BW & EW are our markers for tag boundry
168     $xml .= qq{<$tag><![CDATA[BW $vals EW]]></$tag>};
169 dpavlin 1145
170     $self->{stats}->{attr}->{$tag}++;
171 dpavlin 1149
172     $data->{$tag} = $vals;
173 dpavlin 1145 }
174    
175 dpavlin 1149 # serialize to JSON instead of YAML because we will loose whitespace
176     $data = to_json($data);
177     $xml .= qq{<data><![CDATA[$data]]></data>};
178    
179 dpavlin 1161 $xml .= qq{</all>\n};
180 dpavlin 1145
181     my $len = length($xml);
182    
183 dpavlin 1147 my $fh = $self->{_swish_fh} || die "_swish_fh missing";
184 dpavlin 1145
185 dpavlin 1147 print $fh "Path-Name: $uri\nContent-Length: $len\nDocument-Type: XML\n\n$xml" or
186     die "can't add $uri: $@\n$xml";
187 dpavlin 1145
188 dpavlin 1154 $log->debug( $xml );
189 dpavlin 1149
190 dpavlin 1145 return 1;
191     }
192    
193     =head2 finish
194    
195     Dump attributes used on disk
196    
197     =cut
198    
199     sub finish {
200     my $self = shift;
201     my $log = $self->_get_logger();
202    
203     my $path = $dir . '/' . $self->{database} . '.yaml';
204     YAML::DumpFile( $path, $self->{stats} );
205     $log->info("created $path ", -s $path, " bytes");
206     $log->debug( dump( $self->{stats} ) );
207 dpavlin 1147
208     close( $self->{_swish_fh} ) || die "can't close index ", $self->index_path, ": $!";
209 dpavlin 1145 }
210    
211     =head1 AUTHOR
212    
213     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
214    
215     =head1 COPYRIGHT & LICENSE
216    
217     Copyright 2004-2009 Dobrica Pavlinusic, All Rights Reserved.
218    
219     This program is free software; you can redistribute it and/or modify it
220     under the same terms as Perl itself.
221    
222     =cut
223    
224     1;

  ViewVC Help
Powered by ViewVC 1.1.26