/[webpac2]/trunk/lib/WebPAC/Output/SWISH.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Output/SWISH.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1154 - (hide annotations)
Sat Apr 25 10:59:43 2009 UTC (15 years, 1 month ago) by dpavlin
File size: 4116 byte(s)
 r1815@llin:  dpavlin | 2009-04-25 12:59:36 +0200
 implement BW EW tokens to mark boundries of attributes

1 dpavlin 1145 package WebPAC::Output::SWISH;
2    
3     use warnings;
4     use strict;
5    
6 dpavlin 1147 use lib 'lib';
7 dpavlin 1145
8 dpavlin 1147 use base qw/WebPAC::Common Class::Accessor/;
9     __PACKAGE__->mk_accessors(qw(
10     database
11     type
12    
13     index_path
14     ));
15    
16 dpavlin 1151 use File::Path qw/mkpath/;
17 dpavlin 1145 use Data::Dump qw/dump/;
18     use YAML;
19 dpavlin 1149 use JSON;
20 dpavlin 1145
21 dpavlin 1147
22 dpavlin 1145 =head1 NAME
23    
24     WebPAC::Output::SWISH - Create swish-e full text index
25    
26     =cut
27    
28     our $VERSION = '0.01';
29    
30     =head1 SYNOPSIS
31    
32     Create full text index using swish-e indexer from data with
33     type C<search>.
34    
35     =head1 FUNCTIONS
36    
37     =head2 new
38    
39 dpavlin 1147 my $out = new WebPAC::Output::SWISH({
40 dpavlin 1145 database => 'demo',
41 dpavlin 1147 });
42 dpavlin 1145
43     Options are:
44    
45     =over 4
46    
47     =item database
48    
49     name of database from which data comes
50    
51     =back
52    
53     Name of database will be used to form URI of documents in index.
54    
55     =cut
56    
57     our $dir = 'var/swish';
58    
59 dpavlin 1147 sub init {
60     my $self = shift;
61 dpavlin 1145
62     my $log = $self->_get_logger;
63    
64 dpavlin 1147 my $database = $self->database || $log->logdie("need database");
65 dpavlin 1145
66 dpavlin 1151 mkpath $dir if ! -e $dir;
67 dpavlin 1145
68     my $path = "$dir/$database.conf";
69    
70     open(my $conf, '>', $path) || die "can't open $path: $!";
71    
72     print $conf <<"DEFAULT_SWISH_CONF";
73     # swish-e config file for $database
74    
75     IndexDir stdin
76    
77     # input file definition
78     DefaultContents XML*
79    
80     # indexed metatags
81     MetaNames xml swishdocpath
82    
83    
84     #XMLClassAttributes type
85     UndefinedMetaTags auto
86     UndefinedXMLAttributes auto
87    
88     IndexFile $dir/$database
89    
90     # Croatian ISO-8859-2 characters to unaccented equivalents
91 dpavlin 1147 #TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz
92 dpavlin 1145
93 dpavlin 1149 # store data into index
94     PropertyNames data
95 dpavlin 1145
96     # disable output
97     ParserWarnLevel 0
98     IndexReport 1
99    
100     DEFAULT_SWISH_CONF
101    
102     close($conf) || die "can't write config $path: $!";
103    
104 dpavlin 1147 $self->index_path( "$dir/$database" );
105 dpavlin 1145
106     my $swish = "swish-e -S prog -c $path";
107 dpavlin 1147 open( $self->{_swish_fh}, '|-', $swish ) || die "can't open pipe to $swish: $!";
108 dpavlin 1145
109 dpavlin 1147 $log->info( "created $path ", -s $path, " bytes for ", $self->index_path );
110    
111 dpavlin 1145 $self->{stats} = {};
112    
113     $self ? return $self : return undef;
114     }
115    
116     =head2
117    
118     my $path = $out->index_path;
119    
120     =head2 add
121    
122 dpavlin 1147 $out->add( 42, $ds );
123 dpavlin 1145
124     =cut
125    
126     my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
127     my $escape_re = join '|' => keys %escape;
128    
129     sub add {
130 dpavlin 1147 my ($self,$id,$ds) = @_;
131 dpavlin 1145
132     my $log = $self->_get_logger;
133 dpavlin 1147 $log->debug("id: $id ds = ",sub { dump($ds) });
134 dpavlin 1145
135 dpavlin 1147 my $database = $self->database || $log->logconfess('no database in $self');
136 dpavlin 1145
137 dpavlin 1147 my $uri = $self->database . "/$id";
138 dpavlin 1145 $log->debug("creating $uri");
139    
140     # filter all tags which have type defined
141 dpavlin 1147 my $type = $self->type || 'search';
142 dpavlin 1145 my @tags = grep {
143 dpavlin 1147 ref($ds->{$_}) eq 'HASH' && defined( $ds->{$_}->{$type} )
144     } keys %{ $ds };
145 dpavlin 1145
146     $log->debug("tags = ", join(",", @tags));
147    
148     return unless (@tags);
149    
150     my $xml = qq{<xml>};
151    
152 dpavlin 1149 my $data;
153    
154 dpavlin 1145 foreach my $tag (@tags) {
155    
156 dpavlin 1147 my $r = ref $ds->{$tag}->{$type};
157     die "tag $tag type $type not ARRAY but '$r' = ",dump( $ds->{$tag}->{$type} ) unless $r eq 'ARRAY';
158 dpavlin 1145
159 dpavlin 1147 my $vals = join(" ", @{ $ds->{$tag}->{$type} });
160    
161 dpavlin 1145 next if ! $vals;
162    
163     $vals =~ s/($escape_re)/$escape{$1}/gs;
164 dpavlin 1154 # BW & EW are our markers for tag boundry
165     $xml .= qq{<$tag><![CDATA[BW $vals EW]]></$tag>};
166 dpavlin 1145
167     $self->{stats}->{attr}->{$tag}++;
168 dpavlin 1149
169     $data->{$tag} = $vals;
170 dpavlin 1145 }
171    
172 dpavlin 1149 # serialize to JSON instead of YAML because we will loose whitespace
173     $data = to_json($data);
174     $xml .= qq{<data><![CDATA[$data]]></data>};
175    
176 dpavlin 1145 $xml .= qq{</xml>\n};
177    
178     my $len = length($xml);
179    
180 dpavlin 1147 my $fh = $self->{_swish_fh} || die "_swish_fh missing";
181 dpavlin 1145
182 dpavlin 1147 print $fh "Path-Name: $uri\nContent-Length: $len\nDocument-Type: XML\n\n$xml" or
183     die "can't add $uri: $@\n$xml";
184 dpavlin 1145
185 dpavlin 1154 $log->debug( $xml );
186 dpavlin 1149
187 dpavlin 1145 return 1;
188     }
189    
190     =head2 finish
191    
192     Dump attributes used on disk
193    
194     =cut
195    
196     sub finish {
197     my $self = shift;
198     my $log = $self->_get_logger();
199    
200     my $path = $dir . '/' . $self->{database} . '.yaml';
201     YAML::DumpFile( $path, $self->{stats} );
202     $log->info("created $path ", -s $path, " bytes");
203     $log->debug( dump( $self->{stats} ) );
204 dpavlin 1147
205     close( $self->{_swish_fh} ) || die "can't close index ", $self->index_path, ": $!";
206 dpavlin 1145 }
207    
208     =head1 AUTHOR
209    
210     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
211    
212     =head1 COPYRIGHT & LICENSE
213    
214     Copyright 2004-2009 Dobrica Pavlinusic, All Rights Reserved.
215    
216     This program is free software; you can redistribute it and/or modify it
217     under the same terms as Perl itself.
218    
219     =cut
220    
221     1;

  ViewVC Help
Powered by ViewVC 1.1.26