/[webpac2]/trunk/lib/WebPAC/Output/SWISH.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Output/SWISH.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1147 - (hide annotations)
Thu Apr 23 13:20:51 2009 UTC (15 years ago) by dpavlin
File size: 3817 byte(s)
 r1802@llin:  dpavlin | 2009-04-23 15:20:49 +0200
 more changes to make indexing of WebPAC::Output::SWISH work
 (but not tests yet)

1 dpavlin 1145 package WebPAC::Output::SWISH;
2    
3     use warnings;
4     use strict;
5    
6 dpavlin 1147 use lib 'lib';
7 dpavlin 1145
8 dpavlin 1147 use base qw/WebPAC::Common Class::Accessor/;
9     __PACKAGE__->mk_accessors(qw(
10     database
11     type
12    
13     index_path
14     ));
15    
16 dpavlin 1145 use File::Path qw/make_path/;
17     use Data::Dump qw/dump/;
18     use YAML;
19    
20 dpavlin 1147
21 dpavlin 1145 =head1 NAME
22    
23     WebPAC::Output::SWISH - Create swish-e full text index
24    
25     =cut
26    
27     our $VERSION = '0.01';
28    
29     =head1 SYNOPSIS
30    
31     Create full text index using swish-e indexer from data with
32     type C<search>.
33    
34     =head1 FUNCTIONS
35    
36     =head2 new
37    
38 dpavlin 1147 my $out = new WebPAC::Output::SWISH({
39 dpavlin 1145 database => 'demo',
40 dpavlin 1147 });
41 dpavlin 1145
42     Options are:
43    
44     =over 4
45    
46     =item database
47    
48     name of database from which data comes
49    
50     =back
51    
52     Name of database will be used to form URI of documents in index.
53    
54     =cut
55    
56     our $dir = 'var/swish';
57    
58 dpavlin 1147 sub init {
59     my $self = shift;
60 dpavlin 1145
61     my $log = $self->_get_logger;
62    
63 dpavlin 1147 my $database = $self->database || $log->logdie("need database");
64 dpavlin 1145
65 dpavlin 1147 make_path $dir if ! -e $dir;
66 dpavlin 1145
67     my $path = "$dir/$database.conf";
68    
69     open(my $conf, '>', $path) || die "can't open $path: $!";
70    
71     print $conf <<"DEFAULT_SWISH_CONF";
72     # swish-e config file for $database
73    
74     IndexDir stdin
75    
76     # input file definition
77     DefaultContents XML*
78    
79     # indexed metatags
80     MetaNames xml swishdocpath
81    
82    
83     #XMLClassAttributes type
84     UndefinedMetaTags auto
85     UndefinedXMLAttributes auto
86    
87     IndexFile $dir/$database
88    
89     # Croatian ISO-8859-2 characters to unaccented equivalents
90 dpavlin 1147 #TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz
91 dpavlin 1145
92    
93     # disable output
94     ParserWarnLevel 0
95     IndexReport 1
96    
97     DEFAULT_SWISH_CONF
98    
99     close($conf) || die "can't write config $path: $!";
100    
101 dpavlin 1147 $self->index_path( "$dir/$database" );
102 dpavlin 1145
103     my $swish = "swish-e -S prog -c $path";
104 dpavlin 1147 open( $self->{_swish_fh}, '|-', $swish ) || die "can't open pipe to $swish: $!";
105 dpavlin 1145
106 dpavlin 1147 $log->info( "created $path ", -s $path, " bytes for ", $self->index_path );
107    
108 dpavlin 1145 $self->{stats} = {};
109    
110     $self ? return $self : return undef;
111     }
112    
113     =head2
114    
115     my $path = $out->index_path;
116    
117     =head2 add
118    
119 dpavlin 1147 $out->add( 42, $ds );
120 dpavlin 1145
121     =cut
122    
123     my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
124     my $escape_re = join '|' => keys %escape;
125    
126     sub add {
127 dpavlin 1147 my ($self,$id,$ds) = @_;
128 dpavlin 1145
129     my $log = $self->_get_logger;
130 dpavlin 1147 $log->debug("id: $id ds = ",sub { dump($ds) });
131 dpavlin 1145
132 dpavlin 1147 my $database = $self->database || $log->logconfess('no database in $self');
133 dpavlin 1145
134 dpavlin 1147 my $uri = $self->database . "/$id";
135 dpavlin 1145 $log->debug("creating $uri");
136    
137     # filter all tags which have type defined
138 dpavlin 1147 my $type = $self->type || 'search';
139 dpavlin 1145 my @tags = grep {
140 dpavlin 1147 ref($ds->{$_}) eq 'HASH' && defined( $ds->{$_}->{$type} )
141     } keys %{ $ds };
142 dpavlin 1145
143     $log->debug("tags = ", join(",", @tags));
144    
145     return unless (@tags);
146    
147     my $xml = qq{<xml>};
148    
149     foreach my $tag (@tags) {
150    
151 dpavlin 1147 my $r = ref $ds->{$tag}->{$type};
152     die "tag $tag type $type not ARRAY but '$r' = ",dump( $ds->{$tag}->{$type} ) unless $r eq 'ARRAY';
153 dpavlin 1145
154 dpavlin 1147 my $vals = join(" ", @{ $ds->{$tag}->{$type} });
155    
156 dpavlin 1145 next if ! $vals;
157    
158     $vals =~ s/($escape_re)/$escape{$1}/gs;
159     $xml .= qq{<$tag><![CDATA[$vals]]></$tag>};
160    
161     $self->{stats}->{attr}->{$tag}++;
162     }
163    
164     $xml .= qq{</xml>\n};
165    
166     my $len = length($xml);
167    
168 dpavlin 1147 my $fh = $self->{_swish_fh} || die "_swish_fh missing";
169 dpavlin 1145
170 dpavlin 1147 print $fh "Path-Name: $uri\nContent-Length: $len\nDocument-Type: XML\n\n$xml" or
171     die "can't add $uri: $@\n$xml";
172 dpavlin 1145
173     return 1;
174     }
175    
176     =head2 finish
177    
178     Dump attributes used on disk
179    
180     =cut
181    
182     sub finish {
183     my $self = shift;
184     my $log = $self->_get_logger();
185    
186     my $path = $dir . '/' . $self->{database} . '.yaml';
187     YAML::DumpFile( $path, $self->{stats} );
188     $log->info("created $path ", -s $path, " bytes");
189     $log->debug( dump( $self->{stats} ) );
190 dpavlin 1147
191     close( $self->{_swish_fh} ) || die "can't close index ", $self->index_path, ": $!";
192 dpavlin 1145 }
193    
194     =head1 AUTHOR
195    
196     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
197    
198     =head1 COPYRIGHT & LICENSE
199    
200     Copyright 2004-2009 Dobrica Pavlinusic, All Rights Reserved.
201    
202     This program is free software; you can redistribute it and/or modify it
203     under the same terms as Perl itself.
204    
205     =cut
206    
207     1;

  ViewVC Help
Powered by ViewVC 1.1.26