WebPAC/Normalize/XML.pm

package WebPAC::Normalise::XML;

use warnings;
use strict;

use base qw/WebPAC::Common/;
use Storable;

=head1 NAME

WebPAC::Normalise::XML - apply XML normalisaton rules

=head1 VERSION

Version 0.01

=cut

our $VERSION = '0.01';

=head1 SYNOPSIS

This module uses C<conf/normalize/*.xml> files to perform normalisation
from input records

    use WebPAC::Normalise::XML;

    my $foo = WebPAC::Normalise::XML->new();
    ...

=cut

# mapping between data type and tag which specify
# format in XML file
my %type2tag = (
        'isis' => 'isis',
#       'excel' => 'column',
#       'marc' => 'marc',
#       'feed' => 'feed'
);


=head1 EXPORT

A list of functions that can be exported.  You can delete this section
if you don't export anything, such as for a purely object-oriented module.

=head1 FUNCTIONS

=head2 new

Create new instance of WebPAC using configuration specified by C<config_file>.

 my $n = new WebPAC::Normalize::XML(
        cache_data_structure => './cache/ds/',
 }

Optional parameter C<cache_data_structure> defines path to directory
in which cache file for C<data_structure> call will be created.

=cut

sub new {
        my $class = shift;
        my $self = {@_};
        bless($self, $class);

        $self->setup_cache_dir( $self->{'cache_data_structure'} );

        return $self;
}

=head2 open_import_xml

Read file from C<import_xml/> directory and parse it.

 $webpac->open_import_xml(type => 'isis');

=cut

sub open_import_xml {
        my $self = shift;

        my $log = $self->_get_logger();

        my $arg = {@_};
        $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});

        $self->{'type'} = $arg->{'type'};

        my $type_base = $arg->{'type'};
        $type_base =~ s/_.*$//g;

        $self->{'tag'} = $type2tag{$type_base};

        $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");

        my $f = "./import_xml/".$self->{'type'}.".xml";
        $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");

        $log->info("reading '$f'");

        $self->{'import_xml_file'} = $f;

        $self->{'import_xml'} = XMLin($f,
                ForceArray => [ $self->{'tag'}, 'config', 'format' ],
        );

        $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });

}

=head2 setup_cache_dir

Check if specified cache directory exist, and if not, disable caching.

 $setup_cache_dir('./cache/ds/');

If you pass false or zero value to this function, it will disable
cacheing.

=cut

sub setup_cache_dir {
        my $self = shift;

        my $dir = shift;

        my $log = $self->_get_logger();

        if ($dir) {
                my $msg;
                if (! -e $dir) {
                        $msg = "doesn't exist";
                } elsif (! -d $dir) {
                        $msg = "is not directory";
                } elsif (! -w $dir) {
                        $msg = "not writable";
                }

                if ($msg) {
                        undef $self->{'cache_data_structure'};
                        $log->warn("cache_data_structure $dir $msg, disabling...");
                } else {
                        $log->debug("using cache dir $dir");
                }
        } else {
                $log->debug("disabling cache");
                undef $self->{'cache_data_structure'};
        }
}


=head2 data_structure

Create in-memory data structure which represents layout from C<import_xml>.
It is used later to produce output.

 my @ds = $webpac->data_structure($rec);

This method will also set C<$webpac->{'currnet_filename'}> if there is
<filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
<headline> tag.

=cut

sub data_structure {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift;
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        my $cache_file;

        if (my $cache_path = $self->{'cache_data_structure'}) {
                my $id = $rec->{'000'};
                $id = $rec->{'000'}->[0] if ($id =~ m/^ARRAY/o);
                unless (defined($id)) {
                        $log->warn("Can't use cache_data_structure on records without unique identifier in field 000");
                        undef $self->{'cache_data_structure'};
                } else {
                        $cache_file = "$cache_path/$id";
                        if (-r $cache_file) {
                                my $ds_ref = retrieve($cache_file);
                                if ($ds_ref) {
                                        $log->debug("cache hit: $cache_file");
                                        my $ok = 1;
                                        foreach my $f (qw(current_filename headline)) {
                                                if ($ds_ref->{$f}) {
                                                        $self->{$f} = $ds_ref->{$f};
                                                } else {
                                                        $ok = 0;
                                                }
                                        };
                                        if ($ok && $ds_ref->{'ds'}) {
                                                return @{ $ds_ref->{'ds'} };
                                        } else {
                                                $log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!");
                                                undef $self->{'cache_data_structure'};
                                        }
                                }
                        }
                }
        }

        undef $self->{'currnet_filename'};
        undef $self->{'headline'};

        my @sorted_tags;
        if ($self->{tags_by_order}) {
                @sorted_tags = @{$self->{tags_by_order}};
        } else {
                @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
                $self->{tags_by_order} = \@sorted_tags;
        }

        my @ds;

        $log->debug("tags: ",sub { join(", ",@sorted_tags) });

        foreach my $field (@sorted_tags) {

                my $row;

#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});

                foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
                        my $format = $tag->{'value'} || $tag->{'content'};

                        $log->debug("format: $format");

                        my @v;
                        # FIXME this is a cludge!
                        if ($format =~ /$WebPAC::Lookup::LOOKUP_REGEX/o) {
                                @v = $self->fill_in_to_arr($rec,$format);
                        } else {
                                @v = $self->parse_to_arr($rec,$format);
                        }
                        next if (! @v);

                        if ($tag->{'sort'}) {
                                @v = $self->sort_arr(@v);
                        }

                        # use format?
                        if ($tag->{'format_name'}) {
                                @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
                        }

                        if ($field eq 'filename') {
                                $self->{'current_filename'} = join('',@v);
                                $log->debug("filename: ",$self->{'current_filename'});
                        } elsif ($field eq 'headline') {
                                $self->{'headline'} .= join('',@v);
                                $log->debug("headline: ",$self->{'headline'});
                                next; # don't return headline in data_structure!
                        }

                        # delimiter will join repeatable fields
                        if ($tag->{'delimiter'}) {
                                @v = ( join($tag->{'delimiter'}, @v) );
                        }

                        # default types 
                        my @types = qw(display swish);
                        # override by type attribute
                        @types = ( $tag->{'type'} ) if ($tag->{'type'});

                        foreach my $type (@types) {
                                # append to previous line?
                                $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
                                if ($tag->{'append'}) {

                                        # I will delimit appended part with
                                        # delimiter (or ,)
                                        my $d = $tag->{'delimiter'};
                                        # default delimiter
                                        $d ||= " ";

                                        my $last = pop @{$row->{$type}};
                                        $d = "" if (! $last);
                                        $last .= $d . join($d, @v);
                                        push @{$row->{$type}}, $last;

                                } else {
                                        push @{$row->{$type}}, @v;
                                }
                        }


                }

                if ($row) {
                        $row->{'tag'} = $field;

                        # TODO: name_sigular, name_plural
                        my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
                        $row->{'name'} = $name ? $self->_x($name) : $field;

                        # post-sort all values in field
                        if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
                                $log->warn("sort at field tag not implemented");
                        }

                        push @ds, $row;

                        $log->debug("row $field: ",sub { Dumper($row) });
                }

        }

        if ($cache_file) {
                store {
                        ds => \@ds,
                        current_filename => $self->{'current_filename'},
                        headline => $self->{'headline'},
                }, $cache_file;
                $log->debug("created storable cache file $cache_file");
        }

        return @ds;

}


=head1 AUTHOR

Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>

=head1 COPYRIGHT & LICENSE

Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

1; # End of WebPAC::Normalise::XML
1	package WebPAC::Normalise::XML;
2
3	use warnings;
4	use strict;
5
6	use base qw/WebPAC::Common/;
7	use Storable;
8
9	=head1 NAME
10
11	WebPAC::Normalise::XML - apply XML normalisaton rules
12
13	=head1 VERSION
14
15	Version 0.01
16
17	=cut
18
19	our $VERSION = '0.01';
20
21	=head1 SYNOPSIS
22
23	This module uses C<conf/normalize/*.xml> files to perform normalisation
24	from input records
25
26	use WebPAC::Normalise::XML;
27
28	my $foo = WebPAC::Normalise::XML->new();
29	...
30
31	=cut
32
33	# mapping between data type and tag which specify
34	# format in XML file
35	my %type2tag = (
36	'isis' => 'isis',
37	# 'excel' => 'column',
38	# 'marc' => 'marc',
39	# 'feed' => 'feed'
40	);
41
42
43	=head1 EXPORT
44
45	A list of functions that can be exported. You can delete this section
46	if you don't export anything, such as for a purely object-oriented module.
47
48	=head1 FUNCTIONS
49
50	=head2 new
51
52	Create new instance of WebPAC using configuration specified by C<config_file>.
53
54	my $n = new WebPAC::Normalize::XML(
55	cache_data_structure => './cache/ds/',
56	}
57
58	Optional parameter C<cache_data_structure> defines path to directory
59	in which cache file for C<data_structure> call will be created.
60
61	=cut
62
63	sub new {
64	my $class = shift;
65	my $self = {@_};
66	bless($self, $class);
67
68	$self->setup_cache_dir( $self->{'cache_data_structure'} );
69
70	return $self;
71	}
72
73	=head2 open_import_xml
74
75	Read file from C<import_xml/> directory and parse it.
76
77	$webpac->open_import_xml(type => 'isis');
78
79	=cut
80
81	sub open_import_xml {
82	my $self = shift;
83
84	my $log = $self->_get_logger();
85
86	my $arg = {@_};
87	$log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
88
89	$self->{'type'} = $arg->{'type'};
90
91	my $type_base = $arg->{'type'};
92	$type_base =~ s/_.*$//g;
93
94	$self->{'tag'} = $type2tag{$type_base};
95
96	$log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
97
98	my $f = "./import_xml/".$self->{'type'}.".xml";
99	$log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
100
101	$log->info("reading '$f'");
102
103	$self->{'import_xml_file'} = $f;
104
105	$self->{'import_xml'} = XMLin($f,
106	ForceArray => [ $self->{'tag'}, 'config', 'format' ],
107	);
108
109	$log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
110
111	}
112
113	=head2 setup_cache_dir
114
115	Check if specified cache directory exist, and if not, disable caching.
116
117	$setup_cache_dir('./cache/ds/');
118
119	If you pass false or zero value to this function, it will disable
120	cacheing.
121
122	=cut
123
124	sub setup_cache_dir {
125	my $self = shift;
126
127	my $dir = shift;
128
129	my $log = $self->_get_logger();
130
131	if ($dir) {
132	my $msg;
133	if (! -e $dir) {
134	$msg = "doesn't exist";
135	} elsif (! -d $dir) {
136	$msg = "is not directory";
137	} elsif (! -w $dir) {
138	$msg = "not writable";
139	}
140
141	if ($msg) {
142	undef $self->{'cache_data_structure'};
143	$log->warn("cache_data_structure $dir $msg, disabling...");
144	} else {
145	$log->debug("using cache dir $dir");
146	}
147	} else {
148	$log->debug("disabling cache");
149	undef $self->{'cache_data_structure'};
150	}
151	}
152
153
154	=head2 data_structure
155
156	Create in-memory data structure which represents layout from C<import_xml>.
157	It is used later to produce output.
158
159	my @ds = $webpac->data_structure($rec);
160
161	This method will also set C<$webpac->{'currnet_filename'}> if there is
162	<filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
163	<headline> tag.
164
165	=cut
166
167	sub data_structure {
168	my $self = shift;
169
170	my $log = $self->_get_logger();
171
172	my $rec = shift;
173	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
174
175	my $cache_file;
176
177	if (my $cache_path = $self->{'cache_data_structure'}) {
178	my $id = $rec->{'000'};
179	$id = $rec->{'000'}->[0] if ($id =~ m/^ARRAY/o);
180	unless (defined($id)) {
181	$log->warn("Can't use cache_data_structure on records without unique identifier in field 000");
182	undef $self->{'cache_data_structure'};
183	} else {
184	$cache_file = "$cache_path/$id";
185	if (-r $cache_file) {
186	my $ds_ref = retrieve($cache_file);
187	if ($ds_ref) {
188	$log->debug("cache hit: $cache_file");
189	my $ok = 1;
190	foreach my $f (qw(current_filename headline)) {
191	if ($ds_ref->{$f}) {
192	$self->{$f} = $ds_ref->{$f};
193	} else {
194	$ok = 0;
195	}
196	};
197	if ($ok && $ds_ref->{'ds'}) {
198	return @{ $ds_ref->{'ds'} };
199	} else {
200	$log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!");
201	undef $self->{'cache_data_structure'};
202	}
203	}
204	}
205	}
206	}
207
208	undef $self->{'currnet_filename'};
209	undef $self->{'headline'};
210
211	my @sorted_tags;
212	if ($self->{tags_by_order}) {
213	@sorted_tags = @{$self->{tags_by_order}};
214	} else {
215	@sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
216	$self->{tags_by_order} = \@sorted_tags;
217	}
218
219	my @ds;
220
221	$log->debug("tags: ",sub { join(", ",@sorted_tags) });
222
223	foreach my $field (@sorted_tags) {
224
225	my $row;
226
227	#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
228
229	foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
230	my $format = $tag->{'value'} \|\| $tag->{'content'};
231
232	$log->debug("format: $format");
233
234	my @v;
235	# FIXME this is a cludge!
236	if ($format =~ /$WebPAC::Lookup::LOOKUP_REGEX/o) {
237	@v = $self->fill_in_to_arr($rec,$format);
238	} else {
239	@v = $self->parse_to_arr($rec,$format);
240	}
241	next if (! @v);
242
243	if ($tag->{'sort'}) {
244	@v = $self->sort_arr(@v);
245	}
246
247	# use format?
248	if ($tag->{'format_name'}) {
249	@v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
250	}
251
252	if ($field eq 'filename') {
253	$self->{'current_filename'} = join('',@v);
254	$log->debug("filename: ",$self->{'current_filename'});
255	} elsif ($field eq 'headline') {
256	$self->{'headline'} .= join('',@v);
257	$log->debug("headline: ",$self->{'headline'});
258	next; # don't return headline in data_structure!
259	}
260
261	# delimiter will join repeatable fields
262	if ($tag->{'delimiter'}) {
263	@v = ( join($tag->{'delimiter'}, @v) );
264	}
265
266	# default types
267	my @types = qw(display swish);
268	# override by type attribute
269	@types = ( $tag->{'type'} ) if ($tag->{'type'});
270
271	foreach my $type (@types) {
272	# append to previous line?
273	$log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} \|\| 'no append');
274	if ($tag->{'append'}) {
275
276	# I will delimit appended part with
277	# delimiter (or ,)
278	my $d = $tag->{'delimiter'};
279	# default delimiter
280	$d \|\|= " ";
281
282	my $last = pop @{$row->{$type}};
283	$d = "" if (! $last);
284	$last .= $d . join($d, @v);
285	push @{$row->{$type}}, $last;
286
287	} else {
288	push @{$row->{$type}}, @v;
289	}
290	}
291
292
293	}
294
295	if ($row) {
296	$row->{'tag'} = $field;
297
298	# TODO: name_sigular, name_plural
299	my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
300	$row->{'name'} = $name ? $self->_x($name) : $field;
301
302	# post-sort all values in field
303	if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
304	$log->warn("sort at field tag not implemented");
305	}
306
307	push @ds, $row;
308
309	$log->debug("row $field: ",sub { Dumper($row) });
310	}
311
312	}
313
314	if ($cache_file) {
315	store {
316	ds => \@ds,
317	current_filename => $self->{'current_filename'},
318	headline => $self->{'headline'},
319	}, $cache_file;
320	$log->debug("created storable cache file $cache_file");
321	}
322
323	return @ds;
324
325	}
326
327
328	=head1 AUTHOR
329
330	Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
331
332	=head1 COPYRIGHT & LICENSE
333
334	Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
335
336	This program is free software; you can redistribute it and/or modify it
337	under the same terms as Perl itself.
338
339	=cut
340
341	1; # End of WebPAC::Normalise::XML