WebPAC/Normalize/XML.pm

package WebPAC::Normalise::XML;

use warnings;
use strict;

use base qw/WebPAC::Common/;

=head1 NAME

WebPAC::Normalise::XML - apply XML normalisaton rules

=head1 VERSION

Version 0.01

=cut

our $VERSION = '0.01';

=head1 SYNOPSIS

This module uses C<conf/normalize/*.xml> files to perform normalisation
from input records

    use WebPAC::Normalise::XML;

    my $foo = WebPAC::Normalise::XML->new();
    ...

=cut

# mapping between data type and tag which specify
# format in XML file
my %type2tag = (
        'isis' => 'isis',
#       'excel' => 'column',
#       'marc' => 'marc',
#       'feed' => 'feed'
);


=head1 EXPORT

A list of functions that can be exported.  You can delete this section
if you don't export anything, such as for a purely object-oriented module.

=head1 FUNCTIONS

=head2 new

Create new instance of WebPAC using configuration specified by C<config_file>.

 my $n = new WebPAC::Normalize::XML(
        cache_data_structure => './cache/ds/',
 }

Optional parameter C<cache_data_structure> defines path to directory
in which cache file for C<data_structure> call will be created.

=cut

sub new {
        my $class = shift;
        my $self = {@_};
        bless($self, $class);

        $self->setup_cache_dir( $self->{'cache_data_structure'} );

        return $self;
}

=head2 open_import_xml

Read file from C<import_xml/> directory and parse it.

 $webpac->open_import_xml(type => 'isis');

=cut

sub open_import_xml {
        my $self = shift;

        my $log = $self->_get_logger();

        my $arg = {@_};
        $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});

        $self->{'type'} = $arg->{'type'};

        my $type_base = $arg->{'type'};
        $type_base =~ s/_.*$//g;

        $self->{'tag'} = $type2tag{$type_base};

        $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");

        my $f = "./import_xml/".$self->{'type'}.".xml";
        $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");

        $log->info("reading '$f'");

        $self->{'import_xml_file'} = $f;

        $self->{'import_xml'} = XMLin($f,
                ForceArray => [ $self->{'tag'}, 'config', 'format' ],
        );

        $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });

}

=head2 setup_cache_dir

Check if specified cache directory exist, and if not, disable caching.

 $setup_cache_dir('./cache/ds/');

If you pass false or zero value to this function, it will disable
cacheing.

=cut

sub setup_cache_dir {
        my $self = shift;

        my $dir = shift;

        my $log = $self->_get_logger();

        if ($dir) {
                my $msg;
                if (! -e $dir) {
                        $msg = "doesn't exist";
                } elsif (! -d $dir) {
                        $msg = "is not directory";
                } elsif (! -w $dir) {
                        $msg = "not writable";
                }

                if ($msg) {
                        undef $self->{'cache_data_structure'};
                        $log->warn("cache_data_structure $dir $msg, disabling...");
                } else {
                        $log->debug("using cache dir $dir");
                }
        } else {
                $log->debug("disabling cache");
                undef $self->{'cache_data_structure'};
        }
}


=head2 data_structure

Create in-memory data structure which represents layout from C<import_xml>.
It is used later to produce output.

 my @ds = $webpac->data_structure($rec);

This method will also set C<$webpac->{'currnet_filename'}> if there is
<filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
<headline> tag.

=cut

sub data_structure {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift;
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        my $cache_file;

        if (my $cache_path = $self->{'cache_data_structure'}) {
                my $id = $rec->{'000'};
                $id = $rec->{'000'}->[0] if ($id =~ m/^ARRAY/o);
                unless (defined($id)) {
                        $log->warn("Can't use cache_data_structure on records without unique identifier in field 000");
                        undef $self->{'cache_data_structure'};
                } else {
                        $cache_file = "$cache_path/$id";
                        if (-r $cache_file) {
                                my $ds_ref = retrieve($cache_file);
                                if ($ds_ref) {
                                        $log->debug("cache hit: $cache_file");
                                        my $ok = 1;
                                        foreach my $f (qw(current_filename headline)) {
                                                if ($ds_ref->{$f}) {
                                                        $self->{$f} = $ds_ref->{$f};
                                                } else {
                                                        $ok = 0;
                                                }
                                        };
                                        if ($ok && $ds_ref->{'ds'}) {
                                                return @{ $ds_ref->{'ds'} };
                                        } else {
                                                $log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!");
                                                undef $self->{'cache_data_structure'};
                                        }
                                }
                        }
                }
        }

        undef $self->{'currnet_filename'};
        undef $self->{'headline'};

        my @sorted_tags;
        if ($self->{tags_by_order}) {
                @sorted_tags = @{$self->{tags_by_order}};
        } else {
                @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
                $self->{tags_by_order} = \@sorted_tags;
        }

        my @ds;

        $log->debug("tags: ",sub { join(", ",@sorted_tags) });

        foreach my $field (@sorted_tags) {

                my $row;

#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});

                foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
                        my $format = $tag->{'value'} || $tag->{'content'};

                        $log->debug("format: $format");

                        my @v;
                        if ($format =~ /$LOOKUP_REGEX/o) {
                                @v = $self->fill_in_to_arr($rec,$format);
                        } else {
                                @v = $self->parse_to_arr($rec,$format);
                        }
                        next if (! @v);

                        if ($tag->{'sort'}) {
                                @v = $self->sort_arr(@v);
                        }

                        # use format?
                        if ($tag->{'format_name'}) {
                                @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
                        }

                        if ($field eq 'filename') {
                                $self->{'current_filename'} = join('',@v);
                                $log->debug("filename: ",$self->{'current_filename'});
                        } elsif ($field eq 'headline') {
                                $self->{'headline'} .= join('',@v);
                                $log->debug("headline: ",$self->{'headline'});
                                next; # don't return headline in data_structure!
                        }

                        # delimiter will join repeatable fields
                        if ($tag->{'delimiter'}) {
                                @v = ( join($tag->{'delimiter'}, @v) );
                        }

                        # default types 
                        my @types = qw(display swish);
                        # override by type attribute
                        @types = ( $tag->{'type'} ) if ($tag->{'type'});

                        foreach my $type (@types) {
                                # append to previous line?
                                $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
                                if ($tag->{'append'}) {

                                        # I will delimit appended part with
                                        # delimiter (or ,)
                                        my $d = $tag->{'delimiter'};
                                        # default delimiter
                                        $d ||= " ";

                                        my $last = pop @{$row->{$type}};
                                        $d = "" if (! $last);
                                        $last .= $d . join($d, @v);
                                        push @{$row->{$type}}, $last;

                                } else {
                                        push @{$row->{$type}}, @v;
                                }
                        }


                }

                if ($row) {
                        $row->{'tag'} = $field;

                        # TODO: name_sigular, name_plural
                        my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
                        $row->{'name'} = $name ? $self->_x($name) : $field;

                        # post-sort all values in field
                        if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
                                $log->warn("sort at field tag not implemented");
                        }

                        push @ds, $row;

                        $log->debug("row $field: ",sub { Dumper($row) });
                }

        }

        if ($cache_file) {
                store {
                        ds => \@ds,
                        current_filename => $self->{'current_filename'},
                        headline => $self->{'headline'},
                }, $cache_file;
                $log->debug("created storable cache file $cache_file");
        }

        return @ds;

}


=head1 AUTHOR

Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>

=head1 COPYRIGHT & LICENSE

Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

1; # End of WebPAC::Normalise::XML
1	package WebPAC::Normalise::XML;
2
3	use warnings;
4	use strict;
5
6	use base qw/WebPAC::Common/;
7
8	=head1 NAME
9
10	WebPAC::Normalise::XML - apply XML normalisaton rules
11
12	=head1 VERSION
13
14	Version 0.01
15
16	=cut
17
18	our $VERSION = '0.01';
19
20	=head1 SYNOPSIS
21
22	This module uses C<conf/normalize/*.xml> files to perform normalisation
23	from input records
24
25	use WebPAC::Normalise::XML;
26
27	my $foo = WebPAC::Normalise::XML->new();
28	...
29
30	=cut
31
32	# mapping between data type and tag which specify
33	# format in XML file
34	my %type2tag = (
35	'isis' => 'isis',
36	# 'excel' => 'column',
37	# 'marc' => 'marc',
38	# 'feed' => 'feed'
39	);
40
41
42	=head1 EXPORT
43
44	A list of functions that can be exported. You can delete this section
45	if you don't export anything, such as for a purely object-oriented module.
46
47	=head1 FUNCTIONS
48
49	=head2 new
50
51	Create new instance of WebPAC using configuration specified by C<config_file>.
52
53	my $n = new WebPAC::Normalize::XML(
54	cache_data_structure => './cache/ds/',
55	}
56
57	Optional parameter C<cache_data_structure> defines path to directory
58	in which cache file for C<data_structure> call will be created.
59
60	=cut
61
62	sub new {
63	my $class = shift;
64	my $self = {@_};
65	bless($self, $class);
66
67	$self->setup_cache_dir( $self->{'cache_data_structure'} );
68
69	return $self;
70	}
71
72	=head2 open_import_xml
73
74	Read file from C<import_xml/> directory and parse it.
75
76	$webpac->open_import_xml(type => 'isis');
77
78	=cut
79
80	sub open_import_xml {
81	my $self = shift;
82
83	my $log = $self->_get_logger();
84
85	my $arg = {@_};
86	$log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
87
88	$self->{'type'} = $arg->{'type'};
89
90	my $type_base = $arg->{'type'};
91	$type_base =~ s/_.*$//g;
92
93	$self->{'tag'} = $type2tag{$type_base};
94
95	$log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
96
97	my $f = "./import_xml/".$self->{'type'}.".xml";
98	$log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
99
100	$log->info("reading '$f'");
101
102	$self->{'import_xml_file'} = $f;
103
104	$self->{'import_xml'} = XMLin($f,
105	ForceArray => [ $self->{'tag'}, 'config', 'format' ],
106	);
107
108	$log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
109
110	}
111
112	=head2 setup_cache_dir
113
114	Check if specified cache directory exist, and if not, disable caching.
115
116	$setup_cache_dir('./cache/ds/');
117
118	If you pass false or zero value to this function, it will disable
119	cacheing.
120
121	=cut
122
123	sub setup_cache_dir {
124	my $self = shift;
125
126	my $dir = shift;
127
128	my $log = $self->_get_logger();
129
130	if ($dir) {
131	my $msg;
132	if (! -e $dir) {
133	$msg = "doesn't exist";
134	} elsif (! -d $dir) {
135	$msg = "is not directory";
136	} elsif (! -w $dir) {
137	$msg = "not writable";
138	}
139
140	if ($msg) {
141	undef $self->{'cache_data_structure'};
142	$log->warn("cache_data_structure $dir $msg, disabling...");
143	} else {
144	$log->debug("using cache dir $dir");
145	}
146	} else {
147	$log->debug("disabling cache");
148	undef $self->{'cache_data_structure'};
149	}
150	}
151
152
153	=head2 data_structure
154
155	Create in-memory data structure which represents layout from C<import_xml>.
156	It is used later to produce output.
157
158	my @ds = $webpac->data_structure($rec);
159
160	This method will also set C<$webpac->{'currnet_filename'}> if there is
161	<filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
162	<headline> tag.
163
164	=cut
165
166	sub data_structure {
167	my $self = shift;
168
169	my $log = $self->_get_logger();
170
171	my $rec = shift;
172	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
173
174	my $cache_file;
175
176	if (my $cache_path = $self->{'cache_data_structure'}) {
177	my $id = $rec->{'000'};
178	$id = $rec->{'000'}->[0] if ($id =~ m/^ARRAY/o);
179	unless (defined($id)) {
180	$log->warn("Can't use cache_data_structure on records without unique identifier in field 000");
181	undef $self->{'cache_data_structure'};
182	} else {
183	$cache_file = "$cache_path/$id";
184	if (-r $cache_file) {
185	my $ds_ref = retrieve($cache_file);
186	if ($ds_ref) {
187	$log->debug("cache hit: $cache_file");
188	my $ok = 1;
189	foreach my $f (qw(current_filename headline)) {
190	if ($ds_ref->{$f}) {
191	$self->{$f} = $ds_ref->{$f};
192	} else {
193	$ok = 0;
194	}
195	};
196	if ($ok && $ds_ref->{'ds'}) {
197	return @{ $ds_ref->{'ds'} };
198	} else {
199	$log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!");
200	undef $self->{'cache_data_structure'};
201	}
202	}
203	}
204	}
205	}
206
207	undef $self->{'currnet_filename'};
208	undef $self->{'headline'};
209
210	my @sorted_tags;
211	if ($self->{tags_by_order}) {
212	@sorted_tags = @{$self->{tags_by_order}};
213	} else {
214	@sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
215	$self->{tags_by_order} = \@sorted_tags;
216	}
217
218	my @ds;
219
220	$log->debug("tags: ",sub { join(", ",@sorted_tags) });
221
222	foreach my $field (@sorted_tags) {
223
224	my $row;
225
226	#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
227
228	foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
229	my $format = $tag->{'value'} \|\| $tag->{'content'};
230
231	$log->debug("format: $format");
232
233	my @v;
234	if ($format =~ /$LOOKUP_REGEX/o) {
235	@v = $self->fill_in_to_arr($rec,$format);
236	} else {
237	@v = $self->parse_to_arr($rec,$format);
238	}
239	next if (! @v);
240
241	if ($tag->{'sort'}) {
242	@v = $self->sort_arr(@v);
243	}
244
245	# use format?
246	if ($tag->{'format_name'}) {
247	@v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
248	}
249
250	if ($field eq 'filename') {
251	$self->{'current_filename'} = join('',@v);
252	$log->debug("filename: ",$self->{'current_filename'});
253	} elsif ($field eq 'headline') {
254	$self->{'headline'} .= join('',@v);
255	$log->debug("headline: ",$self->{'headline'});
256	next; # don't return headline in data_structure!
257	}
258
259	# delimiter will join repeatable fields
260	if ($tag->{'delimiter'}) {
261	@v = ( join($tag->{'delimiter'}, @v) );
262	}
263
264	# default types
265	my @types = qw(display swish);
266	# override by type attribute
267	@types = ( $tag->{'type'} ) if ($tag->{'type'});
268
269	foreach my $type (@types) {
270	# append to previous line?
271	$log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} \|\| 'no append');
272	if ($tag->{'append'}) {
273
274	# I will delimit appended part with
275	# delimiter (or ,)
276	my $d = $tag->{'delimiter'};
277	# default delimiter
278	$d \|\|= " ";
279
280	my $last = pop @{$row->{$type}};
281	$d = "" if (! $last);
282	$last .= $d . join($d, @v);
283	push @{$row->{$type}}, $last;
284
285	} else {
286	push @{$row->{$type}}, @v;
287	}
288	}
289
290
291	}
292
293	if ($row) {
294	$row->{'tag'} = $field;
295
296	# TODO: name_sigular, name_plural
297	my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
298	$row->{'name'} = $name ? $self->_x($name) : $field;
299
300	# post-sort all values in field
301	if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
302	$log->warn("sort at field tag not implemented");
303	}
304
305	push @ds, $row;
306
307	$log->debug("row $field: ",sub { Dumper($row) });
308	}
309
310	}
311
312	if ($cache_file) {
313	store {
314	ds => \@ds,
315	current_filename => $self->{'current_filename'},
316	headline => $self->{'headline'},
317	}, $cache_file;
318	$log->debug("created storable cache file $cache_file");
319	}
320
321	return @ds;
322
323	}
324
325
326	=head1 AUTHOR
327
328	Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
329
330	=head1 COPYRIGHT & LICENSE
331
332	Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
333
334	This program is free software; you can redistribute it and/or modify it
335	under the same terms as Perl itself.
336
337	=cut
338
339	1; # End of WebPAC::Normalise::XML