WebPAC/Normalize/XML.pm

package WebPAC::Normalize::XML;

use warnings;
use strict;

use base qw/WebPAC::Common/;
use Storable;
use XML::Simple;
use Data::Dumper;

=head1 NAME

WebPAC::Normalize::XML - apply XML normalisaton rules

=head1 VERSION

Version 0.01

=cut

our $VERSION = '0.01';

=head1 SYNOPSIS

This module uses C<conf/normalize/*.xml> files to perform normalisation
from input records

=cut

=head1 FUNCTIONS

=head2 new

Read normalisation rules defined using XML from C<conf/normalize/*.xml> and
parse it.

 my $n = new WebPAC::Normalize::XML(
        tag => 'isis',
        xml_file => '/path/to/conf/normalize/isis.xml',
        cache_data_structure => './cache/ds/',
        lookup_regex => $lookup->regex,
 }

C<tag> defines tag to use within C<xml_file>

C<xml_file> defines path to normalize XML.

Optional parameter C<cache_data_structure> defines path to directory
in which cache file for C<data_structure> call will be created.

Recommended parametar C<lookup_regex> specify ... 

=cut

sub new {
        my $class = shift;
        my $self = {@_};
        bless($self, $class);

        $self->setup_cache_dir( $self->{'cache_data_structure'} );

        my $log = $self->_get_logger();

        foreach my $req (qw/tag xml_file/) {
                $log->logconfess("need argument $req") unless $self->{$req};
        }

        my $f = 

        my $xml_file = $self->{'xml_file'};

        $log->info("using $xml_file tag <",$self->{'tag'},">");

        $log->logdie("normalisation xml file '$xml_file' doesn't exist!") if (! -e $xml_file);

        $self->{'import_xml_file'} = $xml_file;

        $self->{'import_xml'} = XMLin($f,
                ForceArray => [ $self->{'tag'}, $self->{'tags'}, 'config', 'format' ],
        );

        $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });

        return $self;
}

=head2 setup_cache_dir

Check if specified cache directory exist, and if not, disable caching.

 $setup_cache_dir('./cache/ds/');

If you pass false or zero value to this function, it will disable
cacheing.

=cut

sub setup_cache_dir {
        my $self = shift;

        my $dir = shift;

        my $log = $self->_get_logger();

        if ($dir) {
                my $msg;
                if (! -e $dir) {
                        $msg = "doesn't exist";
                } elsif (! -d $dir) {
                        $msg = "is not directory";
                } elsif (! -w $dir) {
                        $msg = "not writable";
                }

                if ($msg) {
                        undef $self->{'cache_data_structure'};
                        $log->warn("cache_data_structure $dir $msg, disabling...");
                } else {
                        $log->debug("using cache dir $dir");
                }
        } else {
                $log->debug("disabling cache");
                undef $self->{'cache_data_structure'};
        }
}


=head2 data_structure

Create in-memory data structure which represents normalized layout from
C<conf/normalize/*.xml>.

This structures are used to produce output.

 my @ds = $webpac->data_structure($rec);

B<Note: historical oddity follows>

This method will also set C<< $webpac->{'currnet_filename'} >> if there is
C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
C<< <headline> >> tag.

=cut

sub data_structure {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift;
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        my $cache_file;

        if (my $cache_path = $self->{'cache_data_structure'}) {
                my $id = $rec->{'000'};
                $id = $rec->{'000'}->[0] if ($id =~ m/^ARRAY/o);
                unless (defined($id)) {
                        $log->warn("Can't use cache_data_structure on records without unique identifier in field 000");
                        undef $self->{'cache_data_structure'};
                } else {
                        $cache_file = "$cache_path/$id";
                        if (-r $cache_file) {
                                my $ds_ref = retrieve($cache_file);
                                if ($ds_ref) {
                                        $log->debug("cache hit: $cache_file");
                                        my $ok = 1;
                                        foreach my $f (qw(current_filename headline)) {
                                                if ($ds_ref->{$f}) {
                                                        $self->{$f} = $ds_ref->{$f};
                                                } else {
                                                        $ok = 0;
                                                }
                                        };
                                        if ($ok && $ds_ref->{'ds'}) {
                                                return @{ $ds_ref->{'ds'} };
                                        } else {
                                                $log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!");
                                                undef $self->{'cache_data_structure'};
                                        }
                                }
                        }
                }
        }

        undef $self->{'currnet_filename'};
        undef $self->{'headline'};

        my @sorted_tags;
        if ($self->{tags_by_order}) {
                @sorted_tags = @{$self->{tags_by_order}};
        } else {
                @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
                $self->{tags_by_order} = \@sorted_tags;
        }

        my @ds;

        $log->debug("tags: ",sub { join(", ",@sorted_tags) });

        foreach my $field (@sorted_tags) {

                my $row;

#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});

                foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
                        my $format = $tag->{'value'} || $tag->{'content'};

                        $log->debug("format: $format");

                        my @v;
                        if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
                                @v = $self->fill_in_to_arr($rec,$format);
                        } else {
                                @v = $self->parse_to_arr($rec,$format);
                        }
                        next if (! @v);

                        if ($tag->{'sort'}) {
                                @v = $self->sort_arr(@v);
                        }

                        # use format?
                        if ($tag->{'format_name'}) {
                                @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
                        }

                        if ($field eq 'filename') {
                                $self->{'current_filename'} = join('',@v);
                                $log->debug("filename: ",$self->{'current_filename'});
                        } elsif ($field eq 'headline') {
                                $self->{'headline'} .= join('',@v);
                                $log->debug("headline: ",$self->{'headline'});
                                next; # don't return headline in data_structure!
                        }

                        # delimiter will join repeatable fields
                        if ($tag->{'delimiter'}) {
                                @v = ( join($tag->{'delimiter'}, @v) );
                        }

                        # default types 
                        my @types = qw(display swish);
                        # override by type attribute
                        @types = ( $tag->{'type'} ) if ($tag->{'type'});

                        foreach my $type (@types) {
                                # append to previous line?
                                $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
                                if ($tag->{'append'}) {

                                        # I will delimit appended part with
                                        # delimiter (or ,)
                                        my $d = $tag->{'delimiter'};
                                        # default delimiter
                                        $d ||= " ";

                                        my $last = pop @{$row->{$type}};
                                        $d = "" if (! $last);
                                        $last .= $d . join($d, @v);
                                        push @{$row->{$type}}, $last;

                                } else {
                                        push @{$row->{$type}}, @v;
                                }
                        }


                }

                if ($row) {
                        $row->{'tag'} = $field;

                        # TODO: name_sigular, name_plural
                        my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
                        $row->{'name'} = $name ? $self->_x($name) : $field;

                        # post-sort all values in field
                        if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
                                $log->warn("sort at field tag not implemented");
                        }

                        push @ds, $row;

                        $log->debug("row $field: ",sub { Dumper($row) });
                }

        }

        if ($cache_file) {
                store {
                        ds => \@ds,
                        current_filename => $self->{'current_filename'},
                        headline => $self->{'headline'},
                }, $cache_file;
                $log->debug("created storable cache file $cache_file");
        }

        return @ds;

}

=head2 apply_format

Apply format specified in tag with C<format_name="name"> and
C<format_delimiter=";;">.

 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);

Formats can contain C<lookup{...}> if you need them.

=cut

sub apply_format {
        my $self = shift;

        my ($name,$delimiter,$data) = @_;

        my $log = $self->_get_logger();

        if (! $self->{'import_xml'}->{'format'}->{$name}) {
                $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
                return $data;
        }

        $log->warn("no delimiter for format $name") if (! $delimiter);

        my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");

        my @data = split(/\Q$delimiter\E/, $data);

        my $out = sprintf($format, @data);
        $log->debug("using format $name [$format] on $data to produce: $out");

        if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
                return $self->lookup($out);
        } else {
                return $out;
        }

}


=head1 AUTHOR

Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>

=head1 COPYRIGHT & LICENSE

Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

1; # End of WebPAC::Normalize::XML
1	dpavlin	12	package WebPAC::Normalize::XML;
2	dpavlin	8
3			use warnings;
4			use strict;
5
6			use base qw/WebPAC::Common/;
7	dpavlin	10	use Storable;
8	dpavlin	12	use XML::Simple;
9			use Data::Dumper;
10	dpavlin	8
11			=head1 NAME
12
13	dpavlin	12	WebPAC::Normalize::XML - apply XML normalisaton rules
14	dpavlin	8
15			=head1 VERSION
16
17			Version 0.01
18
19			=cut
20
21			our $VERSION = '0.01';
22
23			=head1 SYNOPSIS
24
25			This module uses C<conf/normalize/*.xml> files to perform normalisation
26			from input records
27
28			=cut
29
30			=head1 FUNCTIONS
31
32			=head2 new
33
34	dpavlin	12	Read normalisation rules defined using XML from C<conf/normalize/*.xml> and
35			parse it.
36	dpavlin	8
37			my $n = new WebPAC::Normalize::XML(
38	dpavlin	12	tag => 'isis',
39			xml_file => '/path/to/conf/normalize/isis.xml',
40	dpavlin	8	cache_data_structure => './cache/ds/',
41	dpavlin	12	lookup_regex => $lookup->regex,
42	dpavlin	8	}
43
44	dpavlin	12	C<tag> defines tag to use within C<xml_file>
45
46			C<xml_file> defines path to normalize XML.
47
48	dpavlin	8	Optional parameter C<cache_data_structure> defines path to directory
49			in which cache file for C<data_structure> call will be created.
50
51	dpavlin	12	Recommended parametar C<lookup_regex> specify ...
52
53	dpavlin	8	=cut
54
55			sub new {
56			my $class = shift;
57			my $self = {@_};
58			bless($self, $class);
59
60			$self->setup_cache_dir( $self->{'cache_data_structure'} );
61
62			my $log = $self->_get_logger();
63
64	dpavlin	12	foreach my $req (qw/tag xml_file/) {
65			$log->logconfess("need argument $req") unless $self->{$req};
66			}
67	dpavlin	8
68	dpavlin	12	my $f =
69	dpavlin	8
70	dpavlin	12	my $xml_file = $self->{'xml_file'};
71	dpavlin	8
72	dpavlin	12	$log->info("using $xml_file tag <",$self->{'tag'},">");
73	dpavlin	8
74	dpavlin	12	$log->logdie("normalisation xml file '$xml_file' doesn't exist!") if (! -e $xml_file);
75	dpavlin	8
76	dpavlin	12	$self->{'import_xml_file'} = $xml_file;
77	dpavlin	8
78			$self->{'import_xml'} = XMLin($f,
79	dpavlin	12	ForceArray => [ $self->{'tag'}, $self->{'tags'}, 'config', 'format' ],
80	dpavlin	8	);
81
82			$log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
83
84	dpavlin	12	return $self;
85	dpavlin	8	}
86
87			=head2 setup_cache_dir
88
89			Check if specified cache directory exist, and if not, disable caching.
90
91			$setup_cache_dir('./cache/ds/');
92
93			If you pass false or zero value to this function, it will disable
94			cacheing.
95
96			=cut
97
98			sub setup_cache_dir {
99			my $self = shift;
100
101			my $dir = shift;
102
103			my $log = $self->_get_logger();
104
105			if ($dir) {
106			my $msg;
107			if (! -e $dir) {
108			$msg = "doesn't exist";
109			} elsif (! -d $dir) {
110			$msg = "is not directory";
111			} elsif (! -w $dir) {
112			$msg = "not writable";
113			}
114
115			if ($msg) {
116			undef $self->{'cache_data_structure'};
117			$log->warn("cache_data_structure $dir $msg, disabling...");
118			} else {
119			$log->debug("using cache dir $dir");
120			}
121			} else {
122			$log->debug("disabling cache");
123			undef $self->{'cache_data_structure'};
124			}
125			}
126
127
128			=head2 data_structure
129
130	dpavlin	12	Create in-memory data structure which represents normalized layout from
131			C<conf/normalize/*.xml>.
132	dpavlin	8
133	dpavlin	12	This structures are used to produce output.
134
135	dpavlin	8	my @ds = $webpac->data_structure($rec);
136
137	dpavlin	12	B<Note: historical oddity follows>
138	dpavlin	8
139	dpavlin	12	This method will also set C<< $webpac->{'currnet_filename'} >> if there is
140			C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
141			C<< <headline> >> tag.
142
143	dpavlin	8	=cut
144
145			sub data_structure {
146			my $self = shift;
147
148			my $log = $self->_get_logger();
149
150			my $rec = shift;
151			$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
152
153			my $cache_file;
154
155			if (my $cache_path = $self->{'cache_data_structure'}) {
156			my $id = $rec->{'000'};
157			$id = $rec->{'000'}->[0] if ($id =~ m/^ARRAY/o);
158			unless (defined($id)) {
159			$log->warn("Can't use cache_data_structure on records without unique identifier in field 000");
160			undef $self->{'cache_data_structure'};
161			} else {
162			$cache_file = "$cache_path/$id";
163			if (-r $cache_file) {
164			my $ds_ref = retrieve($cache_file);
165			if ($ds_ref) {
166			$log->debug("cache hit: $cache_file");
167			my $ok = 1;
168			foreach my $f (qw(current_filename headline)) {
169			if ($ds_ref->{$f}) {
170			$self->{$f} = $ds_ref->{$f};
171			} else {
172			$ok = 0;
173			}
174			};
175			if ($ok && $ds_ref->{'ds'}) {
176			return @{ $ds_ref->{'ds'} };
177			} else {
178			$log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!");
179			undef $self->{'cache_data_structure'};
180			}
181			}
182			}
183			}
184			}
185
186			undef $self->{'currnet_filename'};
187			undef $self->{'headline'};
188
189			my @sorted_tags;
190			if ($self->{tags_by_order}) {
191			@sorted_tags = @{$self->{tags_by_order}};
192			} else {
193			@sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
194			$self->{tags_by_order} = \@sorted_tags;
195			}
196
197			my @ds;
198
199			$log->debug("tags: ",sub { join(", ",@sorted_tags) });
200
201			foreach my $field (@sorted_tags) {
202
203			my $row;
204
205			#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
206
207			foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
208			my $format = $tag->{'value'} \|\| $tag->{'content'};
209
210			$log->debug("format: $format");
211
212			my @v;
213	dpavlin	12	if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
214	dpavlin	8	@v = $self->fill_in_to_arr($rec,$format);
215			} else {
216			@v = $self->parse_to_arr($rec,$format);
217			}
218			next if (! @v);
219
220			if ($tag->{'sort'}) {
221			@v = $self->sort_arr(@v);
222			}
223
224			# use format?
225			if ($tag->{'format_name'}) {
226			@v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
227			}
228
229			if ($field eq 'filename') {
230			$self->{'current_filename'} = join('',@v);
231			$log->debug("filename: ",$self->{'current_filename'});
232			} elsif ($field eq 'headline') {
233			$self->{'headline'} .= join('',@v);
234			$log->debug("headline: ",$self->{'headline'});
235			next; # don't return headline in data_structure!
236			}
237
238			# delimiter will join repeatable fields
239			if ($tag->{'delimiter'}) {
240			@v = ( join($tag->{'delimiter'}, @v) );
241			}
242
243			# default types
244			my @types = qw(display swish);
245			# override by type attribute
246			@types = ( $tag->{'type'} ) if ($tag->{'type'});
247
248			foreach my $type (@types) {
249			# append to previous line?
250			$log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} \|\| 'no append');
251			if ($tag->{'append'}) {
252
253			# I will delimit appended part with
254			# delimiter (or ,)
255			my $d = $tag->{'delimiter'};
256			# default delimiter
257			$d \|\|= " ";
258
259			my $last = pop @{$row->{$type}};
260			$d = "" if (! $last);
261			$last .= $d . join($d, @v);
262			push @{$row->{$type}}, $last;
263
264			} else {
265			push @{$row->{$type}}, @v;
266			}
267			}
268
269
270			}
271
272			if ($row) {
273			$row->{'tag'} = $field;
274
275			# TODO: name_sigular, name_plural
276			my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
277			$row->{'name'} = $name ? $self->_x($name) : $field;
278
279			# post-sort all values in field
280			if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
281			$log->warn("sort at field tag not implemented");
282			}
283
284			push @ds, $row;
285
286			$log->debug("row $field: ",sub { Dumper($row) });
287			}
288
289			}
290
291			if ($cache_file) {
292			store {
293			ds => \@ds,
294			current_filename => $self->{'current_filename'},
295			headline => $self->{'headline'},
296			}, $cache_file;
297			$log->debug("created storable cache file $cache_file");
298			}
299
300			return @ds;
301
302			}
303
304	dpavlin	12	=head2 apply_format
305	dpavlin	8
306	dpavlin	12	Apply format specified in tag with C<format_name="name"> and
307			C<format_delimiter=";;">.
308
309			my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
310
311			Formats can contain C<lookup{...}> if you need them.
312
313			=cut
314
315			sub apply_format {
316			my $self = shift;
317
318			my ($name,$delimiter,$data) = @_;
319
320			my $log = $self->_get_logger();
321
322			if (! $self->{'import_xml'}->{'format'}->{$name}) {
323			$log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
324			return $data;
325			}
326
327			$log->warn("no delimiter for format $name") if (! $delimiter);
328
329			my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) \|\| $log->logdie("can't find format '$name'");
330
331			my @data = split(/\Q$delimiter\E/, $data);
332
333			my $out = sprintf($format, @data);
334			$log->debug("using format $name [$format] on $data to produce: $out");
335
336			if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
337			return $self->lookup($out);
338			} else {
339			return $out;
340			}
341
342			}
343
344
345	dpavlin	8	=head1 AUTHOR
346
347			Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
348
349			=head1 COPYRIGHT & LICENSE
350
351			Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
352
353			This program is free software; you can redistribute it and/or modify it
354			under the same terms as Perl itself.
355
356			=cut
357
358	dpavlin	12	1; # End of WebPAC::Normalize::XML