--- trunk/lib/WebPAC/Normalize/XML.pm 2005/07/16 20:35:30 10 +++ trunk/lib/WebPAC/Normalize/XML.pm 2005/07/16 22:57:26 12 @@ -1,14 +1,16 @@ -package WebPAC::Normalise::XML; +package WebPAC::Normalize::XML; use warnings; use strict; use base qw/WebPAC::Common/; use Storable; +use XML::Simple; +use Data::Dumper; =head1 NAME -WebPAC::Normalise::XML - apply XML normalisaton rules +WebPAC::Normalize::XML - apply XML normalisaton rules =head1 VERSION @@ -23,41 +25,31 @@ This module uses C files to perform normalisation from input records - use WebPAC::Normalise::XML; - - my $foo = WebPAC::Normalise::XML->new(); - ... - =cut -# mapping between data type and tag which specify -# format in XML file -my %type2tag = ( - 'isis' => 'isis', -# 'excel' => 'column', -# 'marc' => 'marc', -# 'feed' => 'feed' -); - - -=head1 EXPORT - -A list of functions that can be exported. You can delete this section -if you don't export anything, such as for a purely object-oriented module. - =head1 FUNCTIONS =head2 new -Create new instance of WebPAC using configuration specified by C. +Read normalisation rules defined using XML from C and +parse it. my $n = new WebPAC::Normalize::XML( + tag => 'isis', + xml_file => '/path/to/conf/normalize/isis.xml', cache_data_structure => './cache/ds/', + lookup_regex => $lookup->regex, } +C defines tag to use within C + +C defines path to normalize XML. + Optional parameter C defines path to directory in which cache file for C call will be created. +Recommended parametar C specify ... + =cut sub new { @@ -67,47 +59,29 @@ $self->setup_cache_dir( $self->{'cache_data_structure'} ); - return $self; -} - -=head2 open_import_xml - -Read file from C directory and parse it. - - $webpac->open_import_xml(type => 'isis'); - -=cut - -sub open_import_xml { - my $self = shift; - my $log = $self->_get_logger(); - my $arg = {@_}; - $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'}); - - $self->{'type'} = $arg->{'type'}; - - my $type_base = $arg->{'type'}; - $type_base =~ s/_.*$//g; + foreach my $req (qw/tag xml_file/) { + $log->logconfess("need argument $req") unless $self->{$req}; + } - $self->{'tag'} = $type2tag{$type_base}; + my $f = - $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">"); + my $xml_file = $self->{'xml_file'}; - my $f = "./import_xml/".$self->{'type'}.".xml"; - $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f"); + $log->info("using $xml_file tag <",$self->{'tag'},">"); - $log->info("reading '$f'"); + $log->logdie("normalisation xml file '$xml_file' doesn't exist!") if (! -e $xml_file); - $self->{'import_xml_file'} = $f; + $self->{'import_xml_file'} = $xml_file; $self->{'import_xml'} = XMLin($f, - ForceArray => [ $self->{'tag'}, 'config', 'format' ], + ForceArray => [ $self->{'tag'}, $self->{'tags'}, 'config', 'format' ], ); $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) }); + return $self; } =head2 setup_cache_dir @@ -153,14 +127,18 @@ =head2 data_structure -Create in-memory data structure which represents layout from C. -It is used later to produce output. +Create in-memory data structure which represents normalized layout from +C. + +This structures are used to produce output. my @ds = $webpac->data_structure($rec); -This method will also set C<$webpac->{'currnet_filename'}> if there is - tag in C and C<$webpac->{'headline'}> if there is - tag. +B + +This method will also set C<< $webpac->{'currnet_filename'} >> if there is +C<< >> tag and C<< $webpac->{'headline'} >> if there is +C<< >> tag. =cut @@ -232,8 +210,7 @@ $log->debug("format: $format"); my @v; - # FIXME this is a cludge! - if ($format =~ /$WebPAC::Lookup::LOOKUP_REGEX/o) { + if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) { @v = $self->fill_in_to_arr($rec,$format); } else { @v = $self->parse_to_arr($rec,$format); @@ -324,6 +301,46 @@ } +=head2 apply_format + +Apply format specified in tag with C and +C. + + my $text = $webpac->apply_format($format_name,$format_delimiter,$data); + +Formats can contain C if you need them. + +=cut + +sub apply_format { + my $self = shift; + + my ($name,$delimiter,$data) = @_; + + my $log = $self->_get_logger(); + + if (! $self->{'import_xml'}->{'format'}->{$name}) { + $log->warn(" is not defined in ",$self->{'import_xml_file'}); + return $data; + } + + $log->warn("no delimiter for format $name") if (! $delimiter); + + my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'"); + + my @data = split(/\Q$delimiter\E/, $data); + + my $out = sprintf($format, @data); + $log->debug("using format $name [$format] on $data to produce: $out"); + + if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) { + return $self->lookup($out); + } else { + return $out; + } + +} + =head1 AUTHOR @@ -338,4 +355,4 @@ =cut -1; # End of WebPAC::Normalise::XML +1; # End of WebPAC::Normalize::XML