--- trunk/lib/WebPAC/Normalize/XML.pm 2005/07/16 16:48:35 8 +++ trunk/lib/WebPAC/Normalize/XML.pm 2005/11/12 21:21:50 38 @@ -1,13 +1,16 @@ -package WebPAC::Normalise::XML; +package WebPAC::Normalize::XML; use warnings; use strict; -use base qw/WebPAC::Common/; +use base qw/WebPAC::Common WebPAC::Normalize/; +use XML::Simple; +use Data::Dumper; +use Text::Iconv; =head1 NAME -WebPAC::Normalise::XML - apply XML normalisaton rules +WebPAC::Normalize::XML - apply XML normalisaton rules =head1 VERSION @@ -22,304 +25,78 @@ This module uses C files to perform normalisation from input records - use WebPAC::Normalise::XML; - - my $foo = WebPAC::Normalise::XML->new(); - ... - =cut -# mapping between data type and tag which specify -# format in XML file -my %type2tag = ( - 'isis' => 'isis', -# 'excel' => 'column', -# 'marc' => 'marc', -# 'feed' => 'feed' -); - - -=head1 EXPORT - -A list of functions that can be exported. You can delete this section -if you don't export anything, such as for a purely object-oriented module. - =head1 FUNCTIONS -=head2 new - -Create new instance of WebPAC using configuration specified by C. - - my $n = new WebPAC::Normalize::XML( - cache_data_structure => './cache/ds/', - } - -Optional parameter C defines path to directory -in which cache file for C call will be created. +=head2 open -=cut - -sub new { - my $class = shift; - my $self = {@_}; - bless($self, $class); - - $self->setup_cache_dir( $self->{'cache_data_structure'} ); +Read normalisation rules defined using XML from C and +parse it. - return $self; -} + my $n = new WebPAC::Normalize::XML; + $n->open( + tag => 'isis', + xml_file => '/path/to/conf/normalize/isis.xml', + ); -=head2 open_import_xml +C defines tag to use within C -Read file from C directory and parse it. - - $webpac->open_import_xml(type => 'isis'); +C defines path to normalize XML. =cut -sub open_import_xml { +sub open { my $self = shift; - my $log = $self->_get_logger(); - my $arg = {@_}; - $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'}); - - $self->{'type'} = $arg->{'type'}; - my $type_base = $arg->{'type'}; - $type_base =~ s/_.*$//g; + my $log = $self->_get_logger(); - $self->{'tag'} = $type2tag{$type_base}; + foreach my $req (qw/tag xml_file/) { + $log->logconfess("need argument $req") unless $arg->{$req}; + } - $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">"); + $self->{'tag'} = $arg->{'tag'}; + my $xml_file = $arg->{'xml_file'}; - my $f = "./import_xml/".$self->{'type'}.".xml"; - $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f"); + $log->info("using $xml_file tag <",$self->{'tag'},">"); - $log->info("reading '$f'"); + $log->logdie("normalisation xml file '$xml_file' doesn't exist!") if (! -e $xml_file); - $self->{'import_xml_file'} = $f; + $self->{'import_xml_file'} = $xml_file; - $self->{'import_xml'} = XMLin($f, - ForceArray => [ $self->{'tag'}, 'config', 'format' ], + $self->{'import_xml'} = XMLin($xml_file, + ForceArray => [ $self->{'tag'}, $self->{'tags'}, 'config', 'format' ], + ForceContent => 1, ); $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) }); -} - -=head2 setup_cache_dir - -Check if specified cache directory exist, and if not, disable caching. - - $setup_cache_dir('./cache/ds/'); - -If you pass false or zero value to this function, it will disable -cacheing. - -=cut - -sub setup_cache_dir { - my $self = shift; - - my $dir = shift; - - my $log = $self->_get_logger(); - - if ($dir) { - my $msg; - if (! -e $dir) { - $msg = "doesn't exist"; - } elsif (! -d $dir) { - $msg = "is not directory"; - } elsif (! -w $dir) { - $msg = "not writable"; - } - - if ($msg) { - undef $self->{'cache_data_structure'}; - $log->warn("cache_data_structure $dir $msg, disabling..."); - } else { - $log->debug("using cache dir $dir"); - } - } else { - $log->debug("disabling cache"); - undef $self->{'cache_data_structure'}; - } + return $self; } -=head2 data_structure +=head2 _x -Create in-memory data structure which represents layout from C. -It is used later to produce output. +Convert string from XML UTF-8 encoding to code page defined in C. - my @ds = $webpac->data_structure($rec); + my $text = $n->_x('utf8 text'); -This method will also set C<$webpac->{'currnet_filename'}> if there is - tag in C and C<$webpac->{'headline'}> if there is - tag. +Default application code page is C. You will probably want to +change that when creating new instance of object based on this one. =cut -sub data_structure { +sub _x { my $self = shift; + my $utf8 = shift || return; - my $log = $self->_get_logger(); - - my $rec = shift; - $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); - - my $cache_file; - - if (my $cache_path = $self->{'cache_data_structure'}) { - my $id = $rec->{'000'}; - $id = $rec->{'000'}->[0] if ($id =~ m/^ARRAY/o); - unless (defined($id)) { - $log->warn("Can't use cache_data_structure on records without unique identifier in field 000"); - undef $self->{'cache_data_structure'}; - } else { - $cache_file = "$cache_path/$id"; - if (-r $cache_file) { - my $ds_ref = retrieve($cache_file); - if ($ds_ref) { - $log->debug("cache hit: $cache_file"); - my $ok = 1; - foreach my $f (qw(current_filename headline)) { - if ($ds_ref->{$f}) { - $self->{$f} = $ds_ref->{$f}; - } else { - $ok = 0; - } - }; - if ($ok && $ds_ref->{'ds'}) { - return @{ $ds_ref->{'ds'} }; - } else { - $log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!"); - undef $self->{'cache_data_structure'}; - } - } - } - } - } - - undef $self->{'currnet_filename'}; - undef $self->{'headline'}; - - my @sorted_tags; - if ($self->{tags_by_order}) { - @sorted_tags = @{$self->{tags_by_order}}; - } else { - @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}}; - $self->{tags_by_order} = \@sorted_tags; - } - - my @ds; - - $log->debug("tags: ",sub { join(", ",@sorted_tags) }); - - foreach my $field (@sorted_tags) { - - my $row; - -#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}); - - foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) { - my $format = $tag->{'value'} || $tag->{'content'}; - - $log->debug("format: $format"); - - my @v; - if ($format =~ /$LOOKUP_REGEX/o) { - @v = $self->fill_in_to_arr($rec,$format); - } else { - @v = $self->parse_to_arr($rec,$format); - } - next if (! @v); - - if ($tag->{'sort'}) { - @v = $self->sort_arr(@v); - } - - # use format? - if ($tag->{'format_name'}) { - @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v; - } - - if ($field eq 'filename') { - $self->{'current_filename'} = join('',@v); - $log->debug("filename: ",$self->{'current_filename'}); - } elsif ($field eq 'headline') { - $self->{'headline'} .= join('',@v); - $log->debug("headline: ",$self->{'headline'}); - next; # don't return headline in data_structure! - } - - # delimiter will join repeatable fields - if ($tag->{'delimiter'}) { - @v = ( join($tag->{'delimiter'}, @v) ); - } - - # default types - my @types = qw(display swish); - # override by type attribute - @types = ( $tag->{'type'} ) if ($tag->{'type'}); - - foreach my $type (@types) { - # append to previous line? - $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append'); - if ($tag->{'append'}) { - - # I will delimit appended part with - # delimiter (or ,) - my $d = $tag->{'delimiter'}; - # default delimiter - $d ||= " "; - - my $last = pop @{$row->{$type}}; - $d = "" if (! $last); - $last .= $d . join($d, @v); - push @{$row->{$type}}, $last; - - } else { - push @{$row->{$type}}, @v; - } - } - - - } - - if ($row) { - $row->{'tag'} = $field; - - # TODO: name_sigular, name_plural - my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'}; - $row->{'name'} = $name ? $self->_x($name) : $field; - - # post-sort all values in field - if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) { - $log->warn("sort at field tag not implemented"); - } - - push @ds, $row; - - $log->debug("row $field: ",sub { Dumper($row) }); - } - - } - - if ($cache_file) { - store { - ds => \@ds, - current_filename => $self->{'current_filename'}, - headline => $self->{'headline'}, - }, $cache_file; - $log->debug("created storable cache file $cache_file"); - } - - return @ds; + # create UTF-8 convertor for import_xml files + $self->{'utf2cp'} ||= Text::Iconv->new('UTF-8' ,$self->{'code_page'} || 'ISO-8859-2'); + return $self->{'utf2cp'}->convert($utf8) || + $self->_get_logger()->logwarn("can't convert '$utf8'"); } @@ -336,4 +113,4 @@ =cut -1; # End of WebPAC::Normalise::XML +1; # End of WebPAC::Normalize::XML