--- trunk/lib/WebPAC/Normalize/XML.pm 2005/07/16 22:57:26 12 +++ trunk/lib/WebPAC/Normalize/XML.pm 2005/11/12 21:21:50 38 @@ -3,10 +3,10 @@ use warnings; use strict; -use base qw/WebPAC::Common/; -use Storable; +use base qw/WebPAC::Common WebPAC::Normalize/; use XML::Simple; use Data::Dumper; +use Text::Iconv; =head1 NAME @@ -29,45 +29,36 @@ =head1 FUNCTIONS -=head2 new +=head2 open Read normalisation rules defined using XML from C and parse it. - my $n = new WebPAC::Normalize::XML( + my $n = new WebPAC::Normalize::XML; + $n->open( tag => 'isis', xml_file => '/path/to/conf/normalize/isis.xml', - cache_data_structure => './cache/ds/', - lookup_regex => $lookup->regex, - } + ); C defines tag to use within C C defines path to normalize XML. -Optional parameter C defines path to directory -in which cache file for C call will be created. - -Recommended parametar C specify ... - =cut -sub new { - my $class = shift; - my $self = {@_}; - bless($self, $class); +sub open { + my $self = shift; - $self->setup_cache_dir( $self->{'cache_data_structure'} ); + my $arg = {@_}; my $log = $self->_get_logger(); foreach my $req (qw/tag xml_file/) { - $log->logconfess("need argument $req") unless $self->{$req}; + $log->logconfess("need argument $req") unless $arg->{$req}; } - my $f = - - my $xml_file = $self->{'xml_file'}; + $self->{'tag'} = $arg->{'tag'}; + my $xml_file = $arg->{'xml_file'}; $log->info("using $xml_file tag <",$self->{'tag'},">"); @@ -75,8 +66,9 @@ $self->{'import_xml_file'} = $xml_file; - $self->{'import_xml'} = XMLin($f, + $self->{'import_xml'} = XMLin($xml_file, ForceArray => [ $self->{'tag'}, $self->{'tags'}, 'config', 'format' ], + ForceContent => 1, ); $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) }); @@ -84,261 +76,27 @@ return $self; } -=head2 setup_cache_dir - -Check if specified cache directory exist, and if not, disable caching. - - $setup_cache_dir('./cache/ds/'); - -If you pass false or zero value to this function, it will disable -cacheing. - -=cut - -sub setup_cache_dir { - my $self = shift; - - my $dir = shift; - - my $log = $self->_get_logger(); - - if ($dir) { - my $msg; - if (! -e $dir) { - $msg = "doesn't exist"; - } elsif (! -d $dir) { - $msg = "is not directory"; - } elsif (! -w $dir) { - $msg = "not writable"; - } - - if ($msg) { - undef $self->{'cache_data_structure'}; - $log->warn("cache_data_structure $dir $msg, disabling..."); - } else { - $log->debug("using cache dir $dir"); - } - } else { - $log->debug("disabling cache"); - undef $self->{'cache_data_structure'}; - } -} - - -=head2 data_structure - -Create in-memory data structure which represents normalized layout from -C. - -This structures are used to produce output. - - my @ds = $webpac->data_structure($rec); - -B - -This method will also set C<< $webpac->{'currnet_filename'} >> if there is -C<< >> tag and C<< $webpac->{'headline'} >> if there is -C<< >> tag. - -=cut - -sub data_structure { - my $self = shift; - - my $log = $self->_get_logger(); - - my $rec = shift; - $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); - - my $cache_file; - - if (my $cache_path = $self->{'cache_data_structure'}) { - my $id = $rec->{'000'}; - $id = $rec->{'000'}->[0] if ($id =~ m/^ARRAY/o); - unless (defined($id)) { - $log->warn("Can't use cache_data_structure on records without unique identifier in field 000"); - undef $self->{'cache_data_structure'}; - } else { - $cache_file = "$cache_path/$id"; - if (-r $cache_file) { - my $ds_ref = retrieve($cache_file); - if ($ds_ref) { - $log->debug("cache hit: $cache_file"); - my $ok = 1; - foreach my $f (qw(current_filename headline)) { - if ($ds_ref->{$f}) { - $self->{$f} = $ds_ref->{$f}; - } else { - $ok = 0; - } - }; - if ($ok && $ds_ref->{'ds'}) { - return @{ $ds_ref->{'ds'} }; - } else { - $log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!"); - undef $self->{'cache_data_structure'}; - } - } - } - } - } - - undef $self->{'currnet_filename'}; - undef $self->{'headline'}; - - my @sorted_tags; - if ($self->{tags_by_order}) { - @sorted_tags = @{$self->{tags_by_order}}; - } else { - @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}}; - $self->{tags_by_order} = \@sorted_tags; - } - - my @ds; - - $log->debug("tags: ",sub { join(", ",@sorted_tags) }); - foreach my $field (@sorted_tags) { +=head2 _x - my $row; +Convert string from XML UTF-8 encoding to code page defined in C. -#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}); + my $text = $n->_x('utf8 text'); - foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) { - my $format = $tag->{'value'} || $tag->{'content'}; - - $log->debug("format: $format"); - - my @v; - if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) { - @v = $self->fill_in_to_arr($rec,$format); - } else { - @v = $self->parse_to_arr($rec,$format); - } - next if (! @v); - - if ($tag->{'sort'}) { - @v = $self->sort_arr(@v); - } - - # use format? - if ($tag->{'format_name'}) { - @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v; - } - - if ($field eq 'filename') { - $self->{'current_filename'} = join('',@v); - $log->debug("filename: ",$self->{'current_filename'}); - } elsif ($field eq 'headline') { - $self->{'headline'} .= join('',@v); - $log->debug("headline: ",$self->{'headline'}); - next; # don't return headline in data_structure! - } - - # delimiter will join repeatable fields - if ($tag->{'delimiter'}) { - @v = ( join($tag->{'delimiter'}, @v) ); - } - - # default types - my @types = qw(display swish); - # override by type attribute - @types = ( $tag->{'type'} ) if ($tag->{'type'}); - - foreach my $type (@types) { - # append to previous line? - $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append'); - if ($tag->{'append'}) { - - # I will delimit appended part with - # delimiter (or ,) - my $d = $tag->{'delimiter'}; - # default delimiter - $d ||= " "; - - my $last = pop @{$row->{$type}}; - $d = "" if (! $last); - $last .= $d . join($d, @v); - push @{$row->{$type}}, $last; - - } else { - push @{$row->{$type}}, @v; - } - } - - - } - - if ($row) { - $row->{'tag'} = $field; - - # TODO: name_sigular, name_plural - my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'}; - $row->{'name'} = $name ? $self->_x($name) : $field; - - # post-sort all values in field - if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) { - $log->warn("sort at field tag not implemented"); - } - - push @ds, $row; - - $log->debug("row $field: ",sub { Dumper($row) }); - } - - } - - if ($cache_file) { - store { - ds => \@ds, - current_filename => $self->{'current_filename'}, - headline => $self->{'headline'}, - }, $cache_file; - $log->debug("created storable cache file $cache_file"); - } - - return @ds; - -} - -=head2 apply_format - -Apply format specified in tag with C and -C. - - my $text = $webpac->apply_format($format_name,$format_delimiter,$data); - -Formats can contain C if you need them. +Default application code page is C. You will probably want to +change that when creating new instance of object based on this one. =cut -sub apply_format { +sub _x { my $self = shift; + my $utf8 = shift || return; - my ($name,$delimiter,$data) = @_; - - my $log = $self->_get_logger(); - - if (! $self->{'import_xml'}->{'format'}->{$name}) { - $log->warn(" is not defined in ",$self->{'import_xml_file'}); - return $data; - } - - $log->warn("no delimiter for format $name") if (! $delimiter); - - my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'"); - - my @data = split(/\Q$delimiter\E/, $data); - - my $out = sprintf($format, @data); - $log->debug("using format $name [$format] on $data to produce: $out"); - - if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) { - return $self->lookup($out); - } else { - return $out; - } + # create UTF-8 convertor for import_xml files + $self->{'utf2cp'} ||= Text::Iconv->new('UTF-8' ,$self->{'code_page'} || 'ISO-8859-2'); + return $self->{'utf2cp'}->convert($utf8) || + $self->_get_logger()->logwarn("can't convert '$utf8'"); }