/[webpac2]/trunk/lib/WebPAC/Normalize/XML.pm

This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!

Diff of /trunk/lib/WebPAC/Normalize/XML.pm

Parent Directory | Revision Log | View Patch Patch

-revision 10 by dpavlin,
Sat Jul 16 20:35:30 2005 UTC
+revision 12 by dpavlin,
Sat Jul 16 22:57:26 2005 UTC
 Line 1
- package WebPAC::Normalise::XML;
+ package WebPAC::Normalize::XML;
  use warnings;
  use strict;
  use base qw/WebPAC::Common/;
  use Storable;
+ use XML::Simple;
+ use Data::Dumper;
  =head1 NAME
- WebPAC::Normalise::XML - apply XML normalisaton rules
+ WebPAC::Normalize::XML - apply XML normalisation rules
  =head1 VERSION
-Line 23 
 our $VERSION = '0.01';
+Line 25 
 our $VERSION = '0.01';
  This module uses C<conf/normalize/*.xml> files to perform normalisation
  from input records
-     use WebPAC::Normalise::XML;
-     my $foo = WebPAC::Normalise::XML->new();
-     ...
  =cut
- # mapping between data type and tag which specify
- # format in XML file
- my %type2tag = (
-         'isis' => 'isis',
- #       'excel' => 'column',
- #       'marc' => 'marc',
- #       'feed' => 'feed'
- );
- =head1 EXPORT
- A list of functions that can be exported.  You can delete this section
- if you don't export anything, such as for a purely object-oriented module.
  =head1 FUNCTIONS
  =head2 new
- Create new instance of WebPAC using configuration specified by C<config_file>.
+ Read normalisation rules defined using XML from C<conf/normalize/*.xml> and
+ parse it.
   my $n = new WebPAC::Normalize::XML(
+         tag => 'isis',
+         xml_file => '/path/to/conf/normalize/isis.xml',
          cache_data_structure => './cache/ds/',
+         lookup_regex => $lookup->regex,
   );
+ C<tag> defines tag to use within C<xml_file>
+ C<xml_file> defines path to normalize XML.
  Optional parameter C<cache_data_structure> defines path to directory
  in which cache file for C<data_structure> call will be created.
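+ Recommended parameter C<lookup_regex> specifies the regex used to recognise formats and formatted output that contain lookups (see C<data_structure> and C<apply_format>).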
  =cut
  sub new {
-Line 67 
 sub new {
+Line 59 
 sub new {
          $self->setup_cache_dir( $self->{'cache_data_structure'} );
-         return $self;
- }
- =head2 open_import_xml
- Read file from C<import_xml/> directory and parse it.
-  $webpac->open_import_xml(type => 'isis');
- =cut
- sub open_import_xml {
-         my $self = shift;
          my $log = $self->_get_logger();
-         my $arg = {@_};
+         foreach my $req (qw/tag xml_file/) {
-         $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
+                 $log->logconfess("need argument $req") unless $self->{$req};
+         }
-         $self->{'type'} = $arg->{'type'};
-         my $type_base = $arg->{'type'};
-         $type_base =~ s/_.*$//g;
-         $self->{'tag'} = $type2tag{$type_base};
+         my $f =
-         $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
+         my $xml_file = $self->{'xml_file'};
-         my $f = "./import_xml/".$self->{'type'}.".xml";
+         $log->info("using $xml_file tag <",$self->{'tag'},">");
-         $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
-         $log->info("reading '$f'");
+         $log->logdie("normalisation xml file '$xml_file' doesn't exist!") if (! -e $xml_file);
-         $self->{'import_xml_file'} = $f;
+         $self->{'import_xml_file'} = $xml_file;
          $self->{'import_xml'} = XMLin($f,
-                 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
+                 ForceArray => [ $self->{'tag'}, $self->{'tags'}, 'config', 'format' ],
          );
          $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
+         return $self;
  }
  =head2 setup_cache_dir
-Line 153 
 sub setup_cache_dir {
+Line 127 
 sub setup_cache_dir {
  =head2 data_structure
- Create in-memory data structure which represents layout from C<import_xml>.
+ Create in-memory data structure which represents normalized layout from
- It is used later to produce output.
+ C<conf/normalize/*.xml>.
+ These structures are used to produce output.
   my @ds = $webpac->data_structure($rec);
- This method will also set C<$webpac->{'currnet_filename'}> if there is
+ B<Note: historical oddity follows>
- <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
- <headline> tag.
+ This method will also set C<< $webpac->{'currnet_filename'} >> if there is
+ C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
+ C<< <headline> >> tag.
  =cut
-Line 232 
 sub data_structure {
+Line 210 
 sub data_structure {
                          $log->debug("format: $format");
                          my @v;
-                         # FIXME this is a cludge!
+                         if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
-                         if ($format =~ /$WebPAC::Lookup::LOOKUP_REGEX/o) {
                                  @v = $self->fill_in_to_arr($rec,$format);
                          } else {
                                  @v = $self->parse_to_arr($rec,$format);
-Line 324 
 sub data_structure {
+Line 301 
 sub data_structure {
  }
+ =head2 apply_format
+ Apply format specified in tag with C<format_name="name"> and
+ C<format_delimiter=";;">.
+  my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
+ Formats can contain C<lookup{...}> if you need them.
+ =cut
+ sub apply_format {
+         my $self = shift;
+         my ($name,$delimiter,$data) = @_;
+         my $log = $self->_get_logger();
+         if (! $self->{'import_xml'}->{'format'}->{$name}) {
+                 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
+                 return $data;
+         }
+         $log->warn("no delimiter for format $name") if (! $delimiter);
+         my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
+         my @data = split(/\Q$delimiter\E/, $data);
+         my $out = sprintf($format, @data);
+         $log->debug("using format $name [$format] on $data to produce: $out");
+         if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
+                 return $self->lookup($out);
+         } else {
+                 return $out;
+         }
+ }
  =head1 AUTHOR
-Line 338 
 under the same terms as Perl itself.
+Line 355 
 under the same terms as Perl itself.
  =cut
-; # End of WebPAC::Normalise::XML
+; # End of WebPAC::Normalize::XML

 Legend:



Removed from v.10
 


changed lines


 
Added in v.12
 Legend:



Removed from v.10
 


changed lines


 
Added in v.12
-Removed from v.10
+Added in v.12

	ViewVC Help
Powered by ViewVC 1.1.26