/[webpac2]/trunk/lib/WebPAC/Normalize/XML.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/lib/WebPAC/Normalize/XML.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 10 by dpavlin, Sat Jul 16 20:35:30 2005 UTC revision 12 by dpavlin, Sat Jul 16 22:57:26 2005 UTC
# Line 1  Line 1 
1  package WebPAC::Normalise::XML;  package WebPAC::Normalize::XML;
2    
3  use warnings;  use warnings;
4  use strict;  use strict;
5    
6  use base qw/WebPAC::Common/;  use base qw/WebPAC::Common/;
7  use Storable;  use Storable;
8    use XML::Simple;
9    use Data::Dumper;
10    
11  =head1 NAME  =head1 NAME
12    
13  WebPAC::Normalise::XML - apply XML normalisaton rules  WebPAC::Normalize::XML - apply XML normalisaton rules
14    
15  =head1 VERSION  =head1 VERSION
16    
# Line 23  our $VERSION = '0.01'; Line 25  our $VERSION = '0.01';
25  This module uses C<conf/normalize/*.xml> files to perform normalisation  This module uses C<conf/normalize/*.xml> files to perform normalisation
26  from input records  from input records
27    
     use WebPAC::Normalise::XML;  
   
     my $foo = WebPAC::Normalise::XML->new();  
     ...  
   
28  =cut  =cut
29    
 # mapping between data type and tag which specify  
 # format in XML file  
 my %type2tag = (  
         'isis' => 'isis',  
 #       'excel' => 'column',  
 #       'marc' => 'marc',  
 #       'feed' => 'feed'  
 );  
   
   
 =head1 EXPORT  
   
 A list of functions that can be exported.  You can delete this section  
 if you don't export anything, such as for a purely object-oriented module.  
   
30  =head1 FUNCTIONS  =head1 FUNCTIONS
31    
32  =head2 new  =head2 new
33    
34  Create new instance of WebPAC using configuration specified by C<config_file>.  Read normalisation rules defined using XML from C<conf/normalize/*.xml> and
35    parse it.
36    
37   my $n = new WebPAC::Normalize::XML(   my $n = new WebPAC::Normalize::XML(
38            tag => 'isis',
39            xml_file => '/path/to/conf/normalize/isis.xml',
40          cache_data_structure => './cache/ds/',          cache_data_structure => './cache/ds/',
41            lookup_regex => $lookup->regex,
42   }   }
43    
44    C<tag> defines tag to use within C<xml_file>
45    
46    C<xml_file> defines path to normalize XML.
47    
48  Optional parameter C<cache_data_structure> defines path to directory  Optional parameter C<cache_data_structure> defines path to directory
49  in which cache file for C<data_structure> call will be created.  in which cache file for C<data_structure> call will be created.
50    
51    Recommended parametar C<lookup_regex> specify ...
52    
53  =cut  =cut
54    
55  sub new {  sub new {
# Line 67  sub new { Line 59  sub new {
59    
60          $self->setup_cache_dir( $self->{'cache_data_structure'} );          $self->setup_cache_dir( $self->{'cache_data_structure'} );
61    
         return $self;  
 }  
   
 =head2 open_import_xml  
   
 Read file from C<import_xml/> directory and parse it.  
   
  $webpac->open_import_xml(type => 'isis');  
   
 =cut  
   
 sub open_import_xml {  
         my $self = shift;  
   
62          my $log = $self->_get_logger();          my $log = $self->_get_logger();
63    
64          my $arg = {@_};          foreach my $req (qw/tag xml_file/) {
65          $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});                  $log->logconfess("need argument $req") unless $self->{$req};
66            }
         $self->{'type'} = $arg->{'type'};  
   
         my $type_base = $arg->{'type'};  
         $type_base =~ s/_.*$//g;  
67    
68          $self->{'tag'} = $type2tag{$type_base};          my $f =
69    
70          $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");          my $xml_file = $self->{'xml_file'};
71    
72          my $f = "./import_xml/".$self->{'type'}.".xml";          $log->info("using $xml_file tag <",$self->{'tag'},">");
         $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");  
73    
74          $log->info("reading '$f'");          $log->logdie("normalisation xml file '$xml_file' doesn't exist!") if (! -e $xml_file);
75    
76          $self->{'import_xml_file'} = $f;          $self->{'import_xml_file'} = $xml_file;
77    
78          $self->{'import_xml'} = XMLin($f,          $self->{'import_xml'} = XMLin($f,
79                  ForceArray => [ $self->{'tag'}, 'config', 'format' ],                  ForceArray => [ $self->{'tag'}, $self->{'tags'}, 'config', 'format' ],
80          );          );
81    
82          $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });          $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
83    
84            return $self;
85  }  }
86    
87  =head2 setup_cache_dir  =head2 setup_cache_dir
# Line 153  sub setup_cache_dir { Line 127  sub setup_cache_dir {
127    
128  =head2 data_structure  =head2 data_structure
129    
130  Create in-memory data structure which represents layout from C<import_xml>.  Create in-memory data structure which represents normalized layout from
131  It is used later to produce output.  C<conf/normalize/*.xml>.
132    
133    This structures are used to produce output.
134    
135   my @ds = $webpac->data_structure($rec);   my @ds = $webpac->data_structure($rec);
136    
137  This method will also set C<$webpac->{'currnet_filename'}> if there is  B<Note: historical oddity follows>
138  <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is  
139  <headline> tag.  This method will also set C<< $webpac->{'currnet_filename'} >> if there is
140    C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
141    C<< <headline> >> tag.
142    
143  =cut  =cut
144    
# Line 232  sub data_structure { Line 210  sub data_structure {
210                          $log->debug("format: $format");                          $log->debug("format: $format");
211    
212                          my @v;                          my @v;
213                          # FIXME this is a cludge!                          if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
                         if ($format =~ /$WebPAC::Lookup::LOOKUP_REGEX/o) {  
214                                  @v = $self->fill_in_to_arr($rec,$format);                                  @v = $self->fill_in_to_arr($rec,$format);
215                          } else {                          } else {
216                                  @v = $self->parse_to_arr($rec,$format);                                  @v = $self->parse_to_arr($rec,$format);
# Line 324  sub data_structure { Line 301  sub data_structure {
301    
302  }  }
303    
304    =head2 apply_format
305    
306    Apply format specified in tag with C<format_name="name"> and
307    C<format_delimiter=";;">.
308    
309     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
310    
311    Formats can contain C<lookup{...}> if you need them.
312    
313    =cut
314    
315    sub apply_format {
316            my $self = shift;
317    
318            my ($name,$delimiter,$data) = @_;
319    
320            my $log = $self->_get_logger();
321    
322            if (! $self->{'import_xml'}->{'format'}->{$name}) {
323                    $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
324                    return $data;
325            }
326    
327            $log->warn("no delimiter for format $name") if (! $delimiter);
328    
329            my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
330    
331            my @data = split(/\Q$delimiter\E/, $data);
332    
333            my $out = sprintf($format, @data);
334            $log->debug("using format $name [$format] on $data to produce: $out");
335    
336            if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
337                    return $self->lookup($out);
338            } else {
339                    return $out;
340            }
341    
342    }
343    
344    
345  =head1 AUTHOR  =head1 AUTHOR
346    
# Line 338  under the same terms as Perl itself. Line 355  under the same terms as Perl itself.
355    
356  =cut  =cut
357    
358  1; # End of WebPAC::Normalise::XML  1; # End of WebPAC::Normalize::XML

Legend:
Removed from v.10  
changed lines
  Added in v.12

  ViewVC Help
Powered by ViewVC 1.1.26