/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 209 by dpavlin, Mon Dec 5 17:46:57 2005 UTC revision 346 by dpavlin, Sat Jan 7 03:28:10 2006 UTC
# Line 11  WebPAC::Normalize - data mungling for no Line 11  WebPAC::Normalize - data mungling for no
11    
12  =head1 VERSION  =head1 VERSION
13    
14  Version 0.02  Version 0.08
15    
16  =cut  =cut
17    
18  our $VERSION = '0.02';  our $VERSION = '0.08';
19    
20  =head1 SYNOPSIS  =head1 SYNOPSIS
21    
# Line 47  optional C<filter{filter_name}> at B<beg Line 47  optional C<filter{filter_name}> at B<beg
47  code defined as code ref on format after field substitution to producing  code defined as code ref on format after field substitution to producing
48  output  output
49    
50    There is one built-in filter called C<regex> which can be used like this:
51    
52      filter{regex(s/foo/bar/)}
53    
54  =item *  =item *
55    
56  optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.  optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
# Line 82  Create new normalisation object Line 86  Create new normalisation object
86          db => $db_obj,          db => $db_obj,
87          lookup_regex => $lookup->regex,          lookup_regex => $lookup->regex,
88          lookup => $lookup_obj,          lookup => $lookup_obj,
89            prefix => 'foobar',
90    );    );
91    
92  Parameter C<filter> defines user-supplied snippets of Perl code which can  Parameter C<filter> defines user-supplied snippets of Perl code which can
93  be used with C<filter{...}> notation.  be used with C<filter{...}> notation.
94    
95    C<prefix> is used to form the filename for a database record (to support multiple
96    source files which are joined in one database).
97    
98  Recommended parameter C<lookup_regex> is used to enable parsing of lookups  Recommended parameter C<lookup_regex> is used to enable parsing of lookups
99  in structures. If you pass this parameter, you must also pass C<lookup>  in structures. If you pass this parameter, you must also pass C<lookup>
100  which is a C<WebPAC::Lookup> object.  which is a C<WebPAC::Lookup> object.
# Line 111  sub new { Line 119  sub new {
119    
120          $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));          $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
121    
122            $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
123    
124            $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
125    
126            if (! $self->{filter} || ! $self->{filter}->{regex}) {
127                    $log->debug("adding built-in filter regex");
128                    $self->{filter}->{regex} = sub {
129                            my ($val, $regex) = @_;
130                            eval "\$val =~ $regex";
131                            return $val;
132                    };
133            }
134    
135          $self ? return $self : return undef;          $self ? return $self : return undef;
136  }  }
137    
# Line 136  sub data_structure { Line 157  sub data_structure {
157    
158          $log->debug("data_structure rec = ", sub { Dumper($rec) });          $log->debug("data_structure rec = ", sub { Dumper($rec) });
159    
160          $log->logdie("need unique ID (mfn) in field 000 of record ", sub { Dumper($rec) } ) unless (defined($rec->{'000'}));          $log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));
161    
162          my $mfn = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");          my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
163    
164          my $cache_file;          my $cache_file;
165    
166          if ($self->{'db'}) {          if ($self->{'db'}) {
167                  my $ds = $self->{'db'}->load_ds( $mfn );                  my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
168                  $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });                  $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
169                  return $ds if ($ds);                  return $ds if ($ds);
170                  $log->debug("cache miss, creating");                  $log->debug("cache miss, creating");
171          }          }
172    
         undef $self->{'currnet_filename'};  
         undef $self->{'headline'};  
   
173          my @sorted_tags;          my @sorted_tags;
174          if ($self->{tags_by_order}) {          if ($self->{tags_by_order}) {
175                  @sorted_tags = @{$self->{tags_by_order}};                  @sorted_tags = @{$self->{tags_by_order}};
# Line 207  sub data_structure { Line 225  sub data_structure {
225    
226                          foreach my $type (@types) {                          foreach my $type (@types) {
227                                  # append to previous line?                                  # append to previous line?
228                                  $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');                                  $log->debug("type: $type ",sub { join(" ",@v) }, " ", $row->{'append'} || 'no append');
229                                  if ($tag->{'append'}) {                                  if ($tag->{'append'}) {
230    
231                                          # I will delimit appended part with                                          # I will delimit appended part with
# Line 249  sub data_structure { Line 267  sub data_structure {
267          }          }
268    
269          $self->{'db'}->save_ds(          $self->{'db'}->save_ds(
270                  id => $mfn,                  id => $id,
271                  ds => $ds,                  ds => $ds,
272                    prefix => $self->{prefix},
273          ) if ($self->{'db'});          ) if ($self->{'db'});
274    
275          $log->debug("ds: ", sub { Dumper($ds) });          $log->debug("ds: ", sub { Dumper($ds) });
# Line 269  return output or nothing depending on ev Line 288  return output or nothing depending on ev
288    
289   my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);   my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
290    
291    Filters are implemented here. While the simple form of a filter looks like this:
292    
293      filter{name_of_filter}
294    
295    but filters can also have a variable number of parameters, like this:
296    
297      filter{name_of_filter(param,param,param)}
298    
299  =cut  =cut
300    
301    my $warn_once;
302    
303  sub parse {  sub parse {
304          my $self = shift;          my $self = shift;
305    
# Line 288  sub parse { Line 317  sub parse {
317    
318          my @out;          my @out;
319    
320          $log->debug("format: $format");          $log->debug("format: $format [$i]");
321    
322          my $eval_code;          my $eval_code;
323          # remove eval{...} from beginning          # remove eval{...} from beginning
# Line 298  sub parse { Line 327  sub parse {
327          # remove filter{...} from beginning          # remove filter{...} from beginning
328          $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);          $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
329    
330            # did we find any field (at all) from format in row?
331            my $found_any;
332            # prefix before first field, which we preserve if $found_any
333          my $prefix;          my $prefix;
334          my $all_found=0;  
335            my $f_step = 1;
336    
337          while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {          while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
338    
339                  my $del = $1 || '';                  my $del = $1 || '';
340                  $prefix ||= $del if ($all_found == 0);                  $prefix = $del if ($f_step == 1);
341    
342                    my $fld_type = lc($2);
343    
344                  # repeatable index                  # repeatable index
345                  my $r = $i;                  my $r = $i;
346                  $r = 0 if (lc("$2") eq 's');                  if ($fld_type eq 's') {
347                            if ($found_any->{'v'}) {
348                                    $r = 0;
349                            } else {
350                                    return;
351                            }
352                    }
353    
354                  my $found = 0;                  my $found = 0;
355                  my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);                  my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
356    
357                  if ($found) {                  if ($found) {
358                          push @out, $del;                          $found_any->{$fld_type} += $found;
359    
360                            # we will skip delimiter before first occurrence of field!
361                            push @out, $del unless($found_any->{$fld_type} == 1);
362                          push @out, $tmp;                          push @out, $tmp;
                         $all_found += $found;  
363                  }                  }
364                    $f_step++;
365          }          }
366    
367          return if (! $all_found);          # test if any fields found?
368            return if (! $found_any->{'v'} && ! $found_any->{'s'});
369    
370          my $out = join('',@out);          my $out = join('',@out);
371    
# Line 340  sub parse { Line 385  sub parse {
385                  return if (! $self->_eval($eval));                  return if (! $self->_eval($eval));
386          }          }
387                    
388          if ($filter_name && $self->{'filter'}->{$filter_name}) {          if ($filter_name) {
389                  $log->debug("about to filter{$filter_name} format: $out");                  my @filter_args;
390                  $out = $self->{'filter'}->{$filter_name}->($out);                  if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
391                  return unless(defined($out));                          @filter_args = split(/,/, $2);
392                  $log->debug("filter result: $out");                  }
393                    if ($self->{'filter'}->{$filter_name}) {
394                            $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
395                            unshift @filter_args, $out;
396                            $out = $self->{'filter'}->{$filter_name}->(@filter_args);
397                            return unless(defined($out));
398                            $log->debug("filter result: $out");
399                    } elsif (! $warn_once->{$filter_name}) {
400                            $log->warn("trying to use undefined filter $filter_name");
401                            $warn_once->{$filter_name}++;
402                    }
403          }          }
404    
405          return $out;          return $out;
# Line 424  sub fill_in { Line 479  sub fill_in {
479          }          }
480    
481          my $found = 0;          my $found = 0;
482            my $just_single = 1;
483    
484          my $eval_code;          my $eval_code;
485          # remove eval{...} from beginning          # remove eval{...} from beginning
# Line 435  sub fill_in { Line 491  sub fill_in {
491    
492          # do actual replacement of placeholders          # do actual replacement of placeholders
493          # repeatable fields          # repeatable fields
494          $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;          if ($format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges) {
495                    $just_single = 0;
496            }
497            
498          # non-repeatable fields          # non-repeatable fields
499          $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;          if ($format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges) {
500                    return if ($i > 0 && $just_single);
501            }
502    
503          if ($found) {          if ($found) {
504                  $log->debug("format: $format");                  $log->debug("format: $format");
# Line 454  sub fill_in { Line 515  sub fill_in {
515                  # do we have lookups?                  # do we have lookups?
516                  if ($self->{'lookup'}) {                  if ($self->{'lookup'}) {
517                          if ($self->{'lookup'}->can('lookup')) {                          if ($self->{'lookup'}->can('lookup')) {
518                                  return $self->{'lookup'}->lookup($format);                                  my @lookup = $self->{lookup}->lookup($format);
519                                    $log->debug("lookup $format", join(", ", @lookup));
520                                    return @lookup;
521                          } else {                          } else {
522                                  $log->warn("Have lookup object but can't invoke lookup method");                                  $log->warn("Have lookup object but can't invoke lookup method");
523                          }                          }
# Line 537  sub get_data { Line 600  sub get_data {
600                          if ($$rec->{$f}->[$i] =~ /HASH/o) {                          if ($$rec->{$f}->[$i] =~ /HASH/o) {
601                                  my $out;                                  my $out;
602                                  foreach my $k (keys %{$$rec->{$f}->[$i]}) {                                  foreach my $k (keys %{$$rec->{$f}->[$i]}) {
603                                          $out .= $$rec->{$f}->[$i]->{$k}." ";                                          my $v = $$rec->{$f}->[$i]->{$k};
604                                            $out .= "$v " if ($v);
605                                  }                                  }
606                                  return $out;                                  return $out;
607                          } else {                          } else {

Legend:
Removed from v.209  
changed lines
  Added in v.346

  ViewVC Help
Powered by ViewVC 1.1.26