--- trunk/lib/WebPAC/Normalize.pm 2005/07/17 00:04:25 14 +++ trunk/lib/WebPAC/Normalize.pm 2005/07/17 10:42:23 15 @@ -7,7 +7,7 @@ =head1 NAME -WebPAC::Normalize - normalisation of source file +WebPAC::Normalize - data mungling for normalisation =head1 VERSION @@ -19,8 +19,52 @@ =head1 SYNOPSIS -This package contains code that could be helpful in implementing different -normalisation front-ends. +This package contains code that mungle data to produce normalized format. + +It contains several assumptions: + +=over + +=item * + +format of fields is defined using C notation for repeatable fields +or C for single (or first) value, where C<123> is field number and +C is subfield. + +=item * + +source data records (C<$rec>) have unique identifiers in field C<000> + +=item * + +optional C tag at B will be +perl code that is evaluated before producing output (value of field will be +interpolated before that) + +=item * + +optional C at B will apply perl +code defined as code ref on format after field substitution to producing +output + +=item * + +optional C will be then performed. See C. + +=item * + +at end, optional Cs rules are resolved. Format rules are similar to +C and can also contain C which is performed after +values are inserted in format. + +=back + +This also describes order in which transformations are applied (eval, +filter, lookup, format) which is important to undestand when deciding how to +solve your data mungling and normalisation process. + + + =head1 FUNCTIONS @@ -29,10 +73,19 @@ Create new normalisation object my $n = new WebPAC::Normalize::Something( + filter => { + 'filter_name_1' => sub { + # filter code + return length($_); + }, ... + }, cache_data_structure => './cache/ds/', lookup_regex => $lookup->regex, ); +Parametar C defines user supplied snippets of perl code which can +be use with C notation. + Optional parameter C defines path to directory in which cache file for C call will be created. @@ -268,46 +321,6 @@ } -=head2 apply_format - -Apply format specified in tag with C and -C. - - my $text = $webpac->apply_format($format_name,$format_delimiter,$data); - -Formats can contain C if you need them. - -=cut - -sub apply_format { - my $self = shift; - - my ($name,$delimiter,$data) = @_; - - my $log = $self->_get_logger(); - - if (! $self->{'import_xml'}->{'format'}->{$name}) { - $log->warn(" is not defined in ",$self->{'import_xml_file'}); - return $data; - } - - $log->warn("no delimiter for format $name") if (! $delimiter); - - my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'"); - - my @data = split(/\Q$delimiter\E/, $data); - - my $out = sprintf($format, @data); - $log->debug("using format $name [$format] on $data to produce: $out"); - - if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) { - return $self->lookup($out); - } else { - return $out; - } - -} - =head2 parse Perform smart parsing of string, skipping delimiters for fields which aren't @@ -427,6 +440,89 @@ return @arr; } + +=head2 fill_in + +Workhourse of all: takes record from in-memory structure of database and +strings with placeholders and returns string or array of with substituted +values from record. + + my $text = $webpac->fill_in($rec,'v250^a'); + +Optional argument is ordinal number for repeatable fields. By default, +it's assume to be first repeatable field (fields are perl array, so first +element is 0). +Following example will read second value from repeatable field. + + my $text = $webpac->fill_in($rec,'Title: v250^a',1); + +This function B perform parsing of format to inteligenty skip +delimiters before fields which aren't used. + +This method will automatically decode UTF-8 string to local code page +if needed. + +=cut + +sub fill_in { + my $self = shift; + + my $log = $self->_get_logger(); + + my $rec = shift || $log->logconfess("need data record"); + my $format = shift || $log->logconfess("need format to parse"); + # iteration (for repeatable fields) + my $i = shift || 0; + + $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999)); + + # FIXME remove for speedup? + $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); + + if (utf8::is_utf8($format)) { + $format = $self->_x($format); + } + + my $found = 0; + + my $eval_code; + # remove eval{...} from beginning + $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s); + + my $filter_name; + # remove filter{...} from beginning + $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s); + + # do actual replacement of placeholders + # repeatable fields + $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges; + # non-repeatable fields + $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges; + + if ($found) { + $log->debug("format: $format"); + if ($eval_code) { + my $eval = $self->fill_in($rec,$eval_code,$i); + return if (! $self->_eval($eval)); + } + if ($filter_name && $self->{'filter'}->{$filter_name}) { + $log->debug("filter '$filter_name' for $format"); + $format = $self->{'filter'}->{$filter_name}->($format); + return unless(defined($format)); + $log->debug("filter result: $format"); + } + # do we have lookups? + if ($self->{'lookup'}) { + return $self->lookup($format); + } else { + return $format; + } + } else { + return; + } +} + + =head2 fill_in_to_arr Similar to C, but returns array of all repeatable fields. Usable @@ -459,6 +555,97 @@ return @arr; } + +=head2 get_data + +Returns value from record. + + my $text = $self->get_data(\$rec,$f,$sf,$i,\$found); + +Arguments are: +record reference C<$rec>, +field C<$f>, +optional subfiled C<$sf>, +index for repeatable values C<$i>. + +Optinal variable C<$found> will be incremeted if there +is field. + +Returns value or empty string. + +=cut + +sub get_data { + my $self = shift; + + my ($rec,$f,$sf,$i,$found) = @_; + + if ($$rec->{$f}) { + return '' if (! $$rec->{$f}->[$i]); + no strict 'refs'; + if ($sf && $$rec->{$f}->[$i]->{$sf}) { + $$found++ if (defined($$found)); + return $$rec->{$f}->[$i]->{$sf}; + } elsif ($$rec->{$f}->[$i]) { + $$found++ if (defined($$found)); + # it still might have subfield, just + # not specified, so we'll dump all + if ($$rec->{$f}->[$i] =~ /HASH/o) { + my $out; + foreach my $k (keys %{$$rec->{$f}->[$i]}) { + $out .= $$rec->{$f}->[$i]->{$k}." "; + } + return $out; + } else { + return $$rec->{$f}->[$i]; + } + } + } else { + return ''; + } +} + + +=head2 apply_format + +Apply format specified in tag with C and +C. + + my $text = $webpac->apply_format($format_name,$format_delimiter,$data); + +Formats can contain C if you need them. + +=cut + +sub apply_format { + my $self = shift; + + my ($name,$delimiter,$data) = @_; + + my $log = $self->_get_logger(); + + if (! $self->{'import_xml'}->{'format'}->{$name}) { + $log->warn(" is not defined in ",$self->{'import_xml_file'}); + return $data; + } + + $log->warn("no delimiter for format $name") if (! $delimiter); + + my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'"); + + my @data = split(/\Q$delimiter\E/, $data); + + my $out = sprintf($format, @data); + $log->debug("using format $name [$format] on $data to produce: $out"); + + if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) { + return $self->lookup($out); + } else { + return $out; + } + +} + =head2 sort_arr Sort array ignoring case and html in data @@ -485,6 +672,8 @@ } +=head1 INTERNAL METHODS + =head2 _sort_by_order Sort xml tags data structure accoding to C attribute. @@ -504,8 +693,9 @@ =head2 _x -Convert strings from C encoding into application specific -(optinally specified using C to C constructor. +Convert strings from C encoding into application +specific encoding (optinally specified using C to C +constructor). my $text = $n->_x('normalize text string');