--- trunk/lib/WebPAC/Normalize.pm 2005/07/17 00:04:25 14 +++ trunk/lib/WebPAC/Normalize.pm 2005/11/24 11:47:15 125 @@ -2,25 +2,69 @@ use warnings; use strict; +use base 'WebPAC::Common'; use Data::Dumper; -use Storable; =head1 NAME -WebPAC::Normalize - normalisation of source file +WebPAC::Normalize - data mungling for normalisation =head1 VERSION -Version 0.01 +Version 0.02 =cut -our $VERSION = '0.01'; +our $VERSION = '0.02'; =head1 SYNOPSIS -This package contains code that could be helpful in implementing different -normalisation front-ends. +This package contains code that mungle data to produce normalized format. + +It contains several assumptions: + +=over + +=item * + +format of fields is defined using C notation for repeatable fields +or C for single (or first) value, where C<123> is field number and +C is subfield. + +=item * + +source data records (C<$rec>) have unique identifiers in field C<000> + +=item * + +optional C tag at B will be +perl code that is evaluated before producing output (value of field will be +interpolated before that) + +=item * + +optional C at B will apply perl +code defined as code ref on format after field substitution to producing +output + +=item * + +optional C will be then performed. See C. + +=item * + +at end, optional Cs rules are resolved. Format rules are similar to +C and can also contain C which is performed after +values are inserted in format. + +=back + +This also describes order in which transformations are applied (eval, +filter, lookup, format) which is important to undestand when deciding how to +solve your data mungling and normalisation process. + + + =head1 FUNCTIONS @@ -29,15 +73,23 @@ Create new normalisation object my $n = new WebPAC::Normalize::Something( - cache_data_structure => './cache/ds/', + filter => { + 'filter_name_1' => sub { + # filter code + return length($_); + }, ... + }, + db => $db_obj, lookup_regex => $lookup->regex, + lookup => $lookup_obj, ); -Optional parameter C defines path to directory -in which cache file for C call will be created. +Parametar C defines user supplied snippets of perl code which can +be use with C notation. Recommended parametar C is used to enable parsing of lookups -in structures. +in structures. If you pass this parametar, you must also pass C +which is C object. =cut @@ -46,49 +98,20 @@ my $self = {@_}; bless($self, $class); - $self->setup_cache_dir( $self->{'cache_data_structure'} ); - - $self ? return $self : return undef; -} - -=head2 setup_cache_dir - -Check if specified cache directory exist, and if not, disable caching. - - $setup_cache_dir('./cache/ds/'); - -If you pass false or zero value to this function, it will disable -cacheing. - -=cut - -sub setup_cache_dir { - my $self = shift; - - my $dir = shift; + my $r = $self->{'lookup_regex'} ? 1 : 0; + my $l = $self->{'lookup'} ? 1 : 0; my $log = $self->_get_logger(); - if ($dir) { - my $msg; - if (! -e $dir) { - $msg = "doesn't exist"; - } elsif (! -d $dir) { - $msg = "is not directory"; - } elsif (! -w $dir) { - $msg = "not writable"; - } - - if ($msg) { - undef $self->{'cache_data_structure'}; - $log->warn("cache_data_structure $dir $msg, disabling..."); - } else { - $log->debug("using cache dir $dir"); - } - } else { - $log->debug("disabling cache"); - undef $self->{'cache_data_structure'}; + # those two must be in pair + if ( ($r & $l) != ($r || $l) ) { + my $log = $self->_get_logger(); + $log->logdie("lookup_regex and lookup must be in pair"); } + + $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup')); + + $self ? return $self : return undef; } @@ -99,13 +122,7 @@ This structures are used to produce output. - my @ds = $webpac->data_structure($rec); - -B - -This method will also set C<< $webpac->{'currnet_filename'} >> if there is -C<< >> tag and C<< $webpac->{'headline'} >> if there is -C<< >> tag. + my $ds = $webpac->data_structure($rec); =cut @@ -117,37 +134,19 @@ my $rec = shift; $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); + $log->debug("data_structure rec = ", sub { Dumper($rec) }); + + $log->logdie("need unique ID (mfn) in field 000 of record ", sub { Dumper($rec) } ) unless (defined($rec->{'000'})); + + my $mfn = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!"); + my $cache_file; - if (my $cache_path = $self->{'cache_data_structure'}) { - my $id = $rec->{'000'}; - $id = $rec->{'000'}->[0] if ($id =~ m/^ARRAY/o); - unless (defined($id)) { - $log->warn("Can't use cache_data_structure on records without unique identifier in field 000"); - undef $self->{'cache_data_structure'}; - } else { - $cache_file = "$cache_path/$id"; - if (-r $cache_file) { - my $ds_ref = retrieve($cache_file); - if ($ds_ref) { - $log->debug("cache hit: $cache_file"); - my $ok = 1; - foreach my $f (qw(current_filename headline)) { - if ($ds_ref->{$f}) { - $self->{$f} = $ds_ref->{$f}; - } else { - $ok = 0; - } - }; - if ($ok && $ds_ref->{'ds'}) { - return @{ $ds_ref->{'ds'} }; - } else { - $log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!"); - undef $self->{'cache_data_structure'}; - } - } - } - } + if ($self->{'db'}) { + my $ds = $self->{'db'}->load_ds( $mfn ); + $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) }); + return $ds if ($ds); + $log->debug("cache miss, creating"); } undef $self->{'currnet_filename'}; @@ -161,7 +160,7 @@ $self->{tags_by_order} = \@sorted_tags; } - my @ds; + my $ds; $log->debug("tags: ",sub { join(", ",@sorted_tags) }); @@ -172,7 +171,10 @@ #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}); foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) { - my $format = $tag->{'value'} || $tag->{'content'}; + my $format; + + $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH'); + $format = $tag->{'value'} || $tag->{'content'}; $log->debug("format: $format"); @@ -193,22 +195,13 @@ @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v; } - if ($field eq 'filename') { - $self->{'current_filename'} = join('',@v); - $log->debug("filename: ",$self->{'current_filename'}); - } elsif ($field eq 'headline') { - $self->{'headline'} .= join('',@v); - $log->debug("headline: ",$self->{'headline'}); - next; # don't return headline in data_structure! - } - # delimiter will join repeatable fields if ($tag->{'delimiter'}) { @v = ( join($tag->{'delimiter'}, @v) ); } # default types - my @types = qw(display swish); + my @types = qw(display search); # override by type attribute @types = ( $tag->{'type'} ) if ($tag->{'type'}); @@ -241,70 +234,30 @@ # TODO: name_sigular, name_plural my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'}; - $row->{'name'} = $name ? $self->_x($name) : $field; + my $row_name = $name ? $self->_x($name) : $field; # post-sort all values in field if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) { $log->warn("sort at field tag not implemented"); } - push @ds, $row; + $ds->{$row_name} = $row; $log->debug("row $field: ",sub { Dumper($row) }); } } - if ($cache_file) { - store { - ds => \@ds, - current_filename => $self->{'current_filename'}, - headline => $self->{'headline'}, - }, $cache_file; - $log->debug("created storable cache file $cache_file"); - } - - return @ds; - -} - -=head2 apply_format - -Apply format specified in tag with C and -C. - - my $text = $webpac->apply_format($format_name,$format_delimiter,$data); - -Formats can contain C if you need them. - -=cut - -sub apply_format { - my $self = shift; - - my ($name,$delimiter,$data) = @_; - - my $log = $self->_get_logger(); - - if (! $self->{'import_xml'}->{'format'}->{$name}) { - $log->warn(" is not defined in ",$self->{'import_xml_file'}); - return $data; - } - - $log->warn("no delimiter for format $name") if (! $delimiter); - - my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'"); + $self->{'db'}->save_ds( + id => $mfn, + ds => $ds, + ) if ($self->{'db'}); - my @data = split(/\Q$delimiter\E/, $data); + $log->debug("ds: ", sub { Dumper($ds) }); - my $out = sprintf($format, @data); - $log->debug("using format $name [$format] on $data to produce: $out"); + $log->logconfess("data structure returned is not array any more!") if wantarray; - if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) { - return $self->lookup($out); - } else { - return $out; - } + return $ds; } @@ -427,6 +380,93 @@ return @arr; } + +=head2 fill_in + +Workhourse of all: takes record from in-memory structure of database and +strings with placeholders and returns string or array of with substituted +values from record. + + my $text = $webpac->fill_in($rec,'v250^a'); + +Optional argument is ordinal number for repeatable fields. By default, +it's assume to be first repeatable field (fields are perl array, so first +element is 0). +Following example will read second value from repeatable field. + + my $text = $webpac->fill_in($rec,'Title: v250^a',1); + +This function B perform parsing of format to inteligenty skip +delimiters before fields which aren't used. + +This method will automatically decode UTF-8 string to local code page +if needed. + +=cut + +sub fill_in { + my $self = shift; + + my $log = $self->_get_logger(); + + my $rec = shift || $log->logconfess("need data record"); + my $format = shift || $log->logconfess("need format to parse"); + # iteration (for repeatable fields) + my $i = shift || 0; + + $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999)); + + # FIXME remove for speedup? + $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); + + if (utf8::is_utf8($format)) { + $format = $self->_x($format); + } + + my $found = 0; + + my $eval_code; + # remove eval{...} from beginning + $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s); + + my $filter_name; + # remove filter{...} from beginning + $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s); + + # do actual replacement of placeholders + # repeatable fields + $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges; + # non-repeatable fields + $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges; + + if ($found) { + $log->debug("format: $format"); + if ($eval_code) { + my $eval = $self->fill_in($rec,$eval_code,$i); + return if (! $self->_eval($eval)); + } + if ($filter_name && $self->{'filter'}->{$filter_name}) { + $log->debug("filter '$filter_name' for $format"); + $format = $self->{'filter'}->{$filter_name}->($format); + return unless(defined($format)); + $log->debug("filter result: $format"); + } + # do we have lookups? + if ($self->{'lookup'}) { + if ($self->{'lookup'}->can('lookup')) { + return $self->{'lookup'}->lookup($format); + } else { + $log->warn("Have lookup object but can't invoke lookup method"); + } + } else { + return $format; + } + } else { + return; + } +} + + =head2 fill_in_to_arr Similar to C, but returns array of all repeatable fields. Usable @@ -459,6 +499,99 @@ return @arr; } + +=head2 get_data + +Returns value from record. + + my $text = $self->get_data(\$rec,$f,$sf,$i,\$found); + +Arguments are: +record reference C<$rec>, +field C<$f>, +optional subfiled C<$sf>, +index for repeatable values C<$i>. + +Optinal variable C<$found> will be incremeted if there +is field. + +Returns value or empty string. + +=cut + +sub get_data { + my $self = shift; + + my ($rec,$f,$sf,$i,$found) = @_; + + if ($$rec->{$f}) { + return '' if (! $$rec->{$f}->[$i]); + no strict 'refs'; + if ($sf && $$rec->{$f}->[$i]->{$sf}) { + $$found++ if (defined($$found)); + return $$rec->{$f}->[$i]->{$sf}; + } elsif (! $sf && $$rec->{$f}->[$i]) { + $$found++ if (defined($$found)); + # it still might have subfield, just + # not specified, so we'll dump all + if ($$rec->{$f}->[$i] =~ /HASH/o) { + my $out; + foreach my $k (keys %{$$rec->{$f}->[$i]}) { + $out .= $$rec->{$f}->[$i]->{$k}." "; + } + return $out; + } else { + return $$rec->{$f}->[$i]; + } + } else { + return ''; + } + } else { + return ''; + } +} + + +=head2 apply_format + +Apply format specified in tag with C and +C. + + my $text = $webpac->apply_format($format_name,$format_delimiter,$data); + +Formats can contain C if you need them. + +=cut + +sub apply_format { + my $self = shift; + + my ($name,$delimiter,$data) = @_; + + my $log = $self->_get_logger(); + + if (! $self->{'import_xml'}->{'format'}->{$name}) { + $log->warn(" is not defined in ",$self->{'import_xml_file'}); + return $data; + } + + $log->warn("no delimiter for format $name") if (! $delimiter); + + my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'"); + + my @data = split(/\Q$delimiter\E/, $data); + + my $out = sprintf($format, @data); + $log->debug("using format $name [$format] on $data to produce: $out"); + + if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) { + return $self->{'lookup'}->lookup($out); + } else { + return $out; + } + +} + =head2 sort_arr Sort array ignoring case and html in data @@ -485,6 +618,8 @@ } +=head1 INTERNAL METHODS + =head2 _sort_by_order Sort xml tags data structure accoding to C attribute. @@ -504,8 +639,9 @@ =head2 _x -Convert strings from C encoding into application specific -(optinally specified using C to C constructor. +Convert strings from C encoding into application +specific encoding (optinally specified using C to C +constructor). my $text = $n->_x('normalize text string');