--- trunk/lib/WebPAC/Normalize.pm 2005/07/16 20:35:30 10 +++ trunk/lib/WebPAC/Normalize.pm 2005/12/16 14:40:55 260 @@ -2,33 +2,700 @@ use warnings; use strict; +use base 'WebPAC::Common'; +use Data::Dumper; =head1 NAME -WebPAC::Normalize - normalisation of source file +WebPAC::Normalize - data mungling for normalisation =head1 VERSION -Version 0.01 +Version 0.06 =cut -our $VERSION = '0.01'; +our $VERSION = '0.06'; =head1 SYNOPSIS -This package contains code that could be helpful in implementing different -normalisation front-ends. +This package contains code that mungle data to produce normalized format. + +It contains several assumptions: + +=over + +=item * + +format of fields is defined using C notation for repeatable fields +or C for single (or first) value, where C<123> is field number and +C is subfield. + +=item * + +source data records (C<$rec>) have unique identifiers in field C<000> + +=item * + +optional C tag at B will be +perl code that is evaluated before producing output (value of field will be +interpolated before that) + +=item * + +optional C at B will apply perl +code defined as code ref on format after field substitution to producing +output + +There is one built-in filter called C which can be use like this: + + filter{regex(s/foo/bar/)} + +=item * + +optional C will be then performed. See C. + +=item * + +at end, optional Cs rules are resolved. Format rules are similar to +C and can also contain C which is performed after +values are inserted in format. + +=back + +This also describes order in which transformations are applied (eval, +filter, lookup, format) which is important to undestand when deciding how to +solve your data mungling and normalisation process. + + + =head1 FUNCTIONS -=head2 none_yet +=head2 new + +Create new normalisation object + + my $n = new WebPAC::Normalize::Something( + filter => { + 'filter_name_1' => sub { + # filter code + return length($_); + }, ... + }, + db => $db_obj, + lookup_regex => $lookup->regex, + lookup => $lookup_obj, + prefix => 'foobar', + ); + +Parametar C defines user supplied snippets of perl code which can +be use with C notation. + +C is used to form filename for database record (to support multiple +source files which are joined in one database). + +Recommended parametar C is used to enable parsing of lookups +in structures. If you pass this parametar, you must also pass C +which is C object. + +=cut + +sub new { + my $class = shift; + my $self = {@_}; + bless($self, $class); + + my $r = $self->{'lookup_regex'} ? 1 : 0; + my $l = $self->{'lookup'} ? 1 : 0; + + my $log = $self->_get_logger(); + + # those two must be in pair + if ( ($r & $l) != ($r || $l) ) { + my $log = $self->_get_logger(); + $log->logdie("lookup_regex and lookup must be in pair"); + } + + $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup')); + + $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'}); + + $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l); + + if ($self->{filter} && ! $self->{filter}->{regex}) { + $log->debug("adding built-in filter regex"); + $self->{filter}->{regex} = sub { + my ($val, $regex) = @_; + eval "\$val =~ $regex"; + return $val; + }; + } + + $self ? return $self : return undef; +} + + +=head2 data_structure + +Create in-memory data structure which represents normalized layout from +C. + +This structures are used to produce output. + + my $ds = $webpac->data_structure($rec); + +=cut + +sub data_structure { + my $self = shift; + + my $log = $self->_get_logger(); + + my $rec = shift; + $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); + + $log->debug("data_structure rec = ", sub { Dumper($rec) }); + + $log->logdie("need unique ID (mfn) in field 000 of record ", sub { Dumper($rec) } ) unless (defined($rec->{'000'})); + + my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!"); + + my $cache_file; + + if ($self->{'db'}) { + my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} ); + $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) }); + return $ds if ($ds); + $log->debug("cache miss, creating"); + } + + undef $self->{'currnet_filename'}; + undef $self->{'headline'}; + + my @sorted_tags; + if ($self->{tags_by_order}) { + @sorted_tags = @{$self->{tags_by_order}}; + } else { + @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}}; + $self->{tags_by_order} = \@sorted_tags; + } + + my $ds; + + $log->debug("tags: ",sub { join(", ",@sorted_tags) }); + + foreach my $field (@sorted_tags) { + + my $row; + +#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}); + + foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) { + my $format; + + $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH'); + $format = $tag->{'value'} || $tag->{'content'}; + + $log->debug("format: $format"); + + my @v; + if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) { + @v = $self->fill_in_to_arr($rec,$format); + } else { + @v = $self->parse_to_arr($rec,$format); + } + next if (! @v); + + if ($tag->{'sort'}) { + @v = $self->sort_arr(@v); + } + + # use format? + if ($tag->{'format_name'}) { + @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v; + } + + # delimiter will join repeatable fields + if ($tag->{'delimiter'}) { + @v = ( join($tag->{'delimiter'}, @v) ); + } + + # default types + my @types = qw(display search); + # override by type attribute + @types = ( $tag->{'type'} ) if ($tag->{'type'}); + + foreach my $type (@types) { + # append to previous line? + $log->debug("type: $type ",sub { join(" ",@v) }, " ", $row->{'append'} || 'no append'); + if ($tag->{'append'}) { + + # I will delimit appended part with + # delimiter (or ,) + my $d = $tag->{'delimiter'}; + # default delimiter + $d ||= " "; + + my $last = pop @{$row->{$type}}; + $d = "" if (! $last); + $last .= $d . join($d, @v); + push @{$row->{$type}}, $last; + + } else { + push @{$row->{$type}}, @v; + } + } + + + } + + if ($row) { + $row->{'tag'} = $field; + + # TODO: name_sigular, name_plural + my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'}; + my $row_name = $name ? $self->_x($name) : $field; + + # post-sort all values in field + if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) { + $log->warn("sort at field tag not implemented"); + } + + $ds->{$row_name} = $row; + + $log->debug("row $field: ",sub { Dumper($row) }); + } + + } + + $self->{'db'}->save_ds( + id => $id, + ds => $ds, + prefix => $self->{prefix}, + ) if ($self->{'db'}); + + $log->debug("ds: ", sub { Dumper($ds) }); + + $log->logconfess("data structure returned is not array any more!") if wantarray; + + return $ds; + +} + +=head2 parse + +Perform smart parsing of string, skipping delimiters for fields which aren't +defined. It can also eval code in format starting with C and +return output or nothing depending on eval code. + + my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i); + +Filters are implemented here. While simple form of filters looks like this: + + filter{name_of_filter} + +but, filters can also have variable number of parametars like this: + + filter{name_of_filter(param,param,param)} + +=cut + +sub parse { + my $self = shift; + + my ($rec, $format_utf8, $i) = @_; + + return if (! $format_utf8); + + my $log = $self->_get_logger(); + + $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); + + $i = 0 if (! $i); + + my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'}); + + my @out; + + $log->debug("format: $format"); + + my $eval_code; + # remove eval{...} from beginning + $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s); + + my $filter_name; + # remove filter{...} from beginning + $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s); + + my $prefix; + my $all_found=0; + + while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) { + + my $del = $1 || ''; + $prefix ||= $del if ($all_found == 0); + + # repeatable index + my $r = $i; + $r = 0 if (lc("$2") eq 's'); + + my $found = 0; + my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found); + + if ($found) { + push @out, $del; + push @out, $tmp; + $all_found += $found; + } + } + + return if (! $all_found); + + my $out = join('',@out); + + if ($out) { + # add rest of format (suffix) + $out .= $format; + + # add prefix if not there + $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/); + + $log->debug("result: $out"); + } + + if ($eval_code) { + my $eval = $self->fill_in($rec,$eval_code,$i) || return; + $log->debug("about to eval{$eval} format: $out"); + return if (! $self->_eval($eval)); + } + + if ($filter_name) { + my @filter_args; + if ($filter_name =~ s/(\w+)\((.*)\)/$1/) { + @filter_args = split(/,/, $2); + } + if ($self->{'filter'}->{$filter_name}) { + $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args)); + unshift @filter_args, $out; + $out = $self->{'filter'}->{$filter_name}->(@filter_args); + return unless(defined($out)); + $log->debug("filter result: $out"); + } else { + $log->warn("trying to use undefined filter $filter_name"); + } + } + + return $out; +} + +=head2 parse_to_arr + +Similar to C, but returns array of all repeatable fields + + my @arr = $webpac->parse_to_arr($rec,'v250^a'); =cut -sub none_yet { +sub parse_to_arr { + my $self = shift; + + my ($rec, $format_utf8) = @_; + + my $log = $self->_get_logger(); + + $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); + return if (! $format_utf8); + + my $i = 0; + my @arr; + + while (my $v = $self->parse($rec,$format_utf8,$i++)) { + push @arr, $v; + } + + $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr); + + return @arr; } + +=head2 fill_in + +Workhourse of all: takes record from in-memory structure of database and +strings with placeholders and returns string or array of with substituted +values from record. + + my $text = $webpac->fill_in($rec,'v250^a'); + +Optional argument is ordinal number for repeatable fields. By default, +it's assume to be first repeatable field (fields are perl array, so first +element is 0). +Following example will read second value from repeatable field. + + my $text = $webpac->fill_in($rec,'Title: v250^a',1); + +This function B perform parsing of format to inteligenty skip +delimiters before fields which aren't used. + +This method will automatically decode UTF-8 string to local code page +if needed. + +=cut + +sub fill_in { + my $self = shift; + + my $log = $self->_get_logger(); + + my $rec = shift || $log->logconfess("need data record"); + my $format = shift || $log->logconfess("need format to parse"); + # iteration (for repeatable fields) + my $i = shift || 0; + + $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999)); + + # FIXME remove for speedup? + $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); + + if (utf8::is_utf8($format)) { + $format = $self->_x($format); + } + + my $found = 0; + + my $eval_code; + # remove eval{...} from beginning + $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s); + + my $filter_name; + # remove filter{...} from beginning + $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s); + + # do actual replacement of placeholders + # repeatable fields + $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges; + # non-repeatable fields + $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges; + + if ($found) { + $log->debug("format: $format"); + if ($eval_code) { + my $eval = $self->fill_in($rec,$eval_code,$i); + return if (! $self->_eval($eval)); + } + if ($filter_name && $self->{'filter'}->{$filter_name}) { + $log->debug("filter '$filter_name' for $format"); + $format = $self->{'filter'}->{$filter_name}->($format); + return unless(defined($format)); + $log->debug("filter result: $format"); + } + # do we have lookups? + if ($self->{'lookup'}) { + if ($self->{'lookup'}->can('lookup')) { + my @lookup = $self->{lookup}->lookup($format); + $log->debug("lookup $format", join(", ", @lookup)); + return @lookup; + } else { + $log->warn("Have lookup object but can't invoke lookup method"); + } + } else { + return $format; + } + } else { + return; + } +} + + +=head2 fill_in_to_arr + +Similar to C, but returns array of all repeatable fields. Usable +for fields which have lookups, so they shouldn't be parsed but rather +Ced. + + my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]'); + +=cut + +sub fill_in_to_arr { + my $self = shift; + + my ($rec, $format_utf8) = @_; + + my $log = $self->_get_logger(); + + $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); + return if (! $format_utf8); + + my $i = 0; + my @arr; + + while (my @v = $self->fill_in($rec,$format_utf8,$i++)) { + push @arr, @v; + } + + $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr); + + return @arr; +} + + +=head2 get_data + +Returns value from record. + + my $text = $self->get_data(\$rec,$f,$sf,$i,\$found); + +Arguments are: +record reference C<$rec>, +field C<$f>, +optional subfiled C<$sf>, +index for repeatable values C<$i>. + +Optinal variable C<$found> will be incremeted if there +is field. + +Returns value or empty string. + +=cut + +sub get_data { + my $self = shift; + + my ($rec,$f,$sf,$i,$found) = @_; + + if ($$rec->{$f}) { + return '' if (! $$rec->{$f}->[$i]); + no strict 'refs'; + if ($sf && $$rec->{$f}->[$i]->{$sf}) { + $$found++ if (defined($$found)); + return $$rec->{$f}->[$i]->{$sf}; + } elsif (! $sf && $$rec->{$f}->[$i]) { + $$found++ if (defined($$found)); + # it still might have subfield, just + # not specified, so we'll dump all + if ($$rec->{$f}->[$i] =~ /HASH/o) { + my $out; + foreach my $k (keys %{$$rec->{$f}->[$i]}) { + $out .= $$rec->{$f}->[$i]->{$k}." "; + } + return $out; + } else { + return $$rec->{$f}->[$i]; + } + } else { + return ''; + } + } else { + return ''; + } +} + + +=head2 apply_format + +Apply format specified in tag with C and +C. + + my $text = $webpac->apply_format($format_name,$format_delimiter,$data); + +Formats can contain C if you need them. + +=cut + +sub apply_format { + my $self = shift; + + my ($name,$delimiter,$data) = @_; + + my $log = $self->_get_logger(); + + if (! $self->{'import_xml'}->{'format'}->{$name}) { + $log->warn(" is not defined in ",$self->{'import_xml_file'}); + return $data; + } + + $log->warn("no delimiter for format $name") if (! $delimiter); + + my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'"); + + my @data = split(/\Q$delimiter\E/, $data); + + my $out = sprintf($format, @data); + $log->debug("using format $name [$format] on $data to produce: $out"); + + if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) { + return $self->{'lookup'}->lookup($out); + } else { + return $out; + } + +} + +=head2 sort_arr + +Sort array ignoring case and html in data + + my @sorted = $webpac->sort_arr(@unsorted); + +=cut + +sub sort_arr { + my $self = shift; + + my $log = $self->_get_logger(); + + # FIXME add Schwartzian Transformation? + + my @sorted = sort { + $a =~ s#<[^>]+/*>##; + $b =~ s#<[^>]+/*>##; + lc($b) cmp lc($a) + } @_; + $log->debug("sorted values: ",sub { join(", ",@sorted) }); + + return @sorted; +} + + +=head1 INTERNAL METHODS + +=head2 _sort_by_order + +Sort xml tags data structure accoding to C attribute. + +=cut + +sub _sort_by_order { + my $self = shift; + + my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} || + $self->{'import_xml'}->{'indexer'}->{$a}; + my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} || + $self->{'import_xml'}->{'indexer'}->{$b}; + + return $va <=> $vb; +} + +=head2 _x + +Convert strings from C encoding into application +specific encoding (optinally specified using C to C +constructor). + + my $text = $n->_x('normalize text string'); + +This is a stub so that other modules doesn't have to implement it. + +=cut + +sub _x { + my $self = shift; + return shift; +} + + =head1 AUTHOR Dobrica Pavlinusic, C<< >> @@ -42,4 +709,4 @@ =cut -1; # End of WebPAC::DB +1; # End of WebPAC::Normalize