--- trunk/lib/WebPAC/Normalize.pm 2005/07/16 20:35:30 10 +++ trunk/lib/WebPAC/Normalize.pm 2005/07/16 23:56:14 13 @@ -2,6 +2,7 @@ use warnings; use strict; +use Data::Dumper; =head1 NAME @@ -22,13 +23,501 @@ =head1 FUNCTIONS -=head2 none_yet +=head2 new + +Create new normalisation object + + my $n = new WebPAC::Normalize::Something( + cache_data_structure => './cache/ds/', + lookup_regex => $lookup->regex, + ); + +Optional parameter C defines path to directory +in which cache file for C call will be created. + +Recommended parametar C is used to enable parsing of lookups +in structures. + +=cut + +sub new { + my $class = shift; + my $self = {@_}; + bless($self, $class); + + $self->setup_cache_dir( $self->{'cache_data_structure'} ); + + $self ? return $self : return undef; +} + +=head2 setup_cache_dir + +Check if specified cache directory exist, and if not, disable caching. + + $setup_cache_dir('./cache/ds/'); + +If you pass false or zero value to this function, it will disable +cacheing. =cut -sub none_yet { +sub setup_cache_dir { + my $self = shift; + + my $dir = shift; + + my $log = $self->_get_logger(); + + if ($dir) { + my $msg; + if (! -e $dir) { + $msg = "doesn't exist"; + } elsif (! -d $dir) { + $msg = "is not directory"; + } elsif (! -w $dir) { + $msg = "not writable"; + } + + if ($msg) { + undef $self->{'cache_data_structure'}; + $log->warn("cache_data_structure $dir $msg, disabling..."); + } else { + $log->debug("using cache dir $dir"); + } + } else { + $log->debug("disabling cache"); + undef $self->{'cache_data_structure'}; + } } + +=head2 data_structure + +Create in-memory data structure which represents normalized layout from +C. + +This structures are used to produce output. + + my @ds = $webpac->data_structure($rec); + +B + +This method will also set C<< $webpac->{'currnet_filename'} >> if there is +C<< >> tag and C<< $webpac->{'headline'} >> if there is +C<< >> tag. + +=cut + +sub data_structure { + my $self = shift; + + my $log = $self->_get_logger(); + + my $rec = shift; + $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); + + my $cache_file; + + if (my $cache_path = $self->{'cache_data_structure'}) { + my $id = $rec->{'000'}; + $id = $rec->{'000'}->[0] if ($id =~ m/^ARRAY/o); + unless (defined($id)) { + $log->warn("Can't use cache_data_structure on records without unique identifier in field 000"); + undef $self->{'cache_data_structure'}; + } else { + $cache_file = "$cache_path/$id"; + if (-r $cache_file) { + my $ds_ref = retrieve($cache_file); + if ($ds_ref) { + $log->debug("cache hit: $cache_file"); + my $ok = 1; + foreach my $f (qw(current_filename headline)) { + if ($ds_ref->{$f}) { + $self->{$f} = $ds_ref->{$f}; + } else { + $ok = 0; + } + }; + if ($ok && $ds_ref->{'ds'}) { + return @{ $ds_ref->{'ds'} }; + } else { + $log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!"); + undef $self->{'cache_data_structure'}; + } + } + } + } + } + + undef $self->{'currnet_filename'}; + undef $self->{'headline'}; + + my @sorted_tags; + if ($self->{tags_by_order}) { + @sorted_tags = @{$self->{tags_by_order}}; + } else { + @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}}; + $self->{tags_by_order} = \@sorted_tags; + } + + my @ds; + + $log->debug("tags: ",sub { join(", ",@sorted_tags) }); + + foreach my $field (@sorted_tags) { + + my $row; + +#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}); + + foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) { + my $format = $tag->{'value'} || $tag->{'content'}; + + $log->debug("format: $format"); + + my @v; + if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) { + @v = $self->fill_in_to_arr($rec,$format); + } else { + @v = $self->parse_to_arr($rec,$format); + } + next if (! @v); + + if ($tag->{'sort'}) { + @v = $self->sort_arr(@v); + } + + # use format? + if ($tag->{'format_name'}) { + @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v; + } + + if ($field eq 'filename') { + $self->{'current_filename'} = join('',@v); + $log->debug("filename: ",$self->{'current_filename'}); + } elsif ($field eq 'headline') { + $self->{'headline'} .= join('',@v); + $log->debug("headline: ",$self->{'headline'}); + next; # don't return headline in data_structure! + } + + # delimiter will join repeatable fields + if ($tag->{'delimiter'}) { + @v = ( join($tag->{'delimiter'}, @v) ); + } + + # default types + my @types = qw(display swish); + # override by type attribute + @types = ( $tag->{'type'} ) if ($tag->{'type'}); + + foreach my $type (@types) { + # append to previous line? + $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append'); + if ($tag->{'append'}) { + + # I will delimit appended part with + # delimiter (or ,) + my $d = $tag->{'delimiter'}; + # default delimiter + $d ||= " "; + + my $last = pop @{$row->{$type}}; + $d = "" if (! $last); + $last .= $d . join($d, @v); + push @{$row->{$type}}, $last; + + } else { + push @{$row->{$type}}, @v; + } + } + + + } + + if ($row) { + $row->{'tag'} = $field; + + # TODO: name_sigular, name_plural + my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'}; + $row->{'name'} = $name ? $self->_x($name) : $field; + + # post-sort all values in field + if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) { + $log->warn("sort at field tag not implemented"); + } + + push @ds, $row; + + $log->debug("row $field: ",sub { Dumper($row) }); + } + + } + + if ($cache_file) { + store { + ds => \@ds, + current_filename => $self->{'current_filename'}, + headline => $self->{'headline'}, + }, $cache_file; + $log->debug("created storable cache file $cache_file"); + } + + return @ds; + +} + +=head2 apply_format + +Apply format specified in tag with C and +C. + + my $text = $webpac->apply_format($format_name,$format_delimiter,$data); + +Formats can contain C if you need them. + +=cut + +sub apply_format { + my $self = shift; + + my ($name,$delimiter,$data) = @_; + + my $log = $self->_get_logger(); + + if (! $self->{'import_xml'}->{'format'}->{$name}) { + $log->warn(" is not defined in ",$self->{'import_xml_file'}); + return $data; + } + + $log->warn("no delimiter for format $name") if (! $delimiter); + + my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'"); + + my @data = split(/\Q$delimiter\E/, $data); + + my $out = sprintf($format, @data); + $log->debug("using format $name [$format] on $data to produce: $out"); + + if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) { + return $self->lookup($out); + } else { + return $out; + } + +} + +=head2 parse + +Perform smart parsing of string, skipping delimiters for fields which aren't +defined. It can also eval code in format starting with C and +return output or nothing depending on eval code. + + my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i); + +=cut + +sub parse { + my $self = shift; + + my ($rec, $format_utf8, $i) = @_; + + return if (! $format_utf8); + + my $log = $self->_get_logger(); + + $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); + + $i = 0 if (! $i); + + my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'}); + + my @out; + + $log->debug("format: $format"); + + my $eval_code; + # remove eval{...} from beginning + $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s); + + my $filter_name; + # remove filter{...} from beginning + $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s); + + my $prefix; + my $all_found=0; + + while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) { + + my $del = $1 || ''; + $prefix ||= $del if ($all_found == 0); + + # repeatable index + my $r = $i; + $r = 0 if (lc("$2") eq 's'); + + my $found = 0; + my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found); + + if ($found) { + push @out, $del; + push @out, $tmp; + $all_found += $found; + } + } + + return if (! $all_found); + + my $out = join('',@out); + + if ($out) { + # add rest of format (suffix) + $out .= $format; + + # add prefix if not there + $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/); + + $log->debug("result: $out"); + } + + if ($eval_code) { + my $eval = $self->fill_in($rec,$eval_code,$i) || return; + $log->debug("about to eval{$eval} format: $out"); + return if (! $self->_eval($eval)); + } + + if ($filter_name && $self->{'filter'}->{$filter_name}) { + $log->debug("about to filter{$filter_name} format: $out"); + $out = $self->{'filter'}->{$filter_name}->($out); + return unless(defined($out)); + $log->debug("filter result: $out"); + } + + return $out; +} + +=head2 parse_to_arr + +Similar to C, but returns array of all repeatable fields + + my @arr = $webpac->parse_to_arr($rec,'v250^a'); + +=cut + +sub parse_to_arr { + my $self = shift; + + my ($rec, $format_utf8) = @_; + + my $log = $self->_get_logger(); + + $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); + return if (! $format_utf8); + + my $i = 0; + my @arr; + + while (my $v = $self->parse($rec,$format_utf8,$i++)) { + push @arr, $v; + } + + $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr); + + return @arr; +} + +=head2 fill_in_to_arr + +Similar to C, but returns array of all repeatable fields. Usable +for fields which have lookups, so they shouldn't be parsed but rather +Ced. + + my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]'); + +=cut + +sub fill_in_to_arr { + my $self = shift; + + my ($rec, $format_utf8) = @_; + + my $log = $self->_get_logger(); + + $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); + return if (! $format_utf8); + + my $i = 0; + my @arr; + + while (my @v = $self->fill_in($rec,$format_utf8,$i++)) { + push @arr, @v; + } + + $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr); + + return @arr; +} + +=head2 sort_arr + +Sort array ignoring case and html in data + + my @sorted = $webpac->sort_arr(@unsorted); + +=cut + +sub sort_arr { + my $self = shift; + + my $log = $self->_get_logger(); + + # FIXME add Schwartzian Transformation? + + my @sorted = sort { + $a =~ s#<[^>]+/*>##; + $b =~ s#<[^>]+/*>##; + lc($b) cmp lc($a) + } @_; + $log->debug("sorted values: ",sub { join(", ",@sorted) }); + + return @sorted; +} + + +=head2 _sort_by_order + +Sort xml tags data structure accoding to C attribute. + +=cut + +sub _sort_by_order { + my $self = shift; + + my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} || + $self->{'import_xml'}->{'indexer'}->{$a}; + my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} || + $self->{'import_xml'}->{'indexer'}->{$b}; + + return $va <=> $vb; +} + +=head2 _x + +Convert strings from C encoding into application specific +(optinally specified using C to C constructor. + + my $text = $n->_x('normalize text string'); + +This is a stub so that other modules doesn't have to implement it. + +=cut + +sub _x { + my $self = shift; + return shift; +} + + =head1 AUTHOR Dobrica Pavlinusic, C<< >>