lib/WebPAC/Normalize.pm

package WebPAC::Normalize;

use warnings;
use strict;
use blib;
use WebPAC::Common;
use base 'WebPAC::Common';
use Data::Dumper;

=head1 NAME

WebPAC::Normalize - data munging for normalisation

=head1 VERSION

Version 0.08

=cut

our $VERSION = '0.08';

=head1 SYNOPSIS

This package contains code that munges data to produce a normalized format.

It contains several assumptions:

=over

=item *

format of fields is defined using C<v123^a> notation for repeatable fields
or C<s123^a> for single (or first) value, where C<123> is field number and
C<a> is subfield.

=item *

source data records (C<$rec>) have unique identifiers in field C<000>

=item *

optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
perl code that is evaluated before producing output (value of field will be
interpolated before that)

=item *

optional C<filter{filter_name}> at B<beginning of format> will apply perl
code, defined as a code ref, to the format after field substitution to
produce output

There is one built-in filter called C<regex> which can be used like this:

  filter{regex(s/foo/bar/)}

=item *

optional C<lookup{...}> will then be performed. See C<WebPAC::Lookup>.

=item *

at end, optional C<format>s rules are resolved. Format rules are similar to
C<sprintf> and can also contain C<lookup{...}> which is performed after
values are inserted in format.

=back

This also describes the order in which transformations are applied (eval,
filter, lookup, format), which is important to understand when deciding how
to solve your data munging and normalisation process.


=head1 FUNCTIONS

=head2 new

Create new normalisation object

  my $n = new WebPAC::Normalize::Something(
        filter => {
                'filter_name_1' => sub {
                        # filter code
                        return length($_);
                }, ...
        },
        db => $db_obj,
        lookup_regex => $lookup->regex,
        lookup => $lookup_obj,
        prefix => 'foobar',
  );

Parameter C<filter> defines user-supplied snippets of perl code which can
be used with C<filter{...}> notation.

C<prefix> is used to form filename for database record (to support multiple
source files which are joined in one database).

Recommended parameter C<lookup_regex> is used to enable parsing of lookups
in structures. If you pass this parameter, you must also pass C<lookup>,
which is a C<WebPAC::Lookup> object.

=cut

# Constructor: blesses the named arguments into an object of $class,
# validates the lookup configuration and installs the built-in "regex"
# filter when the caller did not supply one.
#
# Named arguments (stored verbatim in the object hash): filter, db,
# lookup_regex, lookup, prefix.
#
# Dies through the logger's logdie when only one of lookup_regex / lookup
# is given, or when lookup is not a WebPAC::Lookup object.
# Returns the new object.
sub new {
        my $class = shift;
        my $self = {@_};
        bless($self, $class);

        my $log = $self->_get_logger();

        my $have_regex  = $self->{'lookup_regex'} ? 1 : 0;
        my $have_lookup = $self->{'lookup'} ? 1 : 0;

        # those two must be in pair
        $log->logdie("lookup_regex and lookup must be in pair") if ($have_regex != $have_lookup);

        # eval guards the isa call: an unblessed reference would otherwise
        # die with "Can't call method isa" instead of the message below
        $log->logdie("lookup must be WebPAC::Lookup object")
                if ($have_lookup && ! eval { $self->{'lookup'}->isa('WebPAC::Lookup') });

        $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});

        $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($have_regex && $have_lookup);

        if (! $self->{filter} || ! $self->{filter}->{regex}) {
                $log->debug("adding built-in filter regex");
                $self->{filter}->{regex} = sub {
                        my ($val, $regex) = @_;
                        # NOTE(security): $regex is executed with string eval,
                        # so it must only ever come from trusted configuration
                        eval "\$val =~ $regex";
                        # report a broken expression instead of silently
                        # returning the unmodified value
                        $log->warn("built-in filter regex failed: $@") if ($@);
                        return $val;
                };
        }

        return $self;
}


=head2 data_structure

Create in-memory data structure which represents normalized layout from
C<conf/normalize/*.xml>.

These structures are used to produce output.

 my $ds = $webpac->data_structure($rec);

=cut

# Build the normalized data structure for one record, or load it from the
# db object's cache when one is configured.
#
# $rec - HASH ref of the source record; field '000' must be an array whose
#        first element is the unique record id.
#
# For every indexer field (ordered by _sort_by_order) each tag's format is
# expanded with fill_in_to_arr (when it matches lookup_regex) or
# parse_to_arr, then optionally sorted, formatted and joined, and stored
# under the tag's type(s) -- 'display' and 'search' by default.
#
# Returns a HASH ref keyed by row name. Dies through the logger when $rec
# is not a hash, when field 000 is missing, or when called in list context.
sub data_structure {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift;
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        $log->debug("data_structure rec = ", sub { Dumper($rec) });

        $log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));

        my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");

        # a cached structure short-circuits all the work below
        if ($self->{'db'}) {
                my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
                $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
                return $ds if ($ds);
                $log->debug("cache miss, creating");
        }

        # tag order is computed once and remembered on the object
        my @sorted_tags;
        if ($self->{tags_by_order}) {
                @sorted_tags = @{$self->{tags_by_order}};
        } else {
                @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
                $self->{tags_by_order} = \@sorted_tags;
        }

        my $ds;

        $log->debug("tags: ",sub { join(", ",@sorted_tags) });

        foreach my $field (@sorted_tags) {

                my $row;

#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});

                foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
                        my $format;

                        $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
                        $format = $tag->{'value'} || $tag->{'content'};

                        # formats containing lookups must be filled in, not parsed
                        my @v;
                        if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
                                @v = $self->fill_in_to_arr($rec,$format);
                        } else {
                                @v = $self->parse_to_arr($rec,$format);
                        }
                        if (! @v) {
                                $log->debug("$field <",$self->{tag},"> format: $format no values");
#                               next;
                        } else {
                                # values are joined lazily, like the other debug calls
                                $log->debug("$field <",$self->{tag},"> format: $format values: ", sub { join(",", @v) });
                        }

                        if ($tag->{'sort'}) {
                                @v = $self->sort_arr(@v);
                        }

                        # use format?
                        if ($tag->{'format_name'}) {
                                @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
                        }

                        # delimiter will join repeatable fields
                        if ($tag->{'delimiter'}) {
                                @v = ( join($tag->{'delimiter'}, @v) );
                        }

                        # default types 
                        my @types = qw(display search);
                        # override by type attribute
                        @types = ( $tag->{'type'} ) if ($tag->{'type'});

                        foreach my $type (@types) {
                                # append to previous line?
                                # (the flag lives on the tag, not on the row being built)
                                $log->debug("tag $field / $type [",sub { join(",",@v) }, "] ", $tag->{'append'} || 'no append');
                                if ($tag->{'append'}) {

                                        # I will delimit appended part with
                                        # delimiter (or ,)
                                        my $d = $tag->{'delimiter'};
                                        # default delimiter
                                        $d ||= " ";

                                        my $last = pop @{$row->{$type}};
                                        # nothing to append to: don't start with a delimiter
                                        $d = "" if (! $last);
                                        $last .= $d . join($d, @v);
                                        push @{$row->{$type}}, $last;

                                } else {
                                        push @{$row->{$type}}, @v;
                                }
                        }


                }

                if ($row) {
                        $row->{'tag'} = $field;

                        # TODO: name_sigular, name_plural
                        my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
                        my $row_name = $name ? $self->_x($name) : $field;

                        # post-sort all values in field
                        if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
                                $log->warn("sort at field tag not implemented");
                        }

                        $ds->{$row_name} = $row;

                        $log->debug("row $field: ",sub { Dumper($row) });
                }

        }

        $self->{'db'}->save_ds(
                id => $id,
                ds => $ds,
                prefix => $self->{prefix},
        ) if ($self->{'db'});

        $log->debug("ds: ", sub { Dumper($ds) });

        $log->logconfess("data structure returned is not array any more!") if wantarray;

        return $ds;

}

=head2 parse

Perform smart parsing of string, skipping delimiters for fields which aren't
defined. It can also eval code in format starting with C<eval{...}> and
return output or nothing depending on eval code.

 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);

Filters are implemented here. The simple form of a filter looks like this:

  filter{name_of_filter}

but filters can also take a variable number of parameters, like this:

  filter{name_of_filter(param,param,param)}

=cut

# names of undefined filters already reported, so each is warned about once
my $warn_once;

# Expand one occurrence of a format against a record, dropping delimiters
# that precede fields which are not present.
#
# $rec         - HASH ref of the record
# $format_utf8 - format string; may start with eval{...} and/or filter{...}
# $i           - occurrence index for repeatable (v) fields, default 0
#
# Returns the expanded string, or nothing when the format is empty, no
# field was found, the eval code is false, or the filter returns undef.
sub parse {
        my $self = shift;

        my ($rec, $format_utf8, $i) = @_;

        return if (! $format_utf8);

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        $i = 0 if (! $i);

        my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});

        my @out;

        $log->debug("format: $format [$i]");

        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

        # did we found any (att all) field from format in row?
        my $found_any = {};
        # prefix before first field which we preserve it $found_any
        my $prefix;

        my $f_step = 1;

        while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {

                # copy the captures at once so later statements cannot
                # clobber them; $1 is always defined here, so a literal
                # "0" delimiter is kept as well
                my ($del, $fld_type, $fld, $sf) = ($1, lc($2), $3, $4);
                $prefix = $del if ($f_step == 1);

                # repeatable index
                my $r = $i;
                if ($fld_type eq 's') {
                        # single-value fields are only used once some
                        # repeatable field was found for this occurrence
                        if ($found_any->{'v'}) {
                                $r = 0;
                        } else {
                                return;
                        }
                }

                my $found = 0;
                my $tmp = $self->get_data(\$rec,$fld,$sf,$r,\$found);

                if ($found) {
                        $found_any->{$fld_type} += $found;

                        # we will skip delimiter before first occurence of field!
                        push @out, $del unless($found_any->{$fld_type} == 1);
                        push @out, $tmp;
                }
                $f_step++;
        }

        # test if any fields found?
        return if (! $found_any->{'v'} && ! $found_any->{'s'});

        my $out = join('',@out);

        if ($out) {
                # add rest of format (suffix)
                $out .= $format;

                # add prefix if not there
                $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);

                $log->debug("result: $out");
        }

        if ($eval_code) {
                my $eval = $self->fill_in($rec,$eval_code,$i) || return;
                $log->debug("about to eval{$eval} format: $out");
                return if (! $self->_eval($eval));
        }
        
        if ($filter_name) {
                # split "name(arg,arg)" into the filter name and its arguments
                my @filter_args;
                if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
                        @filter_args = split(/,/, $2);
                }
                if ($self->{'filter'}->{$filter_name}) {
                        $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
                        unshift @filter_args, $out;
                        $out = $self->{'filter'}->{$filter_name}->(@filter_args);
                        return unless(defined($out));
                        $log->debug("filter result: $out");
                } elsif (! $warn_once->{$filter_name}) {
                        $log->warn("trying to use undefined filter $filter_name");
                        $warn_once->{$filter_name}++;
                }
        }

        return $out;
}

=head2 parse_to_arr

Similar to C<parse>, but returns array of all repeatable fields

 my @arr = $webpac->parse_to_arr($rec,'v250^a');

=cut

# Collect parse() results for occurrence 0, 1, 2, ... of a format and stop
# at the first occurrence for which parse() yields a false value.
#
# $rec         - HASH ref of the record
# $format_utf8 - format string handed to parse()
#
# Returns the list of collected values (empty for an empty format).
sub parse_to_arr {
        my ($self, $rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        if ($rec !~ /HASH/o) {
                $log->logconfess("need HASH as first argument!");
        }
        return unless ($format_utf8);

        my @values;
        for (my $idx = 0; ; $idx++) {
                my $value = $self->parse($rec, $format_utf8, $idx);
                last unless ($value);
                push @values, $value;
        }

        if (@values) {
                $log->debug("format '$format_utf8' returned ", scalar(@values), " elements: ", sub { join(" | ",@values) });
        }

        return @values;
}


=head2 fill_in

Workhorse of all: takes a record from the in-memory structure of the database
and a string with placeholders, and returns a string (or an array, when
lookups are performed) with values from the record substituted.

 my $text = $webpac->fill_in($rec,'v250^a');

Optional argument is the ordinal number for repeatable fields. By default,
it's assumed to be the first repeatable field (fields are perl arrays, so the
first element is 0).
The following example will read the second value from a repeatable field.

 my $text = $webpac->fill_in($rec,'Title: v250^a',1);

This function B<does not> perform parsing of the format to intelligently skip
delimiters before fields which aren't used.

This method will automatically decode UTF-8 string to local code page
if needed.

=cut

# Substitute every vNNN^x / sNNN^x placeholder in a format with data from
# the record, then apply the optional eval, filter and lookup steps.
#
# $rec    - HASH ref of the record (required)
# $format - format string (required); may start with eval{...} and/or
#           filter{...}, where the filter may take arguments as in
#           filter{name(arg,arg)}, matching what parse() accepts
# $i      - occurrence index for repeatable fields, default 0
#
# Returns nothing when no field was found, when the eval code is false or
# when the filter returns undef. Otherwise returns the list produced by the
# lookup object when one is configured, or the substituted string.
# NOTE: with a lookup configured, a scalar-context call yields the number
# of lookup results, so call this in list context.
sub fill_in {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift || $log->logconfess("need data record");
        my $format = shift || $log->logconfess("need format to parse");
        # iteration (for repeatable fields)
        my $i = shift || 0;

        $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));

        # FIXME remove for speedup?
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        if (utf8::is_utf8($format)) {
                $format = $self->_x($format);
        }

        my $found = 0;
        my $just_single = 1;

        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

        # do actual replacement of placeholders
        # repeatable fields
        if ($format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges) {
                $just_single = 0;
        }

        # non-repeatable fields
        if ($format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges) {
                # single-value-only formats produce output for occurrence 0 only
                return if ($i > 0 && $just_single);
        }

        return unless ($found);

        $log->debug("format: $format");

        if ($eval_code) {
                my $eval = $self->fill_in($rec,$eval_code,$i);
                return if (! $self->_eval($eval));
        }

        if ($filter_name) {
                # split "name(arg,arg)" into name and arguments, like parse()
                my @filter_args;
                if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
                        @filter_args = split(/,/, $2);
                }
                if ($self->{'filter'}->{$filter_name}) {
                        $log->debug("filter '$filter_name' for $format");
                        $format = $self->{'filter'}->{$filter_name}->($format, @filter_args);
                        return unless(defined($format));
                        $log->debug("filter result: $format");
                }
        }

        # no lookups configured: the substituted format is the result
        return $format unless ($self->{'lookup'});

        if ($self->{'lookup'}->can('lookup')) {
                my @lookup = $self->{lookup}->lookup($format);
                $log->debug("lookup $format", join(", ", @lookup));
                return @lookup;
        }

        # explicit empty return: the result must not depend on whatever
        # the logger's warn method happens to return
        $log->warn("Have lookup object but can't invoke lookup method");
        return;
}


=head2 fill_in_to_arr

Similar to C<fill_in>, but returns an array of all repeatable fields. Usable
for fields which have lookups, so they shouldn't be parsed but rather
C<fill_in>ed.

 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');

=cut

# Collect fill_in() results for occurrence 0, 1, 2, ... of a format.
#
# $rec         - HASH ref of the record
# $format_utf8 - format string handed to fill_in()
#
# fill_in() is called in list context: with lookups it returns a list, and
# a scalar-context call would store the number of results instead of the
# results themselves. Iteration stops when fill_in() returns nothing, or a
# single false value (the same stop condition the scalar test had for
# formats without lookups).
#
# Returns the list of all collected values.
sub fill_in_to_arr {
        my $self = shift;

        my ($rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
        return if (! $format_utf8);

        my $i = 0;
        my @arr;

        while (1) {
                my @v = $self->fill_in($rec,$format_utf8,$i++);
                last unless (@v);
                last if (@v == 1 && ! $v[0]);
                push @arr, @v;
        }

        $log->debug("format '$format_utf8' returned ",scalar(@arr)," elements: ", sub { join(" | ",@arr) }) if (@arr);

        return @arr;
}


=head2 get_data

Returns value from record.

 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found,\$fld_occurances);

Required arguments are:

=over 8

=item C<$rec>

record reference

=item C<$f>

field

=item C<$sf>

optional subfield

=item C<$i>

index offset for repeatable values ( 0 ... $#occurances )

=item C<$found>

optional variable that will be incremented if present

=item C<$fld_occurances>

hash to hold maximum occurances of C<field\tsubfield> combinations
(which can be accessed using keys in same format)

=back

Returns value or empty string, updates C<$found> and C<$fld_occurances>
if present.

=cut

# Fetch one value (field $f, optional subfield $sf, occurrence $i) from a
# record.
#
# $rec   - reference to the record HASH ref
# $f     - field name
# $sf    - optional subfield name
# $i     - occurrence index within the field array
# $found - optional reference to a counter; incremented when a value is
#          returned (with a subfield: only when that value is true)
# $cache - optional reference to a HASH ref; receives the highest
#          occurrence index of the field under the key "$f\t$sf"
#
# Returns '' when the field is not an array or the occurrence is false.
# With a subfield, returns that subfield's value (undef when absent).
# Without one, returns the value itself, or for a hash occurrence a debug
# dump of the form '$key:value ' for every subfield, in sorted key order.
sub get_data {
        my $self = shift;

        my ($rec,$f,$sf,$i,$found,$cache) = @_;

        my $occurrences = $$rec->{$f};
        return '' unless ($occurrences && ref($occurrences) eq 'ARRAY');

        if ($cache && defined($$cache)) {
                # the previous expression "$$#rec->{$f}" did not yield the
                # last index of the field array; an undefined subfield is
                # spelled as '' so the key is built without warnings
                my $key = $f . "\t" . (defined($sf) ? $sf : '');
                $$cache->{$key} ||= $#{$occurrences};
        }

        my $value = $occurrences->[$i];
        return '' unless ($value);

        if (defined($sf)) {
                # a plain scalar occurrence has no subfields
                my $sub_value = ref($value) ? $value->{$sf} : undef;
                $$found++ if ($found && defined($$found) && $sub_value);
                return $sub_value;
        }

        $$found++ if ($found && defined($$found));

        # it still might have subfields, just not specified, so we'll dump
        # some debug info; only real references qualify, a string that
        # merely contains the text "HASH" is ordinary data
        if (ref($value) && UNIVERSAL::isa($value, 'HASH')) {
                my $out;
                foreach my $k (sort keys %{$value}) {
                        $out .= '$' . $k .':' . $value->{$k}." ";
                }
                return $out;
        }

        return $value;
}


=head2 apply_format

Apply format specified in tag with C<format_name="name"> and
C<format_delimiter=";;">.

 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);

Formats can contain C<lookup{...}> if you need them.

=cut

# Apply the sprintf-style format named $name (from import_xml) to $data.
#
# $name      - name of the format
# $delimiter - string on which $data is split into sprintf arguments
# $data      - value to format
#
# When the format is not defined, a warning is logged and $data is
# returned unchanged. When no delimiter is given, a warning is logged and
# $data is passed to sprintf as a single argument; splitting on an empty
# pattern would break it into single characters.
#
# Returns the formatted string, or the result of the lookup object's
# lookup method when the formatted string matches lookup_regex.
sub apply_format {
        my $self = shift;

        my ($name,$delimiter,$data) = @_;

        my $log = $self->_get_logger();

        if (! $self->{'import_xml'}->{'format'}->{$name}) {
                $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
                return $data;
        }

        my $have_delimiter = (defined($delimiter) && length($delimiter)) ? 1 : 0;
        $log->warn("no delimiter for format $name") if (! $have_delimiter);

        my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");

        my @data = $have_delimiter ? split(/\Q$delimiter\E/, $data) : ($data);

        my $out = sprintf($format, @data);
        $log->debug("using format $name [$format] on $data to produce: $out");

        if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
                return $self->{'lookup'}->lookup($out);
        } else {
                return $out;
        }

}

=head2 sort_arr

Sort array ignoring case and html in data

 my @sorted = $webpac->sort_arr(@unsorted);

=cut

# Sort values ignoring case and HTML tags.
#
# Sort keys are computed once per value (lower-cased, all tags removed)
# and the values themselves are returned untouched. Running s/// on $a and
# $b inside the comparator would modify the caller's data, because they
# alias the list elements, and would change the key between comparisons.
#
# NOTE(review): order is descending (b cmp a), as in the previous
# implementation -- confirm that this is intended.
#
# Returns the sorted list.
sub sort_arr {
        my $self = shift;

        my $log = $self->_get_logger();

        my @sorted =
                map  { $_->[1] }
                sort { $b->[0] cmp $a->[0] }
                map  {
                        my $key = $_;
                        $key =~ s#<[^>]+/*>##g;
                        [ lc($key), $_ ];
                } @_;

        $log->debug("sorted values: ",sub { join(", ",@sorted) });

        return @sorted;
}


=head1 INTERNAL METHODS

=head2 _sort_by_order

Sort xml tags data structure according to C<order=""> attribute.

=cut

# Comparator for sort(): numerically compares the indexer entries named by
# the package variables $a and $b. Each side uses the entry's 'order'
# value, falling back to the entry itself when 'order' is false.
sub _sort_by_order {
        my ($self) = @_;

        my $indexer = $self->{'import_xml'}->{'indexer'};

        my $left = $indexer->{$a}->{'order'};
        $left = $indexer->{$a} unless ($left);

        my $right = $indexer->{$b}->{'order'};
        $right = $indexer->{$b} unless ($right);

        return $left <=> $right;
}

=head2 _x

Convert strings from C<conf/normalize/*.xml> encoding into application
specific encoding (optionally specified using C<code_page> to C<new>
constructor).

 my $text = $n->_x('normalize text string');

This is a stub so that other modules don't have to implement it.

=cut

# Encoding-conversion stub: hands back its argument unchanged.
sub _x {
        my ($self, $text) = @_;
        return $text;
}


=head1 AUTHOR

Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>

=head1 COPYRIGHT & LICENSE

Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

1; # End of WebPAC::Normalize
1	dpavlin	10	package WebPAC::Normalize;
2
3			use warnings;
4			use strict;
5	dpavlin	368	use blib;
6			use WebPAC::Common;
7	dpavlin	29	use base 'WebPAC::Common';
8	dpavlin	13	use Data::Dumper;
9	dpavlin	10
10			=head1 NAME
11
12	dpavlin	15	WebPAC::Normalize - data mungling for normalisation
13	dpavlin	10
14			=head1 VERSION
15
16	dpavlin	317	Version 0.08
17	dpavlin	10
18			=cut
19
20	dpavlin	317	our $VERSION = '0.08';
21	dpavlin	10
22			=head1 SYNOPSIS
23
24	dpavlin	15	This package contains code that mungle data to produce normalized format.
25	dpavlin	10
26	dpavlin	15	It contains several assumptions:
27
28			=over
29
30			=item *
31
32			format of fields is defined using C<v123^a> notation for repeatable fields
33			or C<s123^a> for single (or first) value, where C<123> is field number and
34			C<a> is subfield.
35
36			=item *
37
38			source data records (C<$rec>) have unique identifiers in field C<000>
39
40			=item *
41
42			optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
43			perl code that is evaluated before producing output (value of field will be
44			interpolated before that)
45
46			=item *
47
48			optional C<filter{filter_name}> at B<begining of format> will apply perl
49			code defined as code ref on format after field substitution to producing
50			output
51
52	dpavlin	260	There is one built-in filter called C<regex> which can be use like this:
53
54			filter{regex(s/foo/bar/)}
55
56	dpavlin	15	=item *
57
58			optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
59
60			=item *
61
62			at end, optional C<format>s rules are resolved. Format rules are similar to
63			C<sprintf> and can also contain C<lookup{...}> which is performed after
64			values are inserted in format.
65
66			=back
67
68			This also describes order in which transformations are applied (eval,
69			filter, lookup, format) which is important to undestand when deciding how to
70			solve your data mungling and normalisation process.
71
72
73
74
75	dpavlin	10	=head1 FUNCTIONS
76
77	dpavlin	13	=head2 new
78	dpavlin	10
79	dpavlin	13	Create new normalisation object
80
81			my $n = new WebPAC::Normalize::Something(
82	dpavlin	15	filter => {
83			'filter_name_1' => sub {
84			# filter code
85			return length($_);
86			}, ...
87			},
88	dpavlin	29	db => $db_obj,
89	dpavlin	13	lookup_regex => $lookup->regex,
90	dpavlin	31	lookup => $lookup_obj,
91	dpavlin	219	prefix => 'foobar',
92	dpavlin	13	);
93
94	dpavlin	15	Parametar C<filter> defines user supplied snippets of perl code which can
95			be use with C<filter{...}> notation.
96
97	dpavlin	219	C<prefix> is used to form filename for database record (to support multiple
98			source files which are joined in one database).
99
100	dpavlin	13	Recommended parametar C<lookup_regex> is used to enable parsing of lookups
101	dpavlin	31	in structures. If you pass this parametar, you must also pass C<lookup>
102			which is C<WebPAC::Lookup> object.
103	dpavlin	13
104	dpavlin	10	=cut
105
106	dpavlin	13	sub new {
107			my $class = shift;
108			my $self = {@_};
109			bless($self, $class);
110
111	dpavlin	31	my $r = $self->{'lookup_regex'} ? 1 : 0;
112			my $l = $self->{'lookup'} ? 1 : 0;
113
114			my $log = $self->_get_logger();
115
116			# those two must be in pair
117			if ( ($r & $l) != ($r \|\| $l) ) {
118			my $log = $self->_get_logger();
119			$log->logdie("lookup_regex and lookup must be in pair");
120			}
121
122			$log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
123
124	dpavlin	219	$log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
125
126	dpavlin	252	$log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
127
128	dpavlin	295	if (! $self->{filter} \|\| ! $self->{filter}->{regex}) {
129	dpavlin	260	$log->debug("adding built-in filter regex");
130			$self->{filter}->{regex} = sub {
131			my ($val, $regex) = @_;
132			eval "\$val =~ $regex";
133			return $val;
134			};
135			}
136
137	dpavlin	13	$self ? return $self : return undef;
138	dpavlin	10	}
139
140	dpavlin	13
141			=head2 data_structure
142
143			Create in-memory data structure which represents normalized layout from
144			C<conf/normalize/*.xml>.
145
146			This structures are used to produce output.
147
148	dpavlin	70	my $ds = $webpac->data_structure($rec);
149	dpavlin	13
150			=cut
151
152			sub data_structure {
153			my $self = shift;
154
155			my $log = $self->_get_logger();
156
157			my $rec = shift;
158			$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
159
160	dpavlin	125	$log->debug("data_structure rec = ", sub { Dumper($rec) });
161
162	dpavlin	312	$log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));
163	dpavlin	125
164	dpavlin	219	my $id = $rec->{'000'}->[0] \|\| $log->logdie("field 000 isn't array!");
165	dpavlin	125
166	dpavlin	13	my $cache_file;
167
168	dpavlin	18	if ($self->{'db'}) {
169	dpavlin	219	my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
170	dpavlin	70	$log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
171			return $ds if ($ds);
172	dpavlin	29	$log->debug("cache miss, creating");
173	dpavlin	13	}
174
175			my @sorted_tags;
176			if ($self->{tags_by_order}) {
177			@sorted_tags = @{$self->{tags_by_order}};
178			} else {
179			@sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
180			$self->{tags_by_order} = \@sorted_tags;
181			}
182
183	dpavlin	70	my $ds;
184	dpavlin	13
185			$log->debug("tags: ",sub { join(", ",@sorted_tags) });
186
187			foreach my $field (@sorted_tags) {
188
189			my $row;
190
191			#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
192
193			foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
194	dpavlin	38	my $format;
195	dpavlin	13
196	dpavlin	38	$log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
197			$format = $tag->{'value'} \|\| $tag->{'content'};
198
199	dpavlin	13	my @v;
200			if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
201			@v = $self->fill_in_to_arr($rec,$format);
202			} else {
203			@v = $self->parse_to_arr($rec,$format);
204			}
205	dpavlin	364	if (! @v) {
206			$log->debug("$field <",$self->{tag},"> format: $format no values");
207			# next;
208			} else {
209			$log->debug("$field <",$self->{tag},"> format: $format values: ", join(",", @v));
210			}
211	dpavlin	13
212			if ($tag->{'sort'}) {
213			@v = $self->sort_arr(@v);
214			}
215
216			# use format?
217			if ($tag->{'format_name'}) {
218			@v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
219			}
220
221			# delimiter will join repeatable fields
222			if ($tag->{'delimiter'}) {
223			@v = ( join($tag->{'delimiter'}, @v) );
224			}
225
226			# default types
227	dpavlin	74	my @types = qw(display search);
228	dpavlin	13	# override by type attribute
229			@types = ( $tag->{'type'} ) if ($tag->{'type'});
230
231			foreach my $type (@types) {
232			# append to previous line?
233	dpavlin	364	$log->debug("tag $field / $type [",sub { join(",",@v) }, "] ", $row->{'append'} \|\| 'no append');
234	dpavlin	13	if ($tag->{'append'}) {
235
236			# I will delimit appended part with
237			# delimiter (or ,)
238			my $d = $tag->{'delimiter'};
239			# default delimiter
240			$d \|\|= " ";
241
242			my $last = pop @{$row->{$type}};
243			$d = "" if (! $last);
244			$last .= $d . join($d, @v);
245			push @{$row->{$type}}, $last;
246
247			} else {
248			push @{$row->{$type}}, @v;
249			}
250			}
251
252
253			}
254
255			if ($row) {
256			$row->{'tag'} = $field;
257
258			# TODO: name_sigular, name_plural
259			my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
260	dpavlin	70	my $row_name = $name ? $self->_x($name) : $field;
261	dpavlin	13
262			# post-sort all values in field
263			if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
264			$log->warn("sort at field tag not implemented");
265			}
266
267	dpavlin	70	$ds->{$row_name} = $row;
268	dpavlin	13
269			$log->debug("row $field: ",sub { Dumper($row) });
270			}
271
272			}
273
274	dpavlin	22	$self->{'db'}->save_ds(
275	dpavlin	219	id => $id,
276	dpavlin	70	ds => $ds,
277	dpavlin	219	prefix => $self->{prefix},
278	dpavlin	18	) if ($self->{'db'});
279	dpavlin	13
280	dpavlin	70	$log->debug("ds: ", sub { Dumper($ds) });
281	dpavlin	29
282	dpavlin	70	$log->logconfess("data structure returned is not array any more!") if wantarray;
283	dpavlin	13
284	dpavlin	70	return $ds;
285
286	dpavlin	13	}
287
288			=head2 parse
289
290			Perform smart parsing of string, skipping delimiters for fields which aren't
291			defined. It can also eval code in format starting with C<eval{...}> and
292			return output or nothing depending on eval code.
293
294			my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
295
296	dpavlin	260	Filters are implemented here. While simple form of filters looks like this:
297
298			filter{name_of_filter}
299
300			but, filters can also have variable number of parametars like this:
301
302			filter{name_of_filter(param,param,param)}
303
304	dpavlin	13	=cut
305
# Names of filters already reported as undefined, so that every missing
# filter is logged only once.
my $warn_once;

# parse($rec, $format_utf8, $i)
#
# Fill a format string from record $rec (hash ref) for the $i-th occurrence
# of repeatable fields, dropping the delimiter text in front of fields that
# have no value.
#
# Returns the resulting string, or nothing when the format is empty, no
# field of the format was found, a leading eval{...} evaluated to false,
# or a filter returned undef.
sub parse {
	my $self = shift;

	my ($rec, $format_utf8, $i) = @_;

	return if (! $format_utf8);

	my $log = $self->_get_logger();

	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/);

	$i = 0 if (! $i);

	my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});

	my @out;

	$log->debug("format: $format [$i]");

	# remove eval{...} from beginning
	# (braces are escaped: a literal unescaped "{" in a pattern is
	# deprecated/rejected by newer perls)
	my $eval_code;
	$eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

	# remove filter{...} from beginning
	my $filter_name;
	$filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

	# number of values found so far, per field type ('v' or 's')
	my $found_any = {};
	# text before the first field; re-added in front of the output below
	my $prefix;

	my $f_step = 1;

	# consume the format one field placeholder at a time
	while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {

		# copy captures right away, before anything can clobber them
		my ($del, $fld_type, $fld, $sf) = ($1 || '', $2, $3, $4);

		$prefix = $del if ($f_step == 1);

		# repeatable index
		my $r = $i;
		if ($fld_type eq 's') {
			# single-value fields always read occurrence 0 and are
			# only used once some repeatable field produced a value
			return unless ($found_any->{'v'});
			$r = 0;
		}

		my $found = 0;
		my $tmp = $self->get_data(\$rec,$fld,$sf,$r,\$found);

		if ($found) {
			$found_any->{$fld_type} += $found;

			# we will skip delimiter before first occurence of field!
			push @out, $del unless ($found_any->{$fld_type} == 1);
			push @out, $tmp;
		}
		$f_step++;
	}

	# test if any fields found?
	return if (! $found_any->{'v'} && ! $found_any->{'s'});

	my $out = join('',@out);

	if ($out) {
		# add rest of format (suffix)
		$out .= $format;

		# add prefix if not there
		$out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);

		$log->debug("result: $out");
	}

	if ($eval_code) {
		my $eval = $self->fill_in($rec,$eval_code,$i) || return;
		$log->debug("about to eval{$eval} format: $out");
		return if (! $self->_eval($eval));
	}

	if ($filter_name) {
		# filter{name(arg,arg,...)}: split off the optional arguments
		my @filter_args;
		if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
			@filter_args = split(/,/, $2);
		}
		if ($self->{'filter'}->{$filter_name}) {
			$log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
			# filter receives current output followed by its arguments
			unshift @filter_args, $out;
			$out = $self->{'filter'}->{$filter_name}->(@filter_args);
			return unless (defined($out));
			$log->debug("filter result: $out");
		} elsif (! $warn_once->{$filter_name}) {
			$log->warn("trying to use undefined filter $filter_name");
			$warn_once->{$filter_name}++;
		}
	}

	return $out;
}
412
=head2 parse_to_arr

Similar to C<parse>, but returns array of all repeatable fields

  my @arr = $webpac->parse_to_arr($rec,'v250^a');

=cut
420
# parse_to_arr($rec, $format_utf8)
#
# Call parse() for occurrence 0, 1, 2, ... of the repeatable fields and
# collect the results until parse() returns a false value.
#
# Returns the list of collected strings (empty when the format is empty).
sub parse_to_arr {
	my $self = shift;

	my ($rec, $format_utf8) = @_;

	my $log = $self->_get_logger();

	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/);
	return if (! $format_utf8);

	my $i = 0;
	my @arr;

	# a false result (nothing, '' or 0) ends the iteration
	while (my $v = $self->parse($rec,$format_utf8,$i++)) {
		push @arr, $v;
	}

	$log->debug("format '$format_utf8' returned ",scalar(@arr)," elements: ", sub { join(" | ",@arr) }) if (@arr);

	return @arr;
}
442
443	dpavlin	15
=head2 fill_in

Workhorse of all: takes record from in-memory structure of database and
strings with placeholders and returns string or array with substituted
values from record.

  my $text = $webpac->fill_in($rec,'v250^a');

Optional argument is ordinal number for repeatable fields. By default,
it's assumed to be first repeatable field (fields are perl array, so first
element is 0).
Following example will read second value from repeatable field.

  my $text = $webpac->fill_in($rec,'Title: v250^a',1);

This function B<does not> perform parsing of format to intelligently skip
delimiters before fields which aren't used.

This method will automatically decode UTF-8 string to local code page
if needed.

=cut
466
# fill_in($rec, $format, $i)
#
# Replace every vNNN^x / sNNN^x placeholder in $format with the value from
# record $rec (hash ref); $i selects the occurrence of repeatable (v)
# fields, single (s) fields always use occurrence 0.
#
# Returns nothing when no placeholder produced a value, when a leading
# eval{...} evaluated to false or when a filter returned undef. With a
# usable lookup object the list returned by its lookup() method is
# returned, otherwise the filled-in string.
sub fill_in {
	my $self = shift;

	my $log = $self->_get_logger();

	my $rec = shift || $log->logconfess("need data record");
	my $format = shift || $log->logconfess("need format to parse");
	# iteration (for repeatable fields)
	my $i = shift || 0;

	$log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));

	# FIXME remove for speedup?
	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/);

	if (utf8::is_utf8($format)) {
		$format = $self->_x($format);
	}

	my $found = 0;
	my $just_single = 1;

	# remove eval{...} from beginning
	# (braces are escaped: a literal unescaped "{" in a pattern is
	# deprecated/rejected by newer perls)
	my $eval_code;
	$eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

	# remove filter{...} from beginning
	my $filter_name;
	$filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

	# do actual replacement of placeholders
	# repeatable fields
	if ($format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges) {
		$just_single = 0;
	}

	# non-repeatable fields
	if ($format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges) {
		# a format made only of single fields has exactly one value
		return if ($i > 0 && $just_single);
	}

	return unless ($found);

	$log->debug("format: $format");

	if ($eval_code) {
		my $eval = $self->fill_in($rec,$eval_code,$i);
		return if (! $self->_eval($eval));
	}

	if ($filter_name && $self->{'filter'}->{$filter_name}) {
		$log->debug("filter '$filter_name' for $format");
		$format = $self->{'filter'}->{$filter_name}->($format);
		return unless (defined($format));
		$log->debug("filter result: $format");
	}

	# no lookups configured: the filled-in format is the result
	return $format unless ($self->{'lookup'});

	if ($self->{'lookup'}->can('lookup')) {
		my @lookup = $self->{lookup}->lookup($format);
		$log->debug("lookup $format", join(", ", @lookup));
		return @lookup;
	}

	# Unusable lookup object: return nothing explicitly instead of
	# leaking the return value of the logger call to the caller.
	$log->warn("Have lookup object but can't invoke lookup method");
	return;
}
536
537
=head2 fill_in_to_arr

Similar to C<fill_in>, but returns array of all repeatable fields. Usable
for fields which have lookups, so they shouldn't be parsed but rather
C<fill_in>ed.

  my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');

=cut
547
# fill_in_to_arr($rec, $format_utf8)
#
# Call fill_in() for occurrence 0, 1, 2, ... of the repeatable fields and
# collect the results until fill_in() returns a false value.
#
# Returns the list of collected values (empty when the format is empty).
sub fill_in_to_arr {
	my $self = shift;

	my ($rec, $format_utf8) = @_;

	my $log = $self->_get_logger();

	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/);
	return if (! $format_utf8);

	my $i = 0;
	my @arr;

	# NOTE(review): fill_in() is called in scalar context here, so when it
	# returns a lookup list, $v holds the number of results rather than
	# the values themselves -- confirm this is what callers expect.
	while (my $v = $self->fill_in($rec,$format_utf8,$i++)) {
		push @arr, $v;
	}

	$log->debug("format '$format_utf8' returned ",scalar(@arr)," elements: ", sub { join(" | ",@arr) }) if (@arr);

	return @arr;
}
569
570	dpavlin	15
=head2 get_data

Returns value from record.

  my $text = $self->get_data(\$rec,$f,$sf,$i,\$found,\$fld_occurances);

Arguments are:

=over 8

=item C<$rec>

record reference

=item C<$f>

field

=item C<$sf>

optional subfield

=item C<$i>

index offset for repeatable values ( 0 ... $#occurrences )

=item C<$found>

optional variable that will be incremented if present

=item C<$fld_occurances>

optional hash to hold last index of occurrences of C<field\tsubfield>
combinations (which can be accessed using keys in same format)

=back

Returns value or empty string, updates C<$found> and C<$fld_occurances>
if present.

=cut
612
# get_data(\$rec, $f, $sf, $i, \$found, \$cache)
#
# Fetch occurrence $i of field $f (optionally subfield $sf) from the record
# hash referenced by $$rec.
#
#   $found - optional reference to a defined counter; incremented when a
#            value is returned (for subfields only when the value is true)
#   $cache - optional reference to a hash ref; the last index of the
#            field's occurrences is stored under the key "$f\t$sf"
#
# Returns the value, or an empty string when the field, the occurrence or
# the subfield does not exist. An occurrence that has subfields but is
# requested without one is returned as a '$k:value ' dump of all subfields.
sub get_data {
	my $self = shift;

	my ($rec,$f,$sf,$i,$found,$cache) = @_;

	my $occurrences = $$rec->{$f};
	return '' unless ($occurrences && ref($occurrences) eq 'ARRAY');

	# $cache is optional; never dereference it unless it was passed
	if (ref($cache) && defined($$cache)) {
		my $key = $f . "\t" . (defined($sf) ? $sf : '');
		# last index of this field's occurrences
		$$cache->{$key} ||= $#{$occurrences};
	}

	my $value = $occurrences->[$i];
	return '' unless ($value);

	# $found is optional as well, and is only updated if already defined
	my $count_found = (ref($found) && defined($$found));

	# does this occurrence carry subfields (plain or blessed hash)?
	my $has_subfields = (ref($value) && UNIVERSAL::isa($value, 'HASH'));

	if (defined($sf)) {
		# an occurrence without subfields can't provide one
		return '' unless ($has_subfields);

		my $sf_value = $value->{$sf};
		$$found++ if ($count_found && $sf_value);
		return defined($sf_value) ? $sf_value : '';
	}

	$$found++ if ($count_found);

	return $value unless ($has_subfields);

	# it still might have subfields, just not specified, so we'll dump
	# some debug info (keys sorted to make the output reproducible)
	return join('', map { '$' . $_ . ':' . $value->{$_} . ' ' } sort keys %{$value});
}
647
648
=head2 apply_format

Apply format specified in tag with C<format_name="name"> and
C<format_delimiter=";;">.

  my $text = $webpac->apply_format($format_name,$format_delimiter,$data);

Formats can contain C<lookup{...}> if you need them.

=cut
659
# apply_format($name, $delimiter, $data)
#
# Split $data on $delimiter and feed the parts to the sprintf-style format
# registered under $name in $self->{'import_xml'}->{'format'}.
#
# Returns $data unchanged when the format is not defined. Otherwise returns
# the formatted string or, when it matches $self->{'lookup_regex'} and a
# lookup object is available, the result of looking it up.
sub apply_format {
	my $self = shift;

	my ($name,$delimiter,$data) = @_;

	my $log = $self->_get_logger();

	if (! $self->{'import_xml'}->{'format'}->{$name}) {
		$log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
		return $data;
	}

	$log->warn("no delimiter for format $name") if (! $delimiter);

	my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");

	# Without a delimiter the data is a single value; splitting on an
	# empty pattern would break it into single characters.
	my @data = (defined($delimiter) && length($delimiter))
		? split(/\Q$delimiter\E/, $data)
		: ($data);

	my $out = sprintf($format, @data);
	$log->debug("using format $name [$format] on $data to produce: $out");

	if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
		return $self->{'lookup'}->lookup($out) if ($self->{'lookup'});
		# lookup syntax present but nothing to resolve it with
		$log->warn("format $name contains lookup, but no lookup object is defined");
	}

	return $out;
}
688
=head2 sort_arr

Sort array ignoring case and html in data

  my @sorted = $webpac->sort_arr(@unsorted);

=cut
696
# sort_arr(@values)
#
# Sort values comparing them case-insensitively with HTML tags removed.
# The order is descending, as it always was in this method.
#
# Tags are stripped only from private copies used as sort keys (Schwartzian
# transform): neither the caller's data nor the returned values are
# modified, and read-only arguments are safe.
#
# Returns the sorted list.
sub sort_arr {
	my $self = shift;

	my $log = $self->_get_logger();

	my @sorted =
		map  { $_->[1] }
		sort { $b->[0] cmp $a->[0] }
		map  {
			# build the key on a copy; $_ aliases caller's value
			(my $key = $_) =~ s#<[^>]+/*>##g;
			[ lc($key), $_ ]
		} @_;

	$log->debug("sorted values: ",sub { join(", ",@sorted) });

	return @sorted;
}
713
714
=head1 INTERNAL METHODS

=head2 _sort_by_order

Sort xml tags data structure according to C<order=""> attribute.

=cut
722
# _sort_by_order()
#
# Sort comparator: compares the indexer entries named by the package
# variables $a and $b (set by sort) numerically by their 'order' value.
# An entry without a true 'order' is compared by the entry itself.
#
# Returns -1, 0 or 1 like <=>.
sub _sort_by_order {
	my $self = shift;

	my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
		$self->{'import_xml'}->{'indexer'}->{$a};
	my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
		$self->{'import_xml'}->{'indexer'}->{$b};

	return $va <=> $vb;
}
733
=head2 _x

Convert strings from C<conf/normalize/*.xml> encoding into application
specific encoding (optionally specified using C<code_page> to C<new>
constructor).

  my $text = $n->_x('normalize text string');

This is a stub so that other modules don't have to implement it.

=cut
745
# _x($string)
#
# Encoding conversion stub: returns its argument unchanged.
sub _x {
	my $self = shift;
	return shift;
}
750
751
=head1 AUTHOR

Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>

=head1 COPYRIGHT & LICENSE

Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut
764
# a module has to return a true value when loaded
1; # End of WebPAC::Normalize