lib/WebPAC/Normalize.pm

package WebPAC::Normalize;

use warnings;
use strict;
use blib;
use WebPAC::Common;
use base 'WebPAC::Common';
use Data::Dumper;

=head1 NAME

WebPAC::Normalize - data munging for normalisation

=head1 VERSION

Version 0.08

=cut

our $VERSION = '0.08';

=head1 SYNOPSIS

This package contains code that munges data to produce a normalized format.

It contains several assumptions:

=over

=item *

format of fields is defined using C<v123^a> notation for repeatable fields
or C<s123^a> for single (or first) value, where C<123> is field number and
C<a> is subfield.

=item *

source data records (C<$rec>) have unique identifiers in field C<000>

=item *

optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
perl code that is evaluated before producing output (value of field will be
interpolated before that)

=item *

optional C<filter{filter_name}> at B<beginning of format> will apply perl
code defined as code ref to the format after field substitution to produce
the output

There is one built-in filter called C<regex> which can be used like this:

  filter{regex(s/foo/bar/)}

=item *

optional C<lookup{...}> will be then performed. See C<WebPAC::Lookup>.

=item *

at end, optional C<format>s rules are resolved. Format rules are similar to
C<sprintf> and can also contain C<lookup{...}> which is performed after
values are inserted in format.

=back

This also describes the order in which transformations are applied (eval,
filter, lookup, format) which is important to understand when deciding how to
solve your data munging and normalisation process.


=head1 FUNCTIONS

=head2 new

Create new normalisation object

  my $n = new WebPAC::Normalize::Something(
        filter => {
                'filter_name_1' => sub {
                        # filter code
                        return length($_);
                }, ...
        },
        db => $db_obj,
        lookup_regex => $lookup->regex,
        lookup => $lookup_obj,
        prefix => 'foobar',
  );

Parameter C<filter> defines user supplied snippets of perl code which can
be used with C<filter{...}> notation.

C<prefix> is used to form filename for database record (to support multiple
source files which are joined in one database).

Recommended parameter C<lookup_regex> is used to enable parsing of lookups
in structures. If you pass this parameter, you must also pass C<lookup>
which is a C<WebPAC::Lookup> object.

=cut

# Constructor: blesses the supplied key/value pairs into an object, validates
# the lookup configuration and installs the built-in 'regex' filter.
#
# Arguments (flat hash, all optional):
#   filter       - hash ref of filter name => code ref
#   lookup_regex - regex that detects lookups in formats (needs lookup)
#   lookup       - WebPAC::Lookup object (needs lookup_regex)
#   prefix       - a warning is logged when it is missing
#   any other key is stored on the object unchanged
#
# Returns the new object. Calls logdie on the logger when only one of
# lookup_regex/lookup is given, or when lookup is not a WebPAC::Lookup.
sub new {
        my $class = shift;
        my $self = {@_};
        bless($self, $class);

        my $log = $self->_get_logger();

        my $has_regex  = $self->{'lookup_regex'} ? 1 : 0;
        my $has_lookup = $self->{'lookup'} ? 1 : 0;

        # those two must be in pair
        $log->logdie("lookup_regex and lookup must be in pair")
                if ($has_regex != $has_lookup);

        if ($has_lookup) {
                my $lookup = $self->{'lookup'};
                # eval keeps an unblessed reference from aborting with perl's
                # own "Can't call method" error instead of our message
                my $is_lookup = do { local $@; eval { $lookup->isa('WebPAC::Lookup') } };
                $log->logdie("lookup must be WebPAC::Lookup object") unless ($is_lookup);
        }

        $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});

        $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($has_regex && $has_lookup);

        if (! $self->{filter} || ! $self->{filter}->{regex}) {
                $log->debug("adding built-in filter regex");
                $self->{filter}->{regex} = sub {
                        my ($val, $regex) = @_;
                        # NOTE(review): $regex is evaluated as perl code, so
                        # formats using this filter must come from a trusted
                        # source
                        local $@;
                        eval "\$val =~ $regex";
                        # a broken expression leaves $val untouched; report it
                        # instead of failing silently
                        $log->warn("filter regex failed: $@") if ($@);
                        return $val;
                };
        }

        return $self;
}


=head2 data_structure

Create in-memory data structure which represents normalized layout from
C<conf/normalize/*.xml>.

These structures are used to produce output.

 my $ds = $webpac->data_structure($rec);

=cut

# Build the normalized data structure for one record.
#
# Argument: $rec - hash ref of the source record; field '000' must hold an
# array whose first element is the record id.
#
# When $self->{db} is set the structure is first looked up with load_ds and,
# if it had to be created, stored afterwards with save_ds.
#
# For every tag in $self->{import_xml}->{indexer} (ordered with
# _sort_by_order, order remembered in $self->{tags_by_order}) the formats
# listed under $self->{tag} are expanded and collected per type
# ('display' and 'search' unless the tag has a 'type' attribute).
#
# Returns a hash ref keyed by row name. Must be called in scalar context;
# list context triggers logconfess.
sub data_structure {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift;
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        $log->debug("data_structure rec = ", sub { Dumper($rec) });

        $log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));

        my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");

        # serve the record from the database when it is already stored there
        if ($self->{'db'}) {
                my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
                $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
                return $ds if ($ds);
                $log->debug("cache miss, creating");
        }

        # tag order is computed once and remembered on the object
        my @sorted_tags;
        if ($self->{tags_by_order}) {
                @sorted_tags = @{$self->{tags_by_order}};
        } else {
                @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
                $self->{tags_by_order} = \@sorted_tags;
        }

        my $ds;

        $log->debug("tags: ",sub { join(", ",@sorted_tags) });

        foreach my $field (@sorted_tags) {

                my $row;

                foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {

                        $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
                        my $format = $tag->{'value'} || $tag->{'content'};

                        # formats containing lookups are filled in, others parsed
                        my @v;
                        if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
                                @v = $self->fill_in_to_arr($rec,$format);
                        } else {
                                @v = $self->parse_to_arr($rec,$format);
                        }
                        if (! @v) {
                                $log->debug("$field <",$self->{tag},"> format: $format no values");
                        } else {
                                $log->debug("$field <",$self->{tag},"> format: $format values: ", join(",", @v));
                        }

                        if ($tag->{'sort'}) {
                                @v = $self->sort_arr(@v);
                        }

                        # use format?
                        if ($tag->{'format_name'}) {
                                @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
                        }

                        # delimiter will join repeatable fields
                        if ($tag->{'delimiter'}) {
                                @v = ( join($tag->{'delimiter'}, @v) );
                        }

                        # default types
                        my @types = qw(display search);
                        # override by type attribute
                        @types = ( $tag->{'type'} ) if ($tag->{'type'});

                        foreach my $type (@types) {
                                # append to previous line?
                                $log->debug("tag $field / $type [",sub { join(",",@v) }, "] ", $tag->{'append'} ? 'append' : 'no append');
                                if ($tag->{'append'}) {

                                        # appended part is delimited with the
                                        # tag delimiter (or a single space)
                                        my $d = $tag->{'delimiter'} || " ";

                                        my $last = pop @{$row->{$type}};
                                        # glue to the previous value only when
                                        # there is one; the values themselves
                                        # are always separated by $d
                                        my $glue = $last ? $d : "";
                                        $last = "" unless (defined($last));
                                        push @{$row->{$type}}, $last . $glue . join($d, @v);

                                } else {
                                        push @{$row->{$type}}, @v;
                                }
                        }

                }

                if ($row) {
                        $row->{'tag'} = $field;

                        # TODO: name_sigular, name_plural
                        my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
                        my $row_name = $name ? $self->_x($name) : $field;

                        # post-sort all values in field
                        if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
                                $log->warn("sort at field tag not implemented");
                        }

                        $ds->{$row_name} = $row;

                        $log->debug("row $field: ",sub { Dumper($row) });
                }

        }

        $self->{'db'}->save_ds(
                id => $id,
                ds => $ds,
                prefix => $self->{prefix},
        ) if ($self->{'db'});

        $log->debug("ds: ", sub { Dumper($ds) });

        $log->logconfess("data structure returned is not array any more!") if wantarray;

        return $ds;

}

=head2 parse

Perform smart parsing of string, skipping delimiters for fields which aren't
defined. It can also eval code in format starting with C<eval{...}> and
return output or nothing depending on eval code.

 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);

Filters are implemented here. While simple form of filters looks like this:

  filter{name_of_filter}

but, filters can also have variable number of parameters like this:

  filter{name_of_filter(param,param,param)}

=cut

# filter names already reported as undefined (each is warned about once)
my $warn_once;

# Expand one format string against a record, dropping delimiters that
# precede fields which have no value.
#
# Arguments:
#   $rec         - hash ref of the record
#   $format_utf8 - format; may start with eval{...} and then filter{...}
#   $i           - index of repeatable value (defaults to 0)
#   $rec_size    - optional, handed through to get_data unchanged
#
# Returns the expanded string, or nothing when the format is empty, when no
# field produced a value, when an 's' field is met before any 'v' field was
# found, when the eval{...} code is false or when the filter returns undef.
sub parse {
        my $self = shift;

        my ($rec, $format_utf8, $i, $rec_size) = @_;

        return if (! $format_utf8);

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        $i ||= 0;

        my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});

        $log->debug("format: $format [$i]");

        # literal braces are escaped: an unescaped "{" after a literal
        # character is deprecated (and rejected by newer perls)

        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

        # number of values found so far, per field type ('v' or 's')
        my %found_any;
        # text before the first field, restored in front of the result
        my $prefix;
        my $first_field = 1;

        my @out;

        # consume the format field by field; $1 is the text before the field
        while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {

                my ($del, $fld_type, $fld, $sf) = (defined($1) ? $1 : '', lc($2), $3, $4);

                if ($first_field) {
                        $prefix = $del;
                        $first_field = 0;
                }

                # repeatable index
                my $r = $i;
                if ($fld_type eq 's') {
                        # single fields only count once a repeatable one matched
                        return unless ($found_any{'v'});
                        $r = 0;
                }

                my $found = 0;
                my $tmp = $self->get_data(\$rec,$fld,$sf,$r,\$found,$rec_size);

                if ($found) {
                        $found_any{$fld_type} += $found;

                        # we will skip delimiter before first occurence of field!
                        push @out, $del unless ($found_any{$fld_type} == 1);
                        push @out, $tmp;
                }
        }

        # test if any fields found?
        return if (! $found_any{'v'} && ! $found_any{'s'});

        my $out = join('',@out);

        if ($out) {
                # add rest of format (suffix)
                $out .= $format;

                # add prefix if not there
                $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);

                $log->debug("result: $out");
        }

        if ($eval_code) {
                my $eval = $self->fill_in($rec,$eval_code,$i) || return;
                $log->debug("about to eval{$eval} format: $out");
                return if (! $self->_eval($eval));
        }

        if ($filter_name) {
                # split "name(arg,arg)" into the name and its arguments
                my @filter_args;
                if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
                        @filter_args = split(/,/, $2);
                }
                if ($self->{'filter'}->{$filter_name}) {
                        $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
                        unshift @filter_args, $out;
                        $out = $self->{'filter'}->{$filter_name}->(@filter_args);
                        return unless(defined($out));
                        $log->debug("filter result: $out");
                } elsif (! $warn_once->{$filter_name}) {
                        $log->warn("trying to use undefined filter $filter_name");
                        $warn_once->{$filter_name}++;
                }
        }

        return $out;
}

=head2 parse_to_arr

Similar to C<parse>, but returns array of all repeatable fields

 my @arr = $webpac->parse_to_arr($rec,'v250^a');

=cut

# Collect the results of parse for successive indexes of a repeatable field.
#
# Arguments: $rec (hash ref of the record) and $format_utf8 (format string).
#
# parse is called with index 0, 1, 2, ... until it returns a false value.
# Returns the list of values collected, or nothing for an empty format.
sub parse_to_arr {
        my $self = shift;

        my ($rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
        return if (! $format_utf8);

        my $i = 0;
        my @arr;

        # handed to parse by reference so field sizes can be recorded in it
        my $rec_size = {};

        while (my $v = $self->parse($rec,$format_utf8,$i++,\$rec_size)) {
                push @arr, $v;
                # diagnostics go through the logger, never straight to STDERR
                $log->debug("parse rec_size = ", sub { Dumper($rec_size) });
        }

        $log->debug("format '$format_utf8' returned ",scalar(@arr)," elements: ", sub { join(" | ",@arr) }) if (@arr);

        return @arr;
}


=head2 fill_in

Workhorse of all: takes record from in-memory structure of database and
strings with placeholders and returns string or array with substituted
values from record.

 my $text = $webpac->fill_in($rec,'v250^a');

Optional argument is ordinal number for repeatable fields. By default,
it's assumed to be first repeatable field (fields are perl array, so first
element is 0).
Following example will read second value from repeatable field.

 my $text = $webpac->fill_in($rec,'Title: v250^a',1);

This function B<does not> perform parsing of format to intelligently skip
delimiters before fields which aren't used.

This method will automatically decode UTF-8 string to local code page
if needed.

There is optional parameter C<$record_size> which can be used to get sizes of
all C<field^subfield> combinations in this format.

 my $text = $webpac->fill_in($rec,'got: v900^a v900^x',0,\$rec_size); 

=cut

# Substitute field placeholders in a format with values from a record.
#
# Arguments:
#   $rec      - hash ref of the record
#   $format   - format; may start with eval{...} and then filter{...}
#   $i        - index used for repeatable (v) fields, defaults to 0
#   $rec_size - optional, handed through to get_data unchanged
#
# Returns nothing when no field produced a value, when only single (s)
# fields are present and $i > 0, when the eval{...} code is false or when
# the filter returns undef. With a lookup object the lookup results are
# returned (the first one in scalar context); otherwise the expanded format.
# Calls logdie when $i exceeds $self->{max_mfn} (default 9999).
sub fill_in {
        my $self = shift;

        my $log = $self->_get_logger();

        my ($rec,$format,$i,$rec_size) = @_;

        $log->logconfess("need data record") unless ($rec);
        $log->logconfess("need format to parse") unless($format);

        # iteration (for repeatable fields)
        $i ||= 0;

        $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));

        # FIXME remove for speedup?
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        if (utf8::is_utf8($format)) {
                $format = $self->_x($format);
        }

        my $found = 0;
        my $just_single = 1;

        # literal braces are escaped: an unescaped "{" after a literal
        # character is deprecated (and rejected by newer perls)

        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

        # value for one placeholder; never undef so it interpolates cleanly
        my $value_of = sub {
                my ($fld, $sf, $idx) = @_;
                my $value = $self->get_data(\$rec,$fld,$sf,$idx,\$found,$rec_size);
                return defined($value) ? $value : '';
        };

        # do actual replacement of placeholders
        # repeatable fields
        if ($format =~ s/v(\d+)(?:\^(\w))?/$value_of->($1,$2,$i)/ges) {
                $just_single = 0;
        }

        # non-repeatable fields
        if ($format =~ s/s(\d+)(?:\^(\w))?/$value_of->($1,$2,0)/ges) {
                return if ($i > 0 && $just_single);
        }

        return unless ($found);

        $log->debug("format: $format");

        if ($eval_code) {
                my $eval = $self->fill_in($rec,$eval_code,$i);
                return if (! $self->_eval($eval));
        }

        if ($filter_name && $self->{'filter'}->{$filter_name}) {
                $log->debug("filter '$filter_name' for $format");
                $format = $self->{'filter'}->{$filter_name}->($format);
                return unless(defined($format));
                $log->debug("filter result: $format");
        }

        # do we have lookups?
        if ($self->{'lookup'}) {
                if ($self->{'lookup'}->can('lookup')) {
                        my @lookup = $self->{lookup}->lookup($format);
                        $log->debug("lookup $format: ", join(", ", @lookup));
                        # scalar callers get the first result, not a count
                        return wantarray ? @lookup : $lookup[0];
                }
                # unusable lookup object: fall back to the expanded format
                $log->warn("Have lookup object but can't invoke lookup method");
        }

        return $format;
}


=head2 fill_in_to_arr

Similar to C<fill_in>, but returns array of all repeatable fields. Usable
for fields which have lookups, so they shouldn't be parsed but rather
C<fill_in>ed.

 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');

=cut

# Collect the results of fill_in for successive indexes of a repeatable field.
#
# Arguments: $rec (hash ref of the record) and $format_utf8 (format string).
#
# fill_in is called in list context with index 0, 1, 2, ... until it returns
# nothing, so every lookup result is kept. When field sizes were recorded in
# $rec_size the loop also stops once the largest size is reached; beyond
# that point only single (s) fields could still produce (repeated) values.
#
# Returns the list of values collected, or nothing for an empty format.
sub fill_in_to_arr {
        my $self = shift;

        my ($rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
        return if (! $format_utf8);

        my $i = 0;
        my @arr;

        # handed to fill_in by reference so field sizes can be recorded in it
        my $rec_size = {};

        # $i must advance on every call, otherwise the same value is
        # fetched forever
        while (my @v = $self->fill_in($rec,$format_utf8,$i++,\$rec_size)) {
                push @arr, @v;

                my $max_size = 0;
                foreach my $size (values %{ $rec_size }) {
                        $max_size = $size if ($size > $max_size);
                }
                last if ($max_size && $i >= $max_size);
        }

        $log->debug("rec_size = ", sub { Dumper($rec_size) });
        $log->debug("format '$format_utf8' returned ",scalar(@arr)," elements: ", sub { join(" | ",@arr) }) if (@arr);

        return @arr;
}


=head2 get_data

Returns value from record.

 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found,\$rec_size);

Required arguments are:

=over 8

=item C<$rec>

record reference

=item C<$f>

field

=item C<$sf>

optional subfield

=item C<$i>

index offset for repeatable values ( 0 ... $rec_size->{'400^a'} )

=item C<$found>

optional variable that will be incremented if present

=item C<$rec_size>

hash to hold maximum occurrences of C<field^subfield> combinations
(which can be accessed using keys in same format)

=back

Returns value or empty string, updates C<$found> and C<rec_size>
if present.

=cut

# Fetch one value from a record.
#
# Arguments:
#   $rec   - reference to the record hash ref
#   $f     - field
#   $sf    - optional subfield
#   $i     - index of the occurrence within the field
#   $found - optional reference to a counter, incremented when a value is found
#   $cache - optional reference to a hash ref; the number of occurrences of
#            the field is stored in it under "field" or "field^subfield"
#
# Returns the value, or '' when the field, occurrence or subfield is missing.
# When no subfield is requested but the occurrence has subfields, they are
# returned as one string of "$key:value " pairs (keys sorted).
sub get_data {
        my $self = shift;

        my ($rec,$f,$sf,$i,$found,$cache) = @_;

        my $occurrences = $$rec->{$f};
        return '' unless ($occurrences && ref($occurrences) eq 'ARRAY');

        # $cache is optional: dereference it only when a reference was passed
        if (ref($cache) && defined($$cache)) {
                my $key = $f . ( (defined($sf) && length($sf)) ? '^' . $sf : '' );
                $$cache->{$key} ||= scalar @{ $occurrences };
        }

        my $value = $occurrences->[$i];
        return '' unless ($value);

        # $found is optional too, and is only counted when already defined
        my $count_found = (ref($found) && defined($$found));

        if (defined($sf)) {
                # a plain value has no subfields to offer
                return '' unless (ref($value) eq 'HASH');

                my $sub_value = $value->{$sf};
                $$found++ if ($count_found && $sub_value);
                return defined($sub_value) ? $sub_value : '';
        }

        $$found++ if ($count_found);

        return $value unless (ref($value) eq 'HASH');

        # it still might have subfields, just not specified, so we'll
        # dump some debug info
        return join('', map { '$' . $_ . ':' . $value->{$_} . ' ' } sort keys %{ $value });
}


=head2 apply_format

Apply format specified in tag with C<format_name="name"> and
C<format_delimiter=";;">.

 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);

Formats can contain C<lookup{...}> if you need them.

=cut

# Apply a named sprintf-style format to one value.
#
# Arguments:
#   $name      - key in $self->{import_xml}->{format}
#   $delimiter - string on which $data is split into sprintf arguments
#   $data      - value to format
#
# Returns $data unchanged (after a warning) when the format is not defined.
# Without a delimiter a warning is logged and $data is passed to sprintf as
# a single argument. When the result matches $self->{lookup_regex} the
# outcome of the lookup is returned, otherwise the formatted string.
sub apply_format {
        my $self = shift;

        my ($name,$delimiter,$data) = @_;

        my $log = $self->_get_logger();

        if (! $self->{'import_xml'}->{'format'}->{$name}) {
                $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
                return $data;
        }

        my $have_delimiter = (defined($delimiter) && length($delimiter));

        $log->warn("no delimiter for format $name") unless ($have_delimiter);

        my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");

        # an empty split pattern would break $data into single characters,
        # so without a delimiter the value is kept in one piece
        my @data = $have_delimiter ? split(/\Q$delimiter\E/, $data) : ($data);

        my $out = sprintf($format, @data);
        $log->debug("using format $name [$format] on $data to produce: $out");

        if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
                return $self->{'lookup'}->lookup($out);
        }

        return $out;
}

=head2 sort_arr

Sort array ignoring case and html in data

 my @sorted = $webpac->sort_arr(@unsorted);

=cut

# Sort values in descending order, comparing them case-insensitively and
# with markup (anything between < and >) removed.
#
# Arguments: list of values.
# Returns the sorted list; the values themselves are returned unmodified.
sub sort_arr {
        my $self = shift;

        my $log = $self->_get_logger();

        # Sort keys are computed on copies: $a and $b inside a sort block
        # alias the elements being sorted, so editing them there would
        # strip markup from the caller's data as well.
        my @sorted =
                map  { $_->[1] }
                sort { $b->[0] cmp $a->[0] }
                map  {
                        my $key = $_;
                        $key =~ s#<[^>]+/*>##g;
                        [ lc($key), $_ ]
                } @_;

        $log->debug("sorted values: ",sub { join(", ",@sorted) });

        return @sorted;
}


=head1 INTERNAL METHODS

=head2 _sort_by_order

Sort xml tags data structure according to C<order=""> attribute.

=cut

# Comparator for sort: numerically compares the 'order' attribute of the two
# indexer entries named by the package variables $a and $b. An entry whose
# 'order' is false is compared by the entry itself instead.
sub _sort_by_order {
        my ($self) = @_;

        my $rank_of = sub {
                my ($tag) = @_;
                return $self->{'import_xml'}->{'indexer'}->{$tag}->{'order'}
                        || $self->{'import_xml'}->{'indexer'}->{$tag};
        };

        my $left  = $rank_of->($a);
        my $right = $rank_of->($b);

        return $left <=> $right;
}

=head2 _x

Convert strings from C<conf/normalize/*.xml> encoding into application
specific encoding (optionally specified using C<code_page> to C<new>
constructor).

 my $text = $n->_x('normalize text string');

This is a stub so that other modules don't have to implement it.

=cut

# Encoding conversion stub: hands back its first argument untouched.
sub _x {
        my ($self, $text) = @_;
        return $text;
}


=head1 AUTHOR

Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>

=head1 COPYRIGHT & LICENSE

Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

1; # End of WebPAC::Normalize
1	package WebPAC::Normalize;
2
3	use warnings;
4	use strict;
5	use blib;
6	use WebPAC::Common;
7	use base 'WebPAC::Common';
8	use Data::Dumper;
9
10	=head1 NAME
11
12	WebPAC::Normalize - data mungling for normalisation
13
14	=head1 VERSION
15
16	Version 0.08
17
18	=cut
19
20	our $VERSION = '0.08';
21
22	=head1 SYNOPSIS
23
24	This package contains code that mungle data to produce normalized format.
25
26	It contains several assumptions:
27
28	=over
29
30	=item *
31
32	format of fields is defined using C<v123^a> notation for repeatable fields
33	or C<s123^a> for single (or first) value, where C<123> is field number and
34	C<a> is subfield.
35
36	=item *
37
38	source data records (C<$rec>) have unique identifiers in field C<000>
39
40	=item *
41
42	optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
43	perl code that is evaluated before producing output (value of field will be
44	interpolated before that)
45
46	=item *
47
48	optional C<filter{filter_name}> at B<begining of format> will apply perl
49	code defined as code ref on format after field substitution to producing
50	output
51
52	There is one built-in filter called C<regex> which can be use like this:
53
54	filter{regex(s/foo/bar/)}
55
56	=item *
57
58	optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
59
60	=item *
61
62	at end, optional C<format>s rules are resolved. Format rules are similar to
63	C<sprintf> and can also contain C<lookup{...}> which is performed after
64	values are inserted in format.
65
66	=back
67
68	This also describes order in which transformations are applied (eval,
69	filter, lookup, format) which is important to undestand when deciding how to
70	solve your data mungling and normalisation process.
71
72
73
74
75	=head1 FUNCTIONS
76
77	=head2 new
78
79	Create new normalisation object
80
81	my $n = new WebPAC::Normalize::Something(
82	filter => {
83	'filter_name_1' => sub {
84	# filter code
85	return length($_);
86	}, ...
87	},
88	db => $db_obj,
89	lookup_regex => $lookup->regex,
90	lookup => $lookup_obj,
91	prefix => 'foobar',
92	);
93
94	Parametar C<filter> defines user supplied snippets of perl code which can
95	be use with C<filter{...}> notation.
96
97	C<prefix> is used to form filename for database record (to support multiple
98	source files which are joined in one database).
99
100	Recommended parametar C<lookup_regex> is used to enable parsing of lookups
101	in structures. If you pass this parametar, you must also pass C<lookup>
102	which is C<WebPAC::Lookup> object.
103
104	=cut
105
106	sub new {
107	my $class = shift;
108	my $self = {@_};
109	bless($self, $class);
110
111	my $r = $self->{'lookup_regex'} ? 1 : 0;
112	my $l = $self->{'lookup'} ? 1 : 0;
113
114	my $log = $self->_get_logger();
115
116	# those two must be in pair
117	if ( ($r & $l) != ($r \|\| $l) ) {
118	my $log = $self->_get_logger();
119	$log->logdie("lookup_regex and lookup must be in pair");
120	}
121
122	$log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
123
124	$log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
125
126	$log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
127
128	if (! $self->{filter} \|\| ! $self->{filter}->{regex}) {
129	$log->debug("adding built-in filter regex");
130	$self->{filter}->{regex} = sub {
131	my ($val, $regex) = @_;
132	eval "\$val =~ $regex";
133	return $val;
134	};
135	}
136
137	$self ? return $self : return undef;
138	}
139
140
141	=head2 data_structure
142
143	Create in-memory data structure which represents normalized layout from
144	C<conf/normalize/*.xml>.
145
146	This structures are used to produce output.
147
148	my $ds = $webpac->data_structure($rec);
149
150	=cut
151
152	sub data_structure {
153	my $self = shift;
154
155	my $log = $self->_get_logger();
156
157	my $rec = shift;
158	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
159
160	$log->debug("data_structure rec = ", sub { Dumper($rec) });
161
162	$log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));
163
164	my $id = $rec->{'000'}->[0] \|\| $log->logdie("field 000 isn't array!");
165
166	my $cache_file;
167
168	if ($self->{'db'}) {
169	my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
170	$log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
171	return $ds if ($ds);
172	$log->debug("cache miss, creating");
173	}
174
175	my @sorted_tags;
176	if ($self->{tags_by_order}) {
177	@sorted_tags = @{$self->{tags_by_order}};
178	} else {
179	@sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
180	$self->{tags_by_order} = \@sorted_tags;
181	}
182
183	my $ds;
184
185	$log->debug("tags: ",sub { join(", ",@sorted_tags) });
186
187	foreach my $field (@sorted_tags) {
188
189	my $row;
190
191	#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
192
193	foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
194	my $format;
195
196	$log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
197	$format = $tag->{'value'} \|\| $tag->{'content'};
198
199	my @v;
200	if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
201	@v = $self->fill_in_to_arr($rec,$format);
202	} else {
203	@v = $self->parse_to_arr($rec,$format);
204	}
205	if (! @v) {
206	$log->debug("$field <",$self->{tag},"> format: $format no values");
207	# next;
208	} else {
209	$log->debug("$field <",$self->{tag},"> format: $format values: ", join(",", @v));
210	}
211
212	if ($tag->{'sort'}) {
213	@v = $self->sort_arr(@v);
214	}
215
216	# use format?
217	if ($tag->{'format_name'}) {
218	@v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
219	}
220
221	# delimiter will join repeatable fields
222	if ($tag->{'delimiter'}) {
223	@v = ( join($tag->{'delimiter'}, @v) );
224	}
225
226	# default types
227	my @types = qw(display search);
228	# override by type attribute
229	@types = ( $tag->{'type'} ) if ($tag->{'type'});
230
231	foreach my $type (@types) {
232	# append to previous line?
233	$log->debug("tag $field / $type [",sub { join(",",@v) }, "] ", $row->{'append'} \|\| 'no append');
234	if ($tag->{'append'}) {
235
236	# I will delimit appended part with
237	# delimiter (or ,)
238	my $d = $tag->{'delimiter'};
239	# default delimiter
240	$d \|\|= " ";
241
242	my $last = pop @{$row->{$type}};
243	$d = "" if (! $last);
244	$last .= $d . join($d, @v);
245	push @{$row->{$type}}, $last;
246
247	} else {
248	push @{$row->{$type}}, @v;
249	}
250	}
251
252
253	}
254
255	if ($row) {
256	$row->{'tag'} = $field;
257
258	# TODO: name_sigular, name_plural
259	my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
260	my $row_name = $name ? $self->_x($name) : $field;
261
262	# post-sort all values in field
263	if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
264	$log->warn("sort at field tag not implemented");
265	}
266
267	$ds->{$row_name} = $row;
268
269	$log->debug("row $field: ",sub { Dumper($row) });
270	}
271
272	}
273
274	$self->{'db'}->save_ds(
275	id => $id,
276	ds => $ds,
277	prefix => $self->{prefix},
278	) if ($self->{'db'});
279
280	$log->debug("ds: ", sub { Dumper($ds) });
281
282	$log->logconfess("data structure returned is not array any more!") if wantarray;
283
284	return $ds;
285
286	}
287
=head2 parse

Perform smart parsing of string, skipping delimiters for fields which aren't
defined. It can also eval code in format starting with C<eval{...}> and
return output or nothing depending on eval code.

  my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);

Arguments are the record (hash reference), the format string, an optional
index C<$i> of the repeatable value to use (defaults to 0) and an optional
C<$rec_size> which is passed unchanged to C<get_data>.

Filters are implemented here. The simple form of a filter looks like this:

  filter{name_of_filter}

but filters can also have a variable number of parameters like this:

  filter{name_of_filter(param,param,param)}

The filter code reference receives the produced string as its first argument,
followed by the parameters (split on commas).

Returns the produced string. Returns nothing when the format is empty, when
an C<s> field is reached before any C<v> field was found, when no field of
the format was found in the record, when the C<eval{...}> code evaluates to
false, or when the filter returns an undefined value.

=cut

# filter names which were already reported as undefined (warn only once)
my $warn_once;

sub parse {
	my $self = shift;

	my ($rec, $format_utf8, $i, $rec_size) = @_;

	return if (! $format_utf8);

	my $log = $self->_get_logger();

	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

	$i = 0 if (! $i);

	my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});

	my @out;

	$log->debug("format: $format [$i]");

	# Literal braces are escaped in the two patterns below: newer perls
	# reject an unescaped "{" which follows a literal character.

	my $eval_code;
	# remove eval{...} from beginning
	$eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

	my $filter_name;
	# remove filter{...} from beginning
	$filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

	# did we find any (at all) field from format in row?
	# (hash keyed by field type, 'v' or 's', holding number of hits)
	my $found_any;
	# prefix before first field, which we preserve if anything was found
	my $prefix;

	my $f_step = 1;

	# consume format from the left: delimiter, field type, field number
	# and optional subfield; what is left in $format afterwards is suffix
	while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {

		# test for definedness, not truth, so that a delimiter which
		# is exactly "0" isn't silently dropped
		my $del = defined($1) ? $1 : '';
		$prefix = $del if ($f_step == 1);

		my $fld_type = lc($2);

		# repeatable index
		my $r = $i;
		if ($fld_type eq 's') {
			if ($found_any->{'v'}) {
				# single fields always use first value
				$r = 0;
			} else {
				# no repeatable field found so far: give up
				return;
			}
		}

		my $found = 0;
		my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found,$rec_size);

		if ($found) {
			$found_any->{$fld_type} += $found;

			# we will skip delimiter before first occurrence of field!
			push @out, $del unless($found_any->{$fld_type} == 1);
			push @out, $tmp;
		}
		$f_step++;
	}

	# test if any fields found?
	return if (! $found_any->{'v'} && ! $found_any->{'s'});

	my $out = join('',@out);

	if ($out) {
		# add rest of format (suffix)
		$out .= $format;

		# add prefix if not there
		$out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);

		$log->debug("result: $out");
	}

	if ($eval_code) {
		# field placeholders in eval code are filled in before evaluation
		my $eval = $self->fill_in($rec,$eval_code,$i) || return;
		$log->debug("about to eval{$eval} format: $out");
		return if (! $self->_eval($eval));
	}

	if ($filter_name) {
		my @filter_args;
		# split name(arg,arg,...) into filter name and its arguments
		if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
			@filter_args = split(/,/, $2);
		}
		if ($self->{'filter'}->{$filter_name}) {
			$log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
			unshift @filter_args, $out;
			$out = $self->{'filter'}->{$filter_name}->(@filter_args);
			return unless(defined($out));
			$log->debug("filter result: $out");
		} elsif (! $warn_once->{$filter_name}) {
			$log->warn("trying to use undefined filter $filter_name");
			$warn_once->{$filter_name}++;
		}
	}

	return $out;
}
412
=head2 parse_to_arr

Similar to C<parse>, but returns array of all repeatable fields

  my @arr = $webpac->parse_to_arr($rec,'v250^a');

C<parse> is called with an increasing index for repeatable fields until it
returns a false value; all values collected up to that point are returned.

=cut

sub parse_to_arr {
	my $self = shift;

	my ($rec, $format_utf8) = @_;

	my $log = $self->_get_logger();

	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
	return if (! $format_utf8);

	my $i = 0;
	my @arr;

	# must be a defined hash reference: get_data records sizes of
	# field^subfield combinations only if it's defined
	my $rec_size = {};

	while (my $v = $self->parse($rec,$format_utf8,$i++,\$rec_size)) {
		push @arr, $v;
		# diagnostics go through logger (dumped only if debug is
		# enabled) instead of unconditional warn on STDERR
		$log->debug("parse rec_size = ", sub { Dumper($rec_size) });
	}

	# $i was incremented once more by the call which ended the loop
	$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);

	return @arr;
}
445
446
=head2 fill_in

Workhorse of all: takes record from in-memory structure of database and
strings with placeholders and returns string or array with substituted
values from record.

  my $text = $webpac->fill_in($rec,'v250^a');

Optional argument is ordinal number for repeatable fields. By default,
it's assumed to be first repeatable field (fields are perl array, so first
element is 0).
Following example will read second value from repeatable field.

  my $text = $webpac->fill_in($rec,'Title: v250^a',1);

This function B<does not> perform parsing of format to intelligently skip
delimiters before fields which aren't used.

This method will automatically decode UTF-8 string to local code page
if needed.

There is optional parameter C<$record_size> which can be used to get sizes of
all C<field^subfield> combinations in this format.

  my $text = $webpac->fill_in($rec,'got: v900^a v900^x',0,\$rec_size);

Returns nothing if none of the placeholders was found in the record, if the
format has only C<s> fields and index is greater than 0, if C<eval{...}>
code evaluates to false or if the filter returns an undefined value. If a
lookup object which implements C<lookup> is configured, the list returned by
its C<lookup> method is returned; otherwise the filled-in string.

=cut

sub fill_in {
	my $self = shift;

	my $log = $self->_get_logger();

	my ($rec,$format,$i,$rec_size) = @_;

	$log->logconfess("need data record") unless ($rec);
	$log->logconfess("need format to parse") unless($format);

	# iteration (for repeatable fields)
	$i ||= 0;

	$log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));

	# FIXME remove for speedup?
	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

	if (utf8::is_utf8($format)) {
		$format = $self->_x($format);
	}

	my $found = 0;

	# Literal braces are escaped in the two patterns below: newer perls
	# reject an unescaped "{" which follows a literal character.

	my $eval_code;
	# remove eval{...} from beginning
	$eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

	my $filter_name;
	# remove filter{...} from beginning
	$filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

	# Do actual replacement of placeholders in a single pass, so that
	# values which were just inserted are never scanned for placeholders
	# again (with separate passes for v and s, data like "Windows95" would
	# have "s95" replaced by the second pass).
	# Repeatable (v) fields use index $i, non-repeatable (s) fields
	# always use first value.
	my %seen;
	$format =~ s{([vs])(\d+)(?:\^(\w))?}{
		$seen{$1} = 1;
		$self->get_data(\$rec, $2, $3, ($1 eq 'v' ? $i : 0), \$found, $rec_size)
	}ges;

	# format with only non-repeatable fields produces just one value
	return if ($i > 0 && $seen{'s'} && ! $seen{'v'});

	return unless ($found);

	$log->debug("format: $format");

	if ($eval_code) {
		my $eval = $self->fill_in($rec,$eval_code,$i);
		return if (! $self->_eval($eval));
	}

	if ($filter_name && $self->{'filter'}->{$filter_name}) {
		$log->debug("filter '$filter_name' for $format");
		$format = $self->{'filter'}->{$filter_name}->($format);
		return unless(defined($format));
		$log->debug("filter result: $format");
	}

	# do we have lookups?
	if ($self->{'lookup'}) {
		if ($self->{'lookup'}->can('lookup')) {
			my @lookup = $self->{lookup}->lookup($format);
			$log->debug("lookup $format", join(", ", @lookup));
			return @lookup;
		}
		# unusable lookup object: warn and return the string without
		# lookups (instead of return value of the warn call)
		$log->warn("Have lookup object but can't invoke lookup method");
	}

	return $format;
}
547
548
=head2 fill_in_to_arr

Similar to C<fill_in>, but returns array of all repeatable fields. Usable
for fields which have lookups, so they shouldn't be parsed but rather
C<fill_in>ed.

  my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');

C<fill_in> is called in list context with an increasing index for repeatable
fields until it returns an empty list; all returned values are collected.

=cut

sub fill_in_to_arr {
	my $self = shift;

	my ($rec, $format_utf8) = @_;

	my $log = $self->_get_logger();

	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
	return if (! $format_utf8);

	my $i = 0;
	my @arr;

	# must be a defined hash reference: get_data records sizes of
	# field^subfield combinations only if it's defined
	my $rec_size = {};

	# Index has to advance on every call, otherwise first value would be
	# returned forever. List context is needed because fill_in returns
	# list of values when lookups are used (in scalar context that would
	# be just number of values).
	while (my @v = $self->fill_in($rec,$format_utf8,$i++,\$rec_size)) {
		push @arr, @v;
		# diagnostics go through logger (dumped only if debug is
		# enabled) instead of unconditional warn on STDERR
		$log->debug("rec_size = ", sub { Dumper($rec_size) });
	}

	# $i was incremented once more by the call which ended the loop
	$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);

	return @arr;
}
583
584
=head2 get_data

Returns value from record.

  my $text = $self->get_data(\$rec,$f,$sf,$i,\$found,\$rec_size);

Arguments are:

=over 8

=item C<$rec>

record reference

=item C<$f>

field

=item C<$sf>

optional subfield

=item C<$i>

index offset for repeatable values ( 0 ... $rec_size->{'400^a'} )

=item C<$found>

optional reference to variable that will be incremented if it's defined

=item C<$rec_size>

optional reference to hash reference which holds the number of occurrences of
C<field^subfield> combinations (which can be accessed using keys in same
format). It is updated only if the referenced value is defined.

=back

Returns value or empty string, updates C<$found> and C<$rec_size>
if present.

If subfield isn't specified and value has subfields, all of them are
returned as C<$subfield:value> pairs (sorted by subfield) for debugging.

=cut

sub get_data {
	my $self = shift;

	my ($rec,$f,$sf,$i,$found,$cache) = @_;

	my $field = $$rec->{$f};
	return '' unless ($field && ref($field) eq 'ARRAY');

	# $cache is optional, so check that we got reference before
	# dereferencing it (dereferencing undef dies under strict refs)
	if (ref($cache) && defined($$cache)) {
		$$cache->{ $f . ( $sf ? '^' . $sf : '' ) } ||= scalar @{ $field };
	}

	my $value = $field->[$i];
	return '' unless ($value);

	# $found is optional too, and it's updated only if already defined
	my $count_found = (ref($found) && defined($$found));

	# test for real hash reference (or object based on hash) instead of
	# matching stringified value, which would also match plain text
	# containing "HASH"
	my $has_subfields = (ref($value) && UNIVERSAL::isa($value, 'HASH'));

	if (defined($sf)) {
		# subfield requested, but value doesn't have any
		return '' unless ($has_subfields);

		my $sf_value = $value->{$sf};
		$$found++ if ($count_found && $sf_value);
		return defined($sf_value) ? $sf_value : '';
	}

	$$found++ if ($count_found);

	return $value unless ($has_subfields);

	# it still might have subfields, just not specified, so we'll dump
	# some debug info (sorted, so that output doesn't depend on hash order)
	return join('', map { '$' . $_ . ':' . $value->{$_} . ' ' } sort keys %{ $value });
}
661
662
=head2 apply_format

Apply format specified in tag with C<format_name="name"> and
C<format_delimiter=";;">.

  my $text = $webpac->apply_format($format_name,$format_delimiter,$data);

Data is split on delimiter and resulting values are used as arguments for
C<sprintf> with content of the named format. If delimiter is missing, a
warning is logged and data is used as a single value.

If format with that name isn't defined, a warning is logged and data is
returned unchanged.

Formats can contain C<lookup{...}> if you need them.

=cut

sub apply_format {
	my $self = shift;

	my ($name,$delimiter,$data) = @_;

	my $log = $self->_get_logger();

	if (! $self->{'import_xml'}->{'format'}->{$name}) {
		$log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
		return $data;
	}

	my $have_delimiter = (defined($delimiter) && length($delimiter));

	$log->warn("no delimiter for format $name") if (! $have_delimiter);

	my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");

	# splitting on empty pattern would produce one element for each
	# character, so without delimiter use data as a single value
	my @data = $have_delimiter ? split(/\Q$delimiter\E/, $data) : ($data);

	my $out = sprintf($format, @data);
	$log->debug("using format $name [$format] on $data to produce: $out");

	# resolve lookups only if result contains them
	if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
		return $self->{'lookup'}->lookup($out);
	} else {
		return $out;
	}

}
702
=head2 sort_arr

Sort array ignoring case and html in data

  my @sorted = $webpac->sort_arr(@unsorted);

Values are compared case-insensitively with html tags removed and returned
in descending order. Tags are ignored only for comparison: neither the
returned values nor the arguments are modified.

=cut

sub sort_arr {
	my $self = shift;

	my $log = $self->_get_logger();

	# Schwartzian transformation: sort key (value without html tags, in
	# lower case) is computed once for each value. Working on a copy is
	# important because elements of @_ (and $a and $b inside sort block)
	# are aliases of caller's data, which must not be changed.
	my @sorted =
		map  { $_->[1] }
		sort { $b->[0] cmp $a->[0] }
		map  {
			my $key = $_;
			$key =~ s#<[^>]+/*>##g;
			[ lc($key), $_ ];
		} @_;
	$log->debug("sorted values: ",sub { join(", ",@sorted) });

	return @sorted;
}
727
728
729	=head1 INTERNAL METHODS
730
=head2 _sort_by_order

Sort xml tags data structure according to C<order=""> attribute.

Compares entries of C<< $self->{'import_xml'}->{'indexer'} >> named by
C<$a> and C<$b> (package variables, as set by C<sort>). Entry can be a hash
with C<order> key or a plain value which is then used as order. Entries
without order are compared as 0.

=cut

sub _sort_by_order {
	my $self = shift;

	my $indexer = $self->{'import_xml'}->{'indexer'} || {};

	my ($va, $vb) = map {
		# fetch entry first and dereference it only if it's a hash:
		# plain value can't be dereferenced under strict refs, and
		# looking directly for 'order' would create missing entries
		my $entry = $indexer->{$_};
		( ref($entry) eq 'HASH' ? $entry->{'order'} : $entry ) || 0;
	} ($a, $b);

	return $va <=> $vb;
}
747
=head2 _x

Convert strings from C<conf/normalize/*.xml> encoding into application
specific encoding (optionally specified using C<code_page> to C<new>
constructor).

  my $text = $n->_x('normalize text string');

This implementation is a stub which returns its argument unchanged, so that
other modules don't have to implement it.

=cut

sub _x {
	my ($self, $text) = @_;

	# nothing to convert here, hand the string back as it is
	return $text;
}
764
765
766	=head1 AUTHOR
767
768	Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
769
770	=head1 COPYRIGHT & LICENSE
771
772	Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
773
774	This program is free software; you can redistribute it and/or modify it
775	under the same terms as Perl itself.
776
777	=cut
778
779	1; # End of WebPAC::Normalize