--- trunk/lib/WebPAC/Normalize.pm	2005/07/16 20:35:30	10
+++ trunk/lib/WebPAC/Normalize.pm	2005/12/16 14:40:55	260
@@ -2,33 +2,700 @@
 
 use warnings;
 use strict;
+use base 'WebPAC::Common';
+use Data::Dumper;
 
 =head1 NAME
 
-WebPAC::Normalize - normalisation of source file
+WebPAC::Normalize - data mungling for normalisation
 
 =head1 VERSION
 
-Version 0.01
+Version 0.06
 
 =cut
 
-our $VERSION = '0.01';
+our $VERSION = '0.06';
 
 =head1 SYNOPSIS
 
-This package contains code that could be helpful in implementing different
-normalisation front-ends.
+This package contains code that mungle data to produce normalized format.
+
+It contains several assumptions:
+
+=over
+
+=item *
+
+format of fields is defined using C<v123^a> notation for repeatable fields
+or C<s123^a> for single (or first) value, where C<123> is field number and
+C<a> is subfield.
+
+=item *
+
+source data records (C<$rec>) have unique identifiers in field C<000>
+
+=item *
+
+optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
+perl code that is evaluated before producing output (value of field will be
+interpolated before that)
+
+=item *
+
+optional C<filter{filter_name}> at B<begining of format> will apply perl
+code defined as code ref on format after field substitution to producing
+output
+
+There is one built-in filter called C<regex> which can be use like this:
+
+  filter{regex(s/foo/bar/)}
+
+=item *
+
+optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
+
+=item *
+
+at end, optional C<format>s rules are resolved. Format rules are similar to
+C<sprintf> and can also contain C<lookup{...}> which is performed after
+values are inserted in format.
+
+=back
+
+This also describes order in which transformations are applied (eval,
+filter, lookup, format) which is important to undestand when deciding how to
+solve your data mungling and normalisation process.
+
+
+
 
 =head1 FUNCTIONS
 
-=head2 none_yet
+=head2 new
+
+Create new normalisation object
+
+  my $n = new WebPAC::Normalize::Something(
+ 	filter => {
+		'filter_name_1' => sub {
+			# filter code
+			return length($_);
+		}, ...
+	},
+	db => $db_obj,
+	lookup_regex => $lookup->regex,
+	lookup => $lookup_obj,
+	prefix => 'foobar',
+  );
+
+Parametar C<filter> defines user supplied snippets of perl code which can
+be use with C<filter{...}> notation.
+
+C<prefix> is used to form filename for database record (to support multiple
+source files which are joined in one database).
+
+Recommended parametar C<lookup_regex> is used to enable parsing of lookups
+in structures. If you pass this parametar, you must also pass C<lookup>
+which is C<WebPAC::Lookup> object.
+
+=cut
+
+sub new {
+	my $class = shift;
+        my $self = {@_};
+        bless($self, $class);
+
+	my $r = $self->{'lookup_regex'} ? 1 : 0;
+	my $l = $self->{'lookup'} ? 1 : 0;
+
+	my $log = $self->_get_logger();
+
+	# those two must be in pair
+	if ( ($r & $l) != ($r || $l) ) {
+		my $log = $self->_get_logger();
+		$log->logdie("lookup_regex and lookup must be in pair");
+	}
+
+	$log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
+
+	$log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
+
+	$log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
+
+	if ($self->{filter} && ! $self->{filter}->{regex}) {
+		$log->debug("adding built-in filter regex");
+		$self->{filter}->{regex} = sub {
+			my ($val, $regex) = @_;
+			eval "\$val =~ $regex";
+			return $val;
+		};
+	}
+
+	$self ? return $self : return undef;
+}
+
+
+=head2 data_structure
+
+Create in-memory data structure which represents normalized layout from
+C<conf/normalize/*.xml>.
+
+This structures are used to produce output.
+
+ my $ds = $webpac->data_structure($rec);
+
+=cut
+
+sub data_structure {
+	my $self = shift;
+
+	my $log = $self->_get_logger();
+
+	my $rec = shift;
+	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+
+	$log->debug("data_structure rec = ", sub { Dumper($rec) });
+
+	$log->logdie("need unique ID (mfn) in field 000 of record ", sub { Dumper($rec) } ) unless (defined($rec->{'000'}));
+
+	my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
+
+	my $cache_file;
+
+	if ($self->{'db'}) {
+		my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
+		$log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
+		return $ds if ($ds);
+		$log->debug("cache miss, creating");
+	}
+
+	undef $self->{'currnet_filename'};
+	undef $self->{'headline'};
+
+	my @sorted_tags;
+	if ($self->{tags_by_order}) {
+		@sorted_tags = @{$self->{tags_by_order}};
+	} else {
+		@sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
+		$self->{tags_by_order} = \@sorted_tags;
+	}
+
+	my $ds;
+
+	$log->debug("tags: ",sub { join(", ",@sorted_tags) });
+
+	foreach my $field (@sorted_tags) {
+
+		my $row;
+
+#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
+
+		foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
+			my $format;
+
+			$log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
+			$format = $tag->{'value'} || $tag->{'content'};
+
+			$log->debug("format: $format");
+
+			my @v;
+			if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
+				@v = $self->fill_in_to_arr($rec,$format);
+			} else {
+				@v = $self->parse_to_arr($rec,$format);
+			}
+			next if (! @v);
+
+			if ($tag->{'sort'}) {
+				@v = $self->sort_arr(@v);
+			}
+
+			# use format?
+			if ($tag->{'format_name'}) {
+				@v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
+			}
+
+			# delimiter will join repeatable fields
+			if ($tag->{'delimiter'}) {
+				@v = ( join($tag->{'delimiter'}, @v) );
+			}
+
+			# default types 
+			my @types = qw(display search);
+			# override by type attribute
+			@types = ( $tag->{'type'} ) if ($tag->{'type'});
+
+			foreach my $type (@types) {
+				# append to previous line?
+				$log->debug("type: $type ",sub { join(" ",@v) }, " ", $row->{'append'} || 'no append');
+				if ($tag->{'append'}) {
+
+					# I will delimit appended part with
+					# delimiter (or ,)
+					my $d = $tag->{'delimiter'};
+					# default delimiter
+					$d ||= " ";
+
+					my $last = pop @{$row->{$type}};
+					$d = "" if (! $last);
+					$last .= $d . join($d, @v);
+					push @{$row->{$type}}, $last;
+
+				} else {
+					push @{$row->{$type}}, @v;
+				}
+			}
+
+
+		}
+
+		if ($row) {
+			$row->{'tag'} = $field;
+
+			# TODO: name_sigular, name_plural
+			my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
+			my $row_name = $name ? $self->_x($name) : $field;
+
+			# post-sort all values in field
+			if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
+				$log->warn("sort at field tag not implemented");
+			}
+
+			$ds->{$row_name} = $row;
+
+			$log->debug("row $field: ",sub { Dumper($row) });
+		}
+
+	}
+
+	$self->{'db'}->save_ds(
+		id => $id,
+		ds => $ds,
+		prefix => $self->{prefix},
+	) if ($self->{'db'});
+
+	$log->debug("ds: ", sub { Dumper($ds) });
+
+	$log->logconfess("data structure returned is not array any more!") if wantarray;
+
+	return $ds;
+
+}
+
+=head2 parse
+
+Perform smart parsing of string, skipping delimiters for fields which aren't
+defined. It can also eval code in format starting with C<eval{...}> and
+return output or nothing depending on eval code.
+
+ my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
+
+Filters are implemented here. While simple form of filters looks like this:
+
+  filter{name_of_filter}
+
+but, filters can also have variable number of parametars like this:
+
+  filter{name_of_filter(param,param,param)}
+
+=cut
+
+sub parse {
+	my $self = shift;
+
+	my ($rec, $format_utf8, $i) = @_;
+
+	return if (! $format_utf8);
+
+	my $log = $self->_get_logger();
+
+	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+
+	$i = 0 if (! $i);
+
+	my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
+
+	my @out;
+
+	$log->debug("format: $format");
+
+	my $eval_code;
+	# remove eval{...} from beginning
+	$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
+
+	my $filter_name;
+	# remove filter{...} from beginning
+	$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
+
+	my $prefix;
+	my $all_found=0;
+
+	while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
+
+		my $del = $1 || '';
+		$prefix ||= $del if ($all_found == 0);
+
+		# repeatable index
+		my $r = $i;
+		$r = 0 if (lc("$2") eq 's');
+
+		my $found = 0;
+		my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
+
+		if ($found) {
+			push @out, $del;
+			push @out, $tmp;
+			$all_found += $found;
+		}
+	}
+
+	return if (! $all_found);
+
+	my $out = join('',@out);
+
+	if ($out) {
+		# add rest of format (suffix)
+		$out .= $format;
+
+		# add prefix if not there
+		$out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
+
+		$log->debug("result: $out");
+	}
+
+	if ($eval_code) {
+		my $eval = $self->fill_in($rec,$eval_code,$i) || return;
+		$log->debug("about to eval{$eval} format: $out");
+		return if (! $self->_eval($eval));
+	}
+	
+	if ($filter_name) {
+		my @filter_args;
+		if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
+			@filter_args = split(/,/, $2);
+		}
+		if ($self->{'filter'}->{$filter_name}) {
+			$log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
+			unshift @filter_args, $out;
+			$out = $self->{'filter'}->{$filter_name}->(@filter_args);
+			return unless(defined($out));
+			$log->debug("filter result: $out");
+		} else {
+			$log->warn("trying to use undefined filter $filter_name");
+		}
+	}
+
+	return $out;
+}
+
+=head2 parse_to_arr
+
+Similar to C<parse>, but returns array of all repeatable fields
+
+ my @arr = $webpac->parse_to_arr($rec,'v250^a');
 
 =cut
 
-sub none_yet {
+sub parse_to_arr {
+	my $self = shift;
+
+	my ($rec, $format_utf8) = @_;
+
+	my $log = $self->_get_logger();
+
+	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+	return if (! $format_utf8);
+
+	my $i = 0;
+	my @arr;
+
+	while (my $v = $self->parse($rec,$format_utf8,$i++)) {
+		push @arr, $v;
+	}
+
+	$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
+
+	return @arr;
 }
 
+
+=head2 fill_in
+
+Workhourse of all: takes record from in-memory structure of database and
+strings with placeholders and returns string or array of with substituted
+values from record.
+
+ my $text = $webpac->fill_in($rec,'v250^a');
+
+Optional argument is ordinal number for repeatable fields. By default,
+it's assume to be first repeatable field (fields are perl array, so first
+element is 0).
+Following example will read second value from repeatable field.
+
+ my $text = $webpac->fill_in($rec,'Title: v250^a',1);
+
+This function B<does not> perform parsing of format to inteligenty skip
+delimiters before fields which aren't used.
+
+This method will automatically decode UTF-8 string to local code page
+if needed.
+
+=cut
+
+sub fill_in {
+	my $self = shift;
+
+	my $log = $self->_get_logger();
+
+	my $rec = shift || $log->logconfess("need data record");
+	my $format = shift || $log->logconfess("need format to parse");
+	# iteration (for repeatable fields)
+	my $i = shift || 0;
+
+	$log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
+
+	# FIXME remove for speedup?
+	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+
+	if (utf8::is_utf8($format)) {
+		$format = $self->_x($format);
+	}
+
+	my $found = 0;
+
+	my $eval_code;
+	# remove eval{...} from beginning
+	$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
+
+	my $filter_name;
+	# remove filter{...} from beginning
+	$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
+
+	# do actual replacement of placeholders
+	# repeatable fields
+	$format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
+	# non-repeatable fields
+	$format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
+
+	if ($found) {
+		$log->debug("format: $format");
+		if ($eval_code) {
+			my $eval = $self->fill_in($rec,$eval_code,$i);
+			return if (! $self->_eval($eval));
+		}
+		if ($filter_name && $self->{'filter'}->{$filter_name}) {
+			$log->debug("filter '$filter_name' for $format");
+			$format = $self->{'filter'}->{$filter_name}->($format);
+			return unless(defined($format));
+			$log->debug("filter result: $format");
+		}
+		# do we have lookups?
+		if ($self->{'lookup'}) {
+			if ($self->{'lookup'}->can('lookup')) {
+				my @lookup = $self->{lookup}->lookup($format);
+				$log->debug("lookup $format", join(", ", @lookup));
+				return @lookup;
+			} else {
+				$log->warn("Have lookup object but can't invoke lookup method");
+			}
+		} else {
+			return $format;
+		}
+	} else {
+		return;
+	}
+}
+
+
+=head2 fill_in_to_arr
+
+Similar to C<fill_in>, but returns array of all repeatable fields. Usable
+for fields which have lookups, so they shouldn't be parsed but rather
+C<fill_id>ed.
+
+ my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
+
+=cut
+
+sub fill_in_to_arr {
+	my $self = shift;
+
+	my ($rec, $format_utf8) = @_;
+
+	my $log = $self->_get_logger();
+
+	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+	return if (! $format_utf8);
+
+	my $i = 0;
+	my @arr;
+
+	while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
+		push @arr, @v;
+	}
+
+	$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
+
+	return @arr;
+}
+
+
+=head2 get_data
+
+Returns value from record.
+
+ my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
+
+Arguments are:
+record reference C<$rec>,
+field C<$f>,
+optional subfiled C<$sf>,
+index for repeatable values C<$i>.
+
+Optinal variable C<$found> will be incremeted if there
+is field.
+
+Returns value or empty string.
+
+=cut
+
+sub get_data {
+	my $self = shift;
+
+	my ($rec,$f,$sf,$i,$found) = @_;
+
+	if ($$rec->{$f}) {
+		return '' if (! $$rec->{$f}->[$i]);
+		no strict 'refs';
+		if ($sf && $$rec->{$f}->[$i]->{$sf}) {
+			$$found++ if (defined($$found));
+			return $$rec->{$f}->[$i]->{$sf};
+		} elsif (! $sf && $$rec->{$f}->[$i]) {
+			$$found++ if (defined($$found));
+			# it still might have subfield, just
+			# not specified, so we'll dump all
+			if ($$rec->{$f}->[$i] =~ /HASH/o) {
+				my $out;
+				foreach my $k (keys %{$$rec->{$f}->[$i]}) {
+					$out .= $$rec->{$f}->[$i]->{$k}." ";
+				}
+				return $out;
+			} else {
+				return $$rec->{$f}->[$i];
+			}
+		} else {
+			return '';
+		}
+	} else {
+		return '';
+	}
+}
+
+
+=head2 apply_format
+
+Apply format specified in tag with C<format_name="name"> and
+C<format_delimiter=";;">.
+
+ my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
+
+Formats can contain C<lookup{...}> if you need them.
+
+=cut
+
+sub apply_format {
+	my $self = shift;
+
+	my ($name,$delimiter,$data) = @_;
+
+	my $log = $self->_get_logger();
+
+	if (! $self->{'import_xml'}->{'format'}->{$name}) {
+		$log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
+		return $data;
+	}
+
+	$log->warn("no delimiter for format $name") if (! $delimiter);
+
+	my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
+
+	my @data = split(/\Q$delimiter\E/, $data);
+
+	my $out = sprintf($format, @data);
+	$log->debug("using format $name [$format] on $data to produce: $out");
+
+	if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
+		return $self->{'lookup'}->lookup($out);
+	} else {
+		return $out;
+	}
+
+}
+
+=head2 sort_arr
+
+Sort array ignoring case and html in data
+
+ my @sorted = $webpac->sort_arr(@unsorted);
+
+=cut
+
+sub sort_arr {
+	my $self = shift;
+
+	my $log = $self->_get_logger();
+
+	# FIXME add Schwartzian Transformation?
+
+	my @sorted = sort {
+		$a =~ s#<[^>]+/*>##;
+		$b =~ s#<[^>]+/*>##;
+		lc($b) cmp lc($a)
+	} @_;
+	$log->debug("sorted values: ",sub { join(", ",@sorted) });
+
+	return @sorted;
+}
+
+
+=head1 INTERNAL METHODS
+
+=head2 _sort_by_order
+
+Sort xml tags data structure accoding to C<order=""> attribute.
+
+=cut
+
+sub _sort_by_order {
+	my $self = shift;
+
+	my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
+		$self->{'import_xml'}->{'indexer'}->{$a};
+	my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
+		$self->{'import_xml'}->{'indexer'}->{$b};
+
+	return $va <=> $vb;
+}
+
+=head2 _x
+
+Convert strings from C<conf/normalize/*.xml> encoding into application
+specific encoding (optinally specified using C<code_page> to C<new>
+constructor).
+
+ my $text = $n->_x('normalize text string');
+
+This is a stub so that other modules doesn't have to implement it.
+
+=cut
+
+sub _x {
+	my $self = shift;
+	return shift;
+}
+
+
 =head1 AUTHOR
 
 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
@@ -42,4 +709,4 @@
 
 =cut
 
-1; # End of WebPAC::DB
+1; # End of WebPAC::Normalize