--- trunk/lib/WebPAC/Normalize.pm	2005/07/17 00:04:25	14
+++ trunk/lib/WebPAC/Normalize.pm	2005/07/17 10:42:23	15
@@ -7,7 +7,7 @@
 
 =head1 NAME
 
-WebPAC::Normalize - normalisation of source file
+WebPAC::Normalize - data mungling for normalisation
 
 =head1 VERSION
 
@@ -19,8 +19,52 @@
 
 =head1 SYNOPSIS
 
-This package contains code that could be helpful in implementing different
-normalisation front-ends.
+This package contains code that mungle data to produce normalized format.
+
+It contains several assumptions:
+
+=over
+
+=item *
+
+format of fields is defined using C<v123^a> notation for repeatable fields
+or C<s123^a> for single (or first) value, where C<123> is field number and
+C<a> is subfield.
+
+=item *
+
+source data records (C<$rec>) have unique identifiers in field C<000>
+
+=item *
+
+optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
+perl code that is evaluated before producing output (value of field will be
+interpolated before that)
+
+=item *
+
+optional C<filter{filter_name}> at B<begining of format> will apply perl
+code defined as code ref on format after field substitution to producing
+output
+
+=item *
+
+optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
+
+=item *
+
+at end, optional C<format>s rules are resolved. Format rules are similar to
+C<sprintf> and can also contain C<lookup{...}> which is performed after
+values are inserted in format.
+
+=back
+
+This also describes order in which transformations are applied (eval,
+filter, lookup, format) which is important to undestand when deciding how to
+solve your data mungling and normalisation process.
+
+
+
 
 =head1 FUNCTIONS
 
@@ -29,10 +73,19 @@
 Create new normalisation object
 
   my $n = new WebPAC::Normalize::Something(
+ 	filter => {
+		'filter_name_1' => sub {
+			# filter code
+			return length($_);
+		}, ...
+	},
 	cache_data_structure => './cache/ds/',
 	lookup_regex => $lookup->regex,
   );
 
+Parametar C<filter> defines user supplied snippets of perl code which can
+be use with C<filter{...}> notation.
+
 Optional parameter C<cache_data_structure> defines path to directory
 in which cache file for C<data_structure> call will be created.
 
@@ -268,46 +321,6 @@
 
 }
 
-=head2 apply_format
-
-Apply format specified in tag with C<format_name="name"> and
-C<format_delimiter=";;">.
-
- my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
-
-Formats can contain C<lookup{...}> if you need them.
-
-=cut
-
-sub apply_format {
-	my $self = shift;
-
-	my ($name,$delimiter,$data) = @_;
-
-	my $log = $self->_get_logger();
-
-	if (! $self->{'import_xml'}->{'format'}->{$name}) {
-		$log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
-		return $data;
-	}
-
-	$log->warn("no delimiter for format $name") if (! $delimiter);
-
-	my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
-
-	my @data = split(/\Q$delimiter\E/, $data);
-
-	my $out = sprintf($format, @data);
-	$log->debug("using format $name [$format] on $data to produce: $out");
-
-	if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
-		return $self->lookup($out);
-	} else {
-		return $out;
-	}
-
-}
-
 =head2 parse
 
 Perform smart parsing of string, skipping delimiters for fields which aren't
@@ -427,6 +440,89 @@
 	return @arr;
 }
 
+
+=head2 fill_in
+
+Workhourse of all: takes record from in-memory structure of database and
+strings with placeholders and returns string or array of with substituted
+values from record.
+
+ my $text = $webpac->fill_in($rec,'v250^a');
+
+Optional argument is ordinal number for repeatable fields. By default,
+it's assume to be first repeatable field (fields are perl array, so first
+element is 0).
+Following example will read second value from repeatable field.
+
+ my $text = $webpac->fill_in($rec,'Title: v250^a',1);
+
+This function B<does not> perform parsing of format to inteligenty skip
+delimiters before fields which aren't used.
+
+This method will automatically decode UTF-8 string to local code page
+if needed.
+
+=cut
+
+sub fill_in {
+	my $self = shift;
+
+	my $log = $self->_get_logger();
+
+	my $rec = shift || $log->logconfess("need data record");
+	my $format = shift || $log->logconfess("need format to parse");
+	# iteration (for repeatable fields)
+	my $i = shift || 0;
+
+	$log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
+
+	# FIXME remove for speedup?
+	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+
+	if (utf8::is_utf8($format)) {
+		$format = $self->_x($format);
+	}
+
+	my $found = 0;
+
+	my $eval_code;
+	# remove eval{...} from beginning
+	$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
+
+	my $filter_name;
+	# remove filter{...} from beginning
+	$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
+
+	# do actual replacement of placeholders
+	# repeatable fields
+	$format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
+	# non-repeatable fields
+	$format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
+
+	if ($found) {
+		$log->debug("format: $format");
+		if ($eval_code) {
+			my $eval = $self->fill_in($rec,$eval_code,$i);
+			return if (! $self->_eval($eval));
+		}
+		if ($filter_name && $self->{'filter'}->{$filter_name}) {
+			$log->debug("filter '$filter_name' for $format");
+			$format = $self->{'filter'}->{$filter_name}->($format);
+			return unless(defined($format));
+			$log->debug("filter result: $format");
+		}
+		# do we have lookups?
+		if ($self->{'lookup'}) {
+			return $self->lookup($format);
+		} else {
+			return $format;
+		}
+	} else {
+		return;
+	}
+}
+
+
 =head2 fill_in_to_arr
 
 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
@@ -459,6 +555,97 @@
 	return @arr;
 }
 
+
+=head2 get_data
+
+Returns value from record.
+
+ my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
+
+Arguments are:
+record reference C<$rec>,
+field C<$f>,
+optional subfiled C<$sf>,
+index for repeatable values C<$i>.
+
+Optinal variable C<$found> will be incremeted if there
+is field.
+
+Returns value or empty string.
+
+=cut
+
+sub get_data {
+	my $self = shift;
+
+	my ($rec,$f,$sf,$i,$found) = @_;
+
+	if ($$rec->{$f}) {
+		return '' if (! $$rec->{$f}->[$i]);
+		no strict 'refs';
+		if ($sf && $$rec->{$f}->[$i]->{$sf}) {
+			$$found++ if (defined($$found));
+			return $$rec->{$f}->[$i]->{$sf};
+		} elsif ($$rec->{$f}->[$i]) {
+			$$found++ if (defined($$found));
+			# it still might have subfield, just
+			# not specified, so we'll dump all
+			if ($$rec->{$f}->[$i] =~ /HASH/o) {
+				my $out;
+				foreach my $k (keys %{$$rec->{$f}->[$i]}) {
+					$out .= $$rec->{$f}->[$i]->{$k}." ";
+				}
+				return $out;
+			} else {
+				return $$rec->{$f}->[$i];
+			}
+		}
+	} else {
+		return '';
+	}
+}
+
+
+=head2 apply_format
+
+Apply format specified in tag with C<format_name="name"> and
+C<format_delimiter=";;">.
+
+ my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
+
+Formats can contain C<lookup{...}> if you need them.
+
+=cut
+
+sub apply_format {
+	my $self = shift;
+
+	my ($name,$delimiter,$data) = @_;
+
+	my $log = $self->_get_logger();
+
+	if (! $self->{'import_xml'}->{'format'}->{$name}) {
+		$log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
+		return $data;
+	}
+
+	$log->warn("no delimiter for format $name") if (! $delimiter);
+
+	my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
+
+	my @data = split(/\Q$delimiter\E/, $data);
+
+	my $out = sprintf($format, @data);
+	$log->debug("using format $name [$format] on $data to produce: $out");
+
+	if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
+		return $self->lookup($out);
+	} else {
+		return $out;
+	}
+
+}
+
 =head2 sort_arr
 
 Sort array ignoring case and html in data
@@ -485,6 +672,8 @@
 }
 
 
+=head1 INTERNAL METHODS
+
 =head2 _sort_by_order
 
 Sort xml tags data structure accoding to C<order=""> attribute.
@@ -504,8 +693,9 @@
 
 =head2 _x
 
-Convert strings from C<conf/normalize> encoding into application specific
-(optinally specified using C<code_page> to C<new> constructor.
+Convert strings from C<conf/normalize/*.xml> encoding into application
+specific encoding (optinally specified using C<code_page> to C<new>
+constructor).
 
  my $text = $n->_x('normalize text string');