--- trunk/lib/WebPAC/Normalize.pm	2005/07/17 00:04:25	14
+++ trunk/lib/WebPAC/Normalize.pm	2005/11/24 11:47:15	125
@@ -2,25 +2,69 @@
 
 use warnings;
 use strict;
+use base 'WebPAC::Common';
 use Data::Dumper;
-use Storable;
 
 =head1 NAME
 
-WebPAC::Normalize - normalisation of source file
+WebPAC::Normalize - data munging for normalisation
 
 =head1 VERSION
 
-Version 0.01
+Version 0.02
 
 =cut
 
-our $VERSION = '0.01';
+our $VERSION = '0.02';
 
 =head1 SYNOPSIS
 
-This package contains code that could be helpful in implementing different
-normalisation front-ends.
+This package contains code that munges data to produce a normalized format.
+
+It contains several assumptions:
+
+=over
+
+=item *
+
+format of fields is defined using C<v123^a> notation for repeatable fields
+or C<s123^a> for single (or first) value, where C<123> is field number and
+C<a> is subfield.
+
+=item *
+
+source data records (C<$rec>) have unique identifiers in field C<000>
+
+=item *
+
+optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
+perl code that is evaluated before producing output (value of field will be
+interpolated before that)
+
+=item *
+
+optional C<filter{filter_name}> at B<beginning of format> will apply perl
+code defined as code ref to format after field substitution, before
+producing output
+
+=item *
+
+optional C<lookup{...}> will then be performed. See C<WebPAC::Lookup>.
+
+=item *
+
+at end, optional C<format>s rules are resolved. Format rules are similar to
+C<sprintf> and can also contain C<lookup{...}> which is performed after
+values are inserted in format.
+
+=back
+
+This also describes order in which transformations are applied (eval,
+filter, lookup, format) which is important to understand when deciding how to
+solve your data munging and normalisation process.
+
+
+
 
 =head1 FUNCTIONS
 
@@ -29,15 +73,23 @@
 Create new normalisation object
 
   my $n = new WebPAC::Normalize::Something(
-	cache_data_structure => './cache/ds/',
+ 	filter => {
+		'filter_name_1' => sub {
+			# filter code
+			return length($_);
+		}, ...
+	},
+	db => $db_obj,
 	lookup_regex => $lookup->regex,
+	lookup => $lookup_obj,
   );
 
-Optional parameter C<cache_data_structure> defines path to directory
-in which cache file for C<data_structure> call will be created.
+Parameter C<filter> defines user supplied snippets of perl code which can
+be used with C<filter{...}> notation.
 
 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
-in structures.
+in structures. If you pass this parameter, you must also pass C<lookup>
+which is C<WebPAC::Lookup> object.
 
 =cut
 
@@ -46,49 +98,20 @@
         my $self = {@_};
         bless($self, $class);
 
-	$self->setup_cache_dir( $self->{'cache_data_structure'} );
-
-	$self ? return $self : return undef;
-}
-
-=head2 setup_cache_dir
-
-Check if specified cache directory exist, and if not, disable caching.
-
- $setup_cache_dir('./cache/ds/');
-
-If you pass false or zero value to this function, it will disable
-cacheing.
-
-=cut
-
-sub setup_cache_dir {
-	my $self = shift;
-
-	my $dir = shift;
+	my $r = $self->{'lookup_regex'} ? 1 : 0;
+	my $l = $self->{'lookup'} ? 1 : 0;
 
 	my $log = $self->_get_logger();
 
-	if ($dir) {
-		my $msg;
-		if (! -e $dir) {
-			$msg = "doesn't exist";
-		} elsif (! -d $dir) {
-			$msg = "is not directory";
-		} elsif (! -w $dir) {
-			$msg = "not writable";
-		}
-
-		if ($msg) {
-			undef $self->{'cache_data_structure'};
-			$log->warn("cache_data_structure $dir $msg, disabling...");
-		} else {
-			$log->debug("using cache dir $dir");
-		}
-	} else {
-		$log->debug("disabling cache");
-		undef $self->{'cache_data_structure'};
+	# those two must be in pair
+	if ( ($r & $l) != ($r || $l) ) {
+		my $log = $self->_get_logger();
+		$log->logdie("lookup_regex and lookup must be in pair");
 	}
+
+	$log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
+
+	$self ? return $self : return undef;
 }
 
 
@@ -99,13 +122,7 @@
 
 This structures are used to produce output.
 
- my @ds = $webpac->data_structure($rec);
-
-B<Note: historical oddity follows>
-
-This method will also set C<< $webpac->{'currnet_filename'} >> if there is
-C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
-C<< <headline> >> tag.
+ my $ds = $webpac->data_structure($rec);
 
 =cut
 
@@ -117,37 +134,19 @@
 	my $rec = shift;
 	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
 
+	$log->debug("data_structure rec = ", sub { Dumper($rec) });
+
+	$log->logdie("need unique ID (mfn) in field 000 of record ", sub { Dumper($rec) } ) unless (defined($rec->{'000'}));
+
+	my $mfn = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
+
 	my $cache_file;
 
-	if (my $cache_path = $self->{'cache_data_structure'}) {
-		my $id = $rec->{'000'};
-		$id = $rec->{'000'}->[0] if ($id =~ m/^ARRAY/o);
-		unless (defined($id)) {
-			$log->warn("Can't use cache_data_structure on records without unique identifier in field 000");
-			undef $self->{'cache_data_structure'};
-		} else {
-			$cache_file = "$cache_path/$id";
-			if (-r $cache_file) {
-				my $ds_ref = retrieve($cache_file);
-				if ($ds_ref) {
-					$log->debug("cache hit: $cache_file");
-					my $ok = 1;
-					foreach my $f (qw(current_filename headline)) {
-						if ($ds_ref->{$f}) {
-							$self->{$f} = $ds_ref->{$f};
-						} else {
-							$ok = 0;
-						}
-					};
-					if ($ok && $ds_ref->{'ds'}) {
-						return @{ $ds_ref->{'ds'} };
-					} else {
-						$log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!");
-						undef $self->{'cache_data_structure'};
-					}
-				}
-			}
-		}
+	if ($self->{'db'}) {
+		my $ds = $self->{'db'}->load_ds( $mfn );
+		$log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
+		return $ds if ($ds);
+		$log->debug("cache miss, creating");
 	}
 
 	undef $self->{'currnet_filename'};
@@ -161,7 +160,7 @@
 		$self->{tags_by_order} = \@sorted_tags;
 	}
 
-	my @ds;
+	my $ds;
 
 	$log->debug("tags: ",sub { join(", ",@sorted_tags) });
 
@@ -172,7 +171,10 @@
 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
 
 		foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
-			my $format = $tag->{'value'} || $tag->{'content'};
+			my $format;
+
+			$log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
+			$format = $tag->{'value'} || $tag->{'content'};
 
 			$log->debug("format: $format");
 
@@ -193,22 +195,13 @@
 				@v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
 			}
 
-			if ($field eq 'filename') {
-				$self->{'current_filename'} = join('',@v);
-				$log->debug("filename: ",$self->{'current_filename'});
-			} elsif ($field eq 'headline') {
-				$self->{'headline'} .= join('',@v);
-				$log->debug("headline: ",$self->{'headline'});
-				next; # don't return headline in data_structure!
-			}
-
 			# delimiter will join repeatable fields
 			if ($tag->{'delimiter'}) {
 				@v = ( join($tag->{'delimiter'}, @v) );
 			}
 
 			# default types 
-			my @types = qw(display swish);
+			my @types = qw(display search);
 			# override by type attribute
 			@types = ( $tag->{'type'} ) if ($tag->{'type'});
 
@@ -241,70 +234,30 @@
 
 			# TODO: name_sigular, name_plural
 			my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
-			$row->{'name'} = $name ? $self->_x($name) : $field;
+			my $row_name = $name ? $self->_x($name) : $field;
 
 			# post-sort all values in field
 			if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
 				$log->warn("sort at field tag not implemented");
 			}
 
-			push @ds, $row;
+			$ds->{$row_name} = $row;
 
 			$log->debug("row $field: ",sub { Dumper($row) });
 		}
 
 	}
 
-	if ($cache_file) {
-		store {
-			ds => \@ds,
-			current_filename => $self->{'current_filename'},
-			headline => $self->{'headline'},
-		}, $cache_file;
-		$log->debug("created storable cache file $cache_file");
-	}
-
-	return @ds;
-
-}
-
-=head2 apply_format
-
-Apply format specified in tag with C<format_name="name"> and
-C<format_delimiter=";;">.
-
- my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
-
-Formats can contain C<lookup{...}> if you need them.
-
-=cut
-
-sub apply_format {
-	my $self = shift;
-
-	my ($name,$delimiter,$data) = @_;
-
-	my $log = $self->_get_logger();
-
-	if (! $self->{'import_xml'}->{'format'}->{$name}) {
-		$log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
-		return $data;
-	}
-
-	$log->warn("no delimiter for format $name") if (! $delimiter);
-
-	my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
+	$self->{'db'}->save_ds(
+		id => $mfn,
+		ds => $ds,
+	) if ($self->{'db'});
 
-	my @data = split(/\Q$delimiter\E/, $data);
+	$log->debug("ds: ", sub { Dumper($ds) });
 
-	my $out = sprintf($format, @data);
-	$log->debug("using format $name [$format] on $data to produce: $out");
+	$log->logconfess("data structure returned is not array any more!") if wantarray;
 
-	if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
-		return $self->lookup($out);
-	} else {
-		return $out;
-	}
+	return $ds;
 
 }
 
@@ -427,6 +380,93 @@
 	return @arr;
 }
 
+
+=head2 fill_in
+
+Workhorse of all: takes record from in-memory structure of database and
+string with placeholders and returns string with substituted values from
+record (or nothing if no placeholder produced a value).
+
+ my $text = $webpac->fill_in($rec,'v250^a');
+
+Optional argument is ordinal number for repeatable fields. By default,
+it's assumed to be first repeatable field (fields are perl array, so first
+element is 0).
+Following example will read second value from repeatable field.
+
+ my $text = $webpac->fill_in($rec,'Title: v250^a',1);
+
+This function B<does not> perform parsing of format to intelligently skip
+delimiters before fields which aren't used.
+
+This method will automatically decode UTF-8 string to local code page
+if needed.
+
+=cut
+
+sub fill_in {
+	my $self = shift;
+
+	my $log = $self->_get_logger();
+
+	my $rec = shift || $log->logconfess("need data record");
+	my $format = shift || $log->logconfess("need format to parse");
+	# iteration (for repeatable fields)
+	my $i = shift || 0;
+
+	# sanity limit on the occurrence index
+	$log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
+
+	# FIXME remove for speedup?
+	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+
+	if (utf8::is_utf8($format)) {
+		$format = $self->_x($format);
+	}
+
+	# incremented (through reference) by get_data when a placeholder yields a value
+	my $found = 0;
+
+	# remove eval{...} from beginning (literal braces are escaped because an
+	# unescaped "{" in a pattern is deprecated or rejected by newer perls)
+	my $eval_code;
+	$eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);
+
+	# remove filter{...} from beginning
+	my $filter_name;
+	$filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);
+
+	# do actual replacement of placeholders
+	# repeatable fields
+	$format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
+	# non-repeatable fields
+	$format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
+
+	# no placeholder produced a value: return nothing
+	return unless ($found);
+
+	$log->debug("format: $format");
+	if ($eval_code) {
+		my $eval = $self->fill_in($rec,$eval_code,$i);
+		return if (! $self->_eval($eval));
+	}
+	if ($filter_name && $self->{'filter'}->{$filter_name}) {
+		$log->debug("filter '$filter_name' for $format");
+		$format = $self->{'filter'}->{$filter_name}->($format);
+		return unless(defined($format));
+		$log->debug("filter result: $format");
+	}
+	# do we have lookups?
+	if ($self->{'lookup'}) {
+		return $self->{'lookup'}->lookup($format) if ($self->{'lookup'}->can('lookup'));
+		$log->warn("Have lookup object but can't invoke lookup method");
+	}
+	# no usable lookup object: return substituted string (falling off the
+	# end of sub would return result of the warn call instead)
+	return $format;
+}
+
+
 =head2 fill_in_to_arr
 
 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
@@ -459,6 +499,99 @@
 	return @arr;
 }
 
+
+=head2 get_data
+
+Returns value from record.
+
+ my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
+
+Arguments are:
+record reference C<$rec>,
+field C<$f>,
+optional subfield C<$sf>,
+index for repeatable values C<$i>.
+
+Optional variable C<$found> will be incremented if there
+is field.
+
+Returns value or empty string.
+
+=cut
+
+sub get_data {
+	my $self = shift;
+
+	my ($rec,$f,$sf,$i,$found) = @_;
+
+	# $rec is a reference to the record hash ref; $$rec->{$f} is expected to be
+	# an array ref of occurrences (plain strings or hashes keyed by subfield)
+	return '' unless ($$rec->{$f});
+	my $val = $$rec->{$f}->[$i];
+
+	# test definedness and length, not truth, so that a value of "0" is kept
+	return '' unless (defined($val) && length($val));
+
+	# real reference check: a plain string containing "HASH" is not a hash
+	my $is_hash = ref($val) && UNIVERSAL::isa($val, 'HASH');
+
+	if (defined($sf) && length($sf)) {
+		# subfield requested; subfield code "0" is valid too
+		return '' unless ($is_hash && defined($val->{$sf}) && length($val->{$sf}));
+		$$found++ if (ref($found));
+		return $val->{$sf};
+	}
+
+	# whole occurrence requested
+	$$found++ if (ref($found));
+	return $val unless ($is_hash);
+
+	# it still might have subfields, just not specified, so dump all of them
+	# in sorted (stable) order, each value followed by a single space
+	return join('', map { $val->{$_} . ' ' } sort keys %$val);
+}
+
+
+=head2 apply_format
+
+Apply format specified in tag with C<format_name="name"> and
+C<format_delimiter=";;">.
+
+ my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
+
+Formats can contain C<lookup{...}> if you need them.
+
+=cut
+
+sub apply_format {
+	my $self = shift;
+	my ($name,$delimiter,$data) = @_;
+
+	my $log = $self->_get_logger();
+
+	# unknown format name: warn and pass data through unchanged
+	if (! $self->{'import_xml'}->{'format'}->{$name}) {
+		$log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
+		return $data;
+	}
+
+	# without delimiter whole $data is the only sprintf argument (splitting on
+	# an empty pattern would break it into single characters)
+	my $has_delimiter = (defined($delimiter) && length($delimiter));
+	$log->warn("no delimiter for format $name") if (! $has_delimiter);
+
+	my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
+
+	my @data = $has_delimiter ? split(/\Q$delimiter\E/, $data) : ($data);
+
+	my $out = sprintf($format, @data);
+	$log->debug("using format $name [$format] on $data to produce: $out");
+
+	# result is handed to lookup object only when it matches lookup_regex
+	return $out unless ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'});
+	return $self->{'lookup'}->lookup($out);
+}
+
 =head2 sort_arr
 
 Sort array ignoring case and html in data
@@ -485,6 +618,8 @@
 }
 
 
+=head1 INTERNAL METHODS
+
 =head2 _sort_by_order
 
 Sort xml tags data structure accoding to C<order=""> attribute.
@@ -504,8 +639,9 @@
 
 =head2 _x
 
-Convert strings from C<conf/normalize> encoding into application specific
-(optinally specified using C<code_page> to C<new> constructor.
+Convert strings from C<conf/normalize/*.xml> encoding into application
+specific encoding (optionally specified using C<code_page> to C<new>
+constructor).
 
  my $text = $n->_x('normalize text string');