--- trunk/lib/WebPAC/Normalize.pm	2005/07/16 20:35:30	10
+++ trunk/lib/WebPAC/Normalize.pm	2005/07/16 23:56:14	13
@@ -2,6 +2,7 @@
 
 use warnings;
 use strict;
+use Data::Dumper;
 
 =head1 NAME
 
@@ -22,13 +23,501 @@
 
 =head1 FUNCTIONS
 
-=head2 none_yet
+=head2 new
+
+Create new normalisation object
+
+  my $n = new WebPAC::Normalize::Something(
+	cache_data_structure => './cache/ds/',
+	lookup_regex => $lookup->regex,
+  );
+
+Optional parameter C<cache_data_structure> defines path to directory
+in which cache file for C<data_structure> call will be created.
+
+Recommended parametar C<lookup_regex> is used to enable parsing of lookups
+in structures.
+
+=cut
+
+sub new {
+	my $class = shift;
+        my $self = {@_};
+        bless($self, $class);
+
+	$self->setup_cache_dir( $self->{'cache_data_structure'} );
+
+	$self ? return $self : return undef;
+}
+
+=head2 setup_cache_dir
+
+Check if specified cache directory exist, and if not, disable caching.
+
+ $setup_cache_dir('./cache/ds/');
+
+If you pass false or zero value to this function, it will disable
+cacheing.
 
 =cut
 
-sub none_yet {
+sub setup_cache_dir {
+	my $self = shift;
+
+	my $dir = shift;
+
+	my $log = $self->_get_logger();
+
+	if ($dir) {
+		my $msg;
+		if (! -e $dir) {
+			$msg = "doesn't exist";
+		} elsif (! -d $dir) {
+			$msg = "is not directory";
+		} elsif (! -w $dir) {
+			$msg = "not writable";
+		}
+
+		if ($msg) {
+			undef $self->{'cache_data_structure'};
+			$log->warn("cache_data_structure $dir $msg, disabling...");
+		} else {
+			$log->debug("using cache dir $dir");
+		}
+	} else {
+		$log->debug("disabling cache");
+		undef $self->{'cache_data_structure'};
+	}
 }
 
+
+=head2 data_structure
+
+Create in-memory data structure which represents normalized layout from
+C<conf/normalize/*.xml>.
+
+This structures are used to produce output.
+
+ my @ds = $webpac->data_structure($rec);
+
+B<Note: historical oddity follows>
+
+This method will also set C<< $webpac->{'currnet_filename'} >> if there is
+C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
+C<< <headline> >> tag.
+
+=cut
+
+sub data_structure {
+	my $self = shift;
+
+	my $log = $self->_get_logger();
+
+	my $rec = shift;
+	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+
+	my $cache_file;
+
+	if (my $cache_path = $self->{'cache_data_structure'}) {
+		my $id = $rec->{'000'};
+		$id = $rec->{'000'}->[0] if ($id =~ m/^ARRAY/o);
+		unless (defined($id)) {
+			$log->warn("Can't use cache_data_structure on records without unique identifier in field 000");
+			undef $self->{'cache_data_structure'};
+		} else {
+			$cache_file = "$cache_path/$id";
+			if (-r $cache_file) {
+				my $ds_ref = retrieve($cache_file);
+				if ($ds_ref) {
+					$log->debug("cache hit: $cache_file");
+					my $ok = 1;
+					foreach my $f (qw(current_filename headline)) {
+						if ($ds_ref->{$f}) {
+							$self->{$f} = $ds_ref->{$f};
+						} else {
+							$ok = 0;
+						}
+					};
+					if ($ok && $ds_ref->{'ds'}) {
+						return @{ $ds_ref->{'ds'} };
+					} else {
+						$log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!");
+						undef $self->{'cache_data_structure'};
+					}
+				}
+			}
+		}
+	}
+
+	undef $self->{'currnet_filename'};
+	undef $self->{'headline'};
+
+	my @sorted_tags;
+	if ($self->{tags_by_order}) {
+		@sorted_tags = @{$self->{tags_by_order}};
+	} else {
+		@sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
+		$self->{tags_by_order} = \@sorted_tags;
+	}
+
+	my @ds;
+
+	$log->debug("tags: ",sub { join(", ",@sorted_tags) });
+
+	foreach my $field (@sorted_tags) {
+
+		my $row;
+
+#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
+
+		foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
+			my $format = $tag->{'value'} || $tag->{'content'};
+
+			$log->debug("format: $format");
+
+			my @v;
+			if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
+				@v = $self->fill_in_to_arr($rec,$format);
+			} else {
+				@v = $self->parse_to_arr($rec,$format);
+			}
+			next if (! @v);
+
+			if ($tag->{'sort'}) {
+				@v = $self->sort_arr(@v);
+			}
+
+			# use format?
+			if ($tag->{'format_name'}) {
+				@v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
+			}
+
+			if ($field eq 'filename') {
+				$self->{'current_filename'} = join('',@v);
+				$log->debug("filename: ",$self->{'current_filename'});
+			} elsif ($field eq 'headline') {
+				$self->{'headline'} .= join('',@v);
+				$log->debug("headline: ",$self->{'headline'});
+				next; # don't return headline in data_structure!
+			}
+
+			# delimiter will join repeatable fields
+			if ($tag->{'delimiter'}) {
+				@v = ( join($tag->{'delimiter'}, @v) );
+			}
+
+			# default types 
+			my @types = qw(display swish);
+			# override by type attribute
+			@types = ( $tag->{'type'} ) if ($tag->{'type'});
+
+			foreach my $type (@types) {
+				# append to previous line?
+				$log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
+				if ($tag->{'append'}) {
+
+					# I will delimit appended part with
+					# delimiter (or ,)
+					my $d = $tag->{'delimiter'};
+					# default delimiter
+					$d ||= " ";
+
+					my $last = pop @{$row->{$type}};
+					$d = "" if (! $last);
+					$last .= $d . join($d, @v);
+					push @{$row->{$type}}, $last;
+
+				} else {
+					push @{$row->{$type}}, @v;
+				}
+			}
+
+
+		}
+
+		if ($row) {
+			$row->{'tag'} = $field;
+
+			# TODO: name_sigular, name_plural
+			my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
+			$row->{'name'} = $name ? $self->_x($name) : $field;
+
+			# post-sort all values in field
+			if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
+				$log->warn("sort at field tag not implemented");
+			}
+
+			push @ds, $row;
+
+			$log->debug("row $field: ",sub { Dumper($row) });
+		}
+
+	}
+
+	if ($cache_file) {
+		store {
+			ds => \@ds,
+			current_filename => $self->{'current_filename'},
+			headline => $self->{'headline'},
+		}, $cache_file;
+		$log->debug("created storable cache file $cache_file");
+	}
+
+	return @ds;
+
+}
+
+=head2 apply_format
+
+Apply format specified in tag with C<format_name="name"> and
+C<format_delimiter=";;">.
+
+ my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
+
+Formats can contain C<lookup{...}> if you need them.
+
+=cut
+
+sub apply_format {
+	my $self = shift;
+
+	my ($name,$delimiter,$data) = @_;
+
+	my $log = $self->_get_logger();
+
+	if (! $self->{'import_xml'}->{'format'}->{$name}) {
+		$log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
+		return $data;
+	}
+
+	$log->warn("no delimiter for format $name") if (! $delimiter);
+
+	my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
+
+	my @data = split(/\Q$delimiter\E/, $data);
+
+	my $out = sprintf($format, @data);
+	$log->debug("using format $name [$format] on $data to produce: $out");
+
+	if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
+		return $self->lookup($out);
+	} else {
+		return $out;
+	}
+
+}
+
+=head2 parse
+
+Perform smart parsing of string, skipping delimiters for fields which aren't
+defined. It can also eval code in format starting with C<eval{...}> and
+return output or nothing depending on eval code.
+
+ my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
+
+=cut
+
+sub parse {
+	my $self = shift;
+
+	my ($rec, $format_utf8, $i) = @_;
+
+	return if (! $format_utf8);
+
+	my $log = $self->_get_logger();
+
+	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+
+	$i = 0 if (! $i);
+
+	my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
+
+	my @out;
+
+	$log->debug("format: $format");
+
+	my $eval_code;
+	# remove eval{...} from beginning
+	$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
+
+	my $filter_name;
+	# remove filter{...} from beginning
+	$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
+
+	my $prefix;
+	my $all_found=0;
+
+	while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
+
+		my $del = $1 || '';
+		$prefix ||= $del if ($all_found == 0);
+
+		# repeatable index
+		my $r = $i;
+		$r = 0 if (lc("$2") eq 's');
+
+		my $found = 0;
+		my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
+
+		if ($found) {
+			push @out, $del;
+			push @out, $tmp;
+			$all_found += $found;
+		}
+	}
+
+	return if (! $all_found);
+
+	my $out = join('',@out);
+
+	if ($out) {
+		# add rest of format (suffix)
+		$out .= $format;
+
+		# add prefix if not there
+		$out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
+
+		$log->debug("result: $out");
+	}
+
+	if ($eval_code) {
+		my $eval = $self->fill_in($rec,$eval_code,$i) || return;
+		$log->debug("about to eval{$eval} format: $out");
+		return if (! $self->_eval($eval));
+	}
+	
+	if ($filter_name && $self->{'filter'}->{$filter_name}) {
+		$log->debug("about to filter{$filter_name} format: $out");
+		$out = $self->{'filter'}->{$filter_name}->($out);
+		return unless(defined($out));
+		$log->debug("filter result: $out");
+	}
+
+	return $out;
+}
+
+=head2 parse_to_arr
+
+Similar to C<parse>, but returns array of all repeatable fields
+
+ my @arr = $webpac->parse_to_arr($rec,'v250^a');
+
+=cut
+
+sub parse_to_arr {
+	my $self = shift;
+
+	my ($rec, $format_utf8) = @_;
+
+	my $log = $self->_get_logger();
+
+	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+	return if (! $format_utf8);
+
+	my $i = 0;
+	my @arr;
+
+	while (my $v = $self->parse($rec,$format_utf8,$i++)) {
+		push @arr, $v;
+	}
+
+	$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
+
+	return @arr;
+}
+
+=head2 fill_in_to_arr
+
+Similar to C<fill_in>, but returns array of all repeatable fields. Usable
+for fields which have lookups, so they shouldn't be parsed but rather
+C<fill_id>ed.
+
+ my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
+
+=cut
+
+sub fill_in_to_arr {
+	my $self = shift;
+
+	my ($rec, $format_utf8) = @_;
+
+	my $log = $self->_get_logger();
+
+	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+	return if (! $format_utf8);
+
+	my $i = 0;
+	my @arr;
+
+	while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
+		push @arr, @v;
+	}
+
+	$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
+
+	return @arr;
+}
+
+=head2 sort_arr
+
+Sort array ignoring case and html in data
+
+ my @sorted = $webpac->sort_arr(@unsorted);
+
+=cut
+
+sub sort_arr {
+	my $self = shift;
+
+	my $log = $self->_get_logger();
+
+	# FIXME add Schwartzian Transformation?
+
+	my @sorted = sort {
+		$a =~ s#<[^>]+/*>##;
+		$b =~ s#<[^>]+/*>##;
+		lc($b) cmp lc($a)
+	} @_;
+	$log->debug("sorted values: ",sub { join(", ",@sorted) });
+
+	return @sorted;
+}
+
+
+=head2 _sort_by_order
+
+Sort xml tags data structure accoding to C<order=""> attribute.
+
+=cut
+
+sub _sort_by_order {
+	my $self = shift;
+
+	my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
+		$self->{'import_xml'}->{'indexer'}->{$a};
+	my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
+		$self->{'import_xml'}->{'indexer'}->{$b};
+
+	return $va <=> $vb;
+}
+
+=head2 _x
+
+Convert strings from C<conf/normalize> encoding into application specific
+(optinally specified using C<code_page> to C<new> constructor.
+
+ my $text = $n->_x('normalize text string');
+
+This is a stub so that other modules doesn't have to implement it.
+
+=cut
+
+sub _x {
+	my $self = shift;
+	return shift;
+}
+
+
 =head1 AUTHOR
 
 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>