--- trunk/lib/WebPAC/Normalize.pm	2006/06/26 16:39:51	536
+++ trunk/lib/WebPAC/Normalize.pm	2006/06/29 23:19:26	547
@@ -1,9 +1,12 @@
 package WebPAC::Normalize;
 use Exporter 'import';
 @EXPORT = qw/
-	set_rec set_lookup
-	get_ds clean_ds
+	_set_rec _set_lookup
+	_get_ds _clean_ds
+
 	tag search display
+	marc marc_indicators marc_repeatable_subfield
+
 	rec1 rec2 rec
 	regex prefix suffix surround
 	first lookup join_with
@@ -14,6 +17,7 @@
 
 #use base qw/WebPAC::Common/;
 use Data::Dumper;
+use Encode qw/from_to/;
 
 =head1 NAME
 
@@ -21,11 +25,11 @@
 
 =head1 VERSION
 
-Version 0.04
+Version 0.06
 
 =cut
 
-our $VERSION = '0.04';
+our $VERSION = '0.06';
 
 =head1 SYNOPSIS
 
@@ -38,22 +42,33 @@
 C<perl -c normalize.pl>.
 
 Normalisation can generate multiple output normalized data. For now, supported output
-types (on the left side of definition) are: C<tag>, C<display> and C<search>.
+types (on the left side of definition) are: C<tag>, C<display>, C<search> and
+C<marc>.
 
 =head1 FUNCTIONS
 
+Functions which start with C<_> are private and used by WebPAC internally.
+All other functions are available for use within normalisation rules.
+
 =head2 data_structure
 
 Return data structure
 
-  my $ds = WebPAC::Normalize(
+  my $ds = WebPAC::Normalize::data_structure(
   	lookup => $lookup->lookup_hash,
 	row => $row,
 	rules => $normalize_pl_config,
+	marc_encoding => 'utf-8',
   );
 
+Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
+others are optional.
+
 This function will B<die> if normalizastion can't be evaled.
 
+Since this function isn't exported you have to call it with 
+C<WebPAC::Normalize::data_structure>.
+
 =cut
 
 sub data_structure {
@@ -63,28 +78,122 @@
 	die "need normalisation argument" unless ($arg->{rules});
 
 	no strict 'subs';
-	set_lookup( $arg->{lookup} );
-	set_rec( $arg->{row} );
-	clean_ds();
+	_set_lookup( $arg->{lookup} );
+	_set_rec( $arg->{row} );
+	_clean_ds( %{ $arg } );
 	eval "$arg->{rules}";
 	die "error evaling $arg->{rules}: $@\n" if ($@);
-	return get_ds();
+
+	return _get_ds();
 }
 
-=head2 set_rec
+=head2 _set_rec
 
 Set current record hash
 
-  set_rec( $rec );
+  _set_rec( $rec );
 
 =cut
 
 my $rec;
 
-sub set_rec {
+sub _set_rec {	# keep given record in file-scoped $rec; false argument is fatal
 	$rec = shift or die "no record hash";
 }
 
+=head2 _get_ds
+
+Return hash formatted as data structure
+
+  my $ds = _get_ds();
+
+=cut
+
+my ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);	# per-record state, all reset by _clean_ds()
+# Accessor for $out; returns it as is, which is undef right after _clean_ds().
+sub _get_ds {
+	return $out;
+}
+
+=head2 _clean_ds
+
+Clean data structure hash for next record
+
+  _clean_ds();
+
+=cut
+
+sub _clean_ds {
+	my %opt = @_;		# named options; only marc_encoding is looked at
+	($out, $marc_record, $marc_repeatable_subfield, $marc_indicators) = ();	# forget previous record
+	$marc_encoding = $opt{marc_encoding};	# undef when option is missing
+}
+
+=head2 _set_lookup
+
+Set current lookup hash
+
+  _set_lookup( $lookup );
+
+=cut
+
+my $lookup;	# current lookup hash, replaced on every _set_lookup() call
+
+sub _set_lookup {
+	$lookup = shift;	# no validation: undef is accepted and simply stored
+}
+
+=head2 _get_marc_fields
+
+Get all fields defined by calls to C<marc>
+
+	$marc->add_fields( WebPAC::Normalize::_get_marc_fields() );
+
+
+
+We are using I<magic> which detects repeatable fields only from the
+sequence of field/subfield data generated by normalization.
+
+Repeatable field is created if there is second occurrence of same subfield or
+if any of indicators are different. This is sane for most cases except for
+non-repeatable fields with repeatable subfields.
+
+You can change behaviour of that using C<marc_repeatable_subfield>.
+
+=cut
+
+sub _get_marc_fields {
+	# Returns list of array refs [ field, i1, i2, sf1 => v1, sf2 => v2, ... ]
+	# built by merging consecutive rows which marc() pushed to $marc_record.
+	my @m;
+	my $last;		# field which is currently being assembled
+	my %seen;		# subfields which are already part of $last
+	foreach my $row (@{ $marc_record }) {
+		my ($f, $i1, $i2, $sf, $v) = @$row;
+		if ($last &&
+			$last->[0] eq $f &&			# check if field is same
+			$last->[1] eq $i1 &&			# check for i1
+			$last->[2] eq $i2 &&			# and for i2
+			( ! $seen{$sf} ||			# and subfield is new in this field
+			$marc_repeatable_subfield->{$sf} )	# or it is declared repeatable
+		) {
+			push @$last, ( $sf, $v );
+			$seen{$sf}++;
+			next;
+		}
+		push @m, $last if ($last);
+		# copy row: pushing to $marc_record's own row would change result of next call
+		$last = [ @$row ];
+		%seen = ( $sf => 1 );
+	}
+	push @m, $last if ($last);
+	return @m;
+}
+
+=head1 Functions to create C<data_structure>
+
+Those functions generally have to be first in your normalization file.
+
 =head2 tag
 
 Define new tag for I<search> and I<display>.
@@ -94,8 +203,6 @@
 
 =cut
 
-my $out;
-
 sub tag {
 	my $name = shift or die "tag needs name as first argument";
 	my @o = grep { defined($_) && $_ ne '' } @_;
@@ -137,44 +244,75 @@
 	$out->{$name}->{search} = \@o;
 }
 
-=head2 get_ds
+=head2 marc
 
-Return hash formatted as data structure
+Save value for MARC field
 
-  my $ds = get_ds();
+  marc('900','a', rec('200','a') );
 
 =cut
 
-sub get_ds {
-	return $out;
+sub marc {
+	# Usage: marc( field, subfield, values... ) - each non-blank value is
+	# appended to $marc_record as [ field, i1, i2, subfield => value ]
+	my $f = shift or die "marc needs field";
+	die "marc field must be number" unless ($f =~ /^\d+$/);
+	# check with defined/length, plain truth test would reject subfield '0'
+	my $sf = shift;
+	die "marc needs subfield" unless (defined($sf) && length($sf));
+	# missing indicators are blank, but avoid || which would blank out 0 too
+	my $ind = $marc_indicators->{$f} || {};
+	my ($i1, $i2) = map { defined($_) ? $_ : ' ' } @{$ind}{qw/i1 i2/};
+	foreach (@_) {
+		my $v = $_;		# copy, from_to() recodes its argument in place
+		next unless (defined($v) && $v !~ /^\s*$/);
+		from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
+		push @{ $marc_record }, [ $f, $i1, $i2, $sf => $v ];
+	}
 }
 
-=head2 clean_ds
+=head2 marc_repeatable_subfield
 
-Clean data structure hash for next record
+Save values for MARC repeatable subfield
 
-  clean_ds();
+  marc_repeatable_subfield('910', 'z', rec('909') );
 
 =cut
 
-sub clean_ds {
-	$out = undef;
+sub marc_repeatable_subfield {
+	defined($_[1]) or die "marc_repeatable_subfield need subfield!\n";
+	$marc_repeatable_subfield->{ $_[1] } += 1;	# flag consulted by _get_marc_fields
+	return marc(@_);
 }
 
-=head2 set_lookup
+=head2 marc_indicators
 
-Set current lookup hash
+Set both indicators for MARC field
 
-  set_lookup( $lookup );
+  marc_indicators('900', ' ', 1);
 
-=cut
+Any indicator value other than C<0-9> will be treated as undefined.
 
-my $lookup;
+=cut
 
-sub set_lookup {
-	$lookup = shift;
+sub marc_indicators {
+	# Usage: marc_indicators( field, i1, i2 ); marc() reads stored values
+	my $f = shift || die "marc_indicators need field!\n";
+	my ($i1,$i2) = @_;
+	die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
+	die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
+	# anything but one ASCII digit is blank; \z (not $) also rejects "1\n"
+	foreach my $i ($i1, $i2) { $i = ' ' if ($i !~ /^[0-9]\z/); }
+	$marc_indicators->{$f}->{i1} = $i1;
+	$marc_indicators->{$f}->{i2} = $i2;
 }
 
+
+=head1 Functions to extract data from input
+
+These functions should be used inside the functions which create C<data_structure>
+described above.
+
 =head2 rec1
 
 Return all values in some field