lib/WebPAC/Normalize.pm

package WebPAC::Normalize;
use Exporter 'import';
@EXPORT = qw/
        _set_rec _set_lookup
        _get_ds _clean_ds

        tag search display
        marc21

        rec1 rec2 rec
        regex prefix suffix surround
        first lookup join_with
/;

use warnings;
use strict;

#use base qw/WebPAC::Common/;
use Data::Dumper;
use Encode qw/from_to/;

=head1 NAME

WebPAC::Normalize - describe normalisaton rules using sets

=head1 VERSION

Version 0.06

=cut

our $VERSION = '0.06';

=head1 SYNOPSIS

This module uses C<conf/normalize/*.pl> files to perform normalisation
from input records using perl functions which are specialized for set
processing.

Sets are implemented as arrays, and normalisation file is valid perl, which
means that you check it's validity before running WebPAC using
C<perl -c normalize.pl>.

Normalisation can generate multiple output normalized data. For now, supported output
types (on the left side of definition) are: C<tag>, C<display>, C<search> and
C<marc21>.

=head1 FUNCTIONS

Functions which start with C<_> are private and used by WebPAC internally.
All other functions are available for use within normalisation rules.

=head2 data_structure

Return data structure

  my $ds = WebPAC::Normalize::data_structure(
        lookup => $lookup->lookup_hash,
        row => $row,
        rules => $normalize_pl_config,
        marc_encoding => 'utf-8',
  );

Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
other are optional.

This function will B<die> if normalizastion can't be evaled.

Since this function isn't exported you have to call it with 
C<WebPAC::Normalize::data_structure>.

=cut

sub data_structure {
        my $arg = {@_};

        die "need row argument" unless ($arg->{row});
        die "need normalisation argument" unless ($arg->{rules});

        no strict 'subs';
        _set_lookup( $arg->{lookup} );
        _set_rec( $arg->{row} );
        _clean_ds( %{ $arg } );
        eval "$arg->{rules}";
        die "error evaling $arg->{rules}: $@\n" if ($@);

        return _get_ds();
}

=head2 _set_rec

Set current record hash

  _set_rec( $rec );

=cut

my $rec;

sub _set_rec {
        $rec = shift or die "no record hash";
}

=head2 _get_ds

Return hash formatted as data structure

  my $ds = _get_ds();

=cut

my $out;
my $marc21;
my $marc_encoding;

sub _get_ds {
        return $out;
}

=head2 _clean_ds

Clean data structure hash for next record

  _clean_ds();

=cut

sub _clean_ds {
        my $a = {@_};
        $out = undef;
        $marc21 = undef;
        $marc_encoding = $a->{marc_encoding};
}

=head2 _set_lookup

Set current lookup hash

  _set_lookup( $lookup );

=cut

my $lookup;

sub _set_lookup {
        $lookup = shift;
}

=head2 _get_marc21_fields

Get all fields defined by calls to C<marc21>

        $marc->add_fields( WebPAC::Normalize:_get_marc21_fields() );


We are using I<magic> which detect repeatable fields only from
sequence of field/subfield data generated by normalization.

Repeatable field is created if there is second occurence of same subfield or
if any of indicators are different. This is sane for most cases except for
non-repeatable fields with repeatable subfields.

B<TODO>: implement exceptions to magic

=cut

sub _get_marc21_fields {
        my @m;
        my $last;
        foreach my $row (@{ $marc21 }) {
                if ($last &&
                        $last->[0] eq $row->[0] &&              # check if field is same
                        $last->[1] eq $row->[1] &&              # check for i1
                        $last->[2] eq $row->[2] &&              # and for i2
                        $last->[3] ne $row->[3]                 # and subfield is different
                ) {
                        push @$last, ( $row->[3] , $row->[4] );
                        warn "## ++ added $row->[0] ^$row->[3] to $last->[0]\n";
                        next;
                } elsif ($last) {
                        push @m, $last;
                }

                $last = $row;
        }

        push @m, $last if ($last);

        return @m;
}

=head1 Functions to create C<data_structure>

Those functions generally have to first in your normalization file.

=head2 tag

Define new tag for I<search> and I<display>.

  tag('Title', rec('200','a') );


=cut

sub tag {
        my $name = shift or die "tag needs name as first argument";
        my @o = grep { defined($_) && $_ ne '' } @_;
        return unless (@o);
        $out->{$name}->{tag} = $name;
        $out->{$name}->{search} = \@o;
        $out->{$name}->{display} = \@o;
}

=head2 display

Define tag just for I<display>

  @v = display('Title', rec('200','a') );

=cut

sub display {
        my $name = shift or die "display needs name as first argument";
        my @o = grep { defined($_) && $_ ne '' } @_;
        return unless (@o);
        $out->{$name}->{tag} = $name;
        $out->{$name}->{display} = \@o;
}

=head2 search

Prepare values just for I<search>

  @v = search('Title', rec('200','a') );

=cut

sub search {
        my $name = shift or die "search needs name as first argument";
        my @o = grep { defined($_) && $_ ne '' } @_;
        return unless (@o);
        $out->{$name}->{tag} = $name;
        $out->{$name}->{search} = \@o;
}

=head2 marc21

Save value for MARC field

  marc21('900','a', rec('200','a') );

=cut

sub marc21 {
        my $f = shift or die "marc21 needs field";
        die "marc21 field must be numer" unless ($f =~ /^\d+$/);

        my $sf = shift or die "marc21 needs subfield";

        foreach (@_) {
                my $v = $_;             # make var read-write for Encode
                next unless (defined($v) && $v !~ /^\s*$/);
                from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
                push @{ $marc21 }, [ $f, ' ', ' ', $sf => $v ];
        }
}

=head1 Functions to extract data from input

This function should be used inside functions to create C<data_structure> described
above.

=head2 rec1

Return all values in some field

  @v = rec1('200')

TODO: order of values is probably same as in source data, need to investigate that

=cut

sub rec1 {
        my $f = shift;
        return unless (defined($rec) && defined($rec->{$f}));
        if (ref($rec->{$f}) eq 'ARRAY') {
                return map { 
                        if (ref($_) eq 'HASH') {
                                values %{$_};
                        } else {
                                $_;
                        }
                } @{ $rec->{$f} };
        } elsif( defined($rec->{$f}) ) {
                return $rec->{$f};
        }
}

=head2 rec2

Return all values in specific field and subfield

  @v = rec2('200','a')

=cut

sub rec2 {
        my $f = shift;
        return unless (defined($rec && $rec->{$f}));
        my $sf = shift;
        return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
}

=head2 rec

syntaxtic sugar for

  @v = rec('200')
  @v = rec('200','a')

=cut

sub rec {
        if ($#_ == 0) {
                return rec1(@_);
        } elsif ($#_ == 1) {
                return rec2(@_);
        }
}

=head2 regex

Apply regex to some or all values

  @v = regex( 's/foo/bar/g', @v );

=cut

sub regex {
        my $r = shift;
        my @out;
        #warn "r: $r\n",Dumper(\@_);
        foreach my $t (@_) {
                next unless ($t);
                eval "\$t =~ $r";
                push @out, $t if ($t && $t ne '');
        }
        return @out;
}

=head2 prefix

Prefix all values with a string

  @v = prefix( 'my_', @v );

=cut

sub prefix {
        my $p = shift or die "prefix needs string as first argument";
        return map { $p . $_ } grep { defined($_) } @_;
}

=head2 suffix

suffix all values with a string

  @v = suffix( '_my', @v );

=cut

sub suffix {
        my $s = shift or die "suffix needs string as first argument";
        return map { $_ . $s } grep { defined($_) } @_;
}

=head2 surround

surround all values with a two strings

  @v = surround( 'prefix_', '_suffix', @v );

=cut

sub surround {
        my $p = shift or die "surround need prefix as first argument";
        my $s = shift or die "surround needs suffix as second argument";
        return map { $p . $_ . $s } grep { defined($_) } @_;
}

=head2 first

Return first element

  $v = first( @v );

=cut

sub first {
        my $r = shift;
        return $r;
}

=head2 lookup

Consult lookup hashes for some value

  @v = lookup( $v );
  @v = lookup( @v );

=cut

sub lookup {
        my $k = shift or return;
        return unless (defined($lookup->{$k}));
        if (ref($lookup->{$k}) eq 'ARRAY') {
                return @{ $lookup->{$k} };
        } else {
                return $lookup->{$k};
        }
}

=head2 join_with

Joins walues with some delimiter

  $v = join_with(", ", @v);

=cut

sub join_with {
        my $d = shift;
        return join($d, grep { defined($_) && $_ ne '' } @_);
}

# END
1;
1	dpavlin	10	package WebPAC::Normalize;
2	dpavlin	536	use Exporter 'import';
3			@EXPORT = qw/
4	dpavlin	538	_set_rec _set_lookup
5			_get_ds _clean_ds
6
7	dpavlin	536	tag search display
8	dpavlin	540	marc21
9
10	dpavlin	536	rec1 rec2 rec
11			regex prefix suffix surround
12			first lookup join_with
13			/;
14	dpavlin	10
15			use warnings;
16			use strict;
17	dpavlin	536
18			#use base qw/WebPAC::Common/;
19	dpavlin	13	use Data::Dumper;
20	dpavlin	541	use Encode qw/from_to/;
21	dpavlin	10
22			=head1 NAME
23
24	dpavlin	536	WebPAC::Normalize - describe normalisaton rules using sets
25	dpavlin	10
26			=head1 VERSION
27
28	dpavlin	541	Version 0.06
29	dpavlin	10
30			=cut
31
32	dpavlin	541	our $VERSION = '0.06';
33	dpavlin	10
34			=head1 SYNOPSIS
35
36	dpavlin	536	This module uses C<conf/normalize/*.pl> files to perform normalisation
37			from input records using perl functions which are specialized for set
38			processing.
39	dpavlin	10
40	dpavlin	536	Sets are implemented as arrays, and normalisation file is valid perl, which
41			means that you check it's validity before running WebPAC using
42			C<perl -c normalize.pl>.
43	dpavlin	15
44	dpavlin	536	Normalisation can generate multiple output normalized data. For now, supported output
45	dpavlin	540	types (on the left side of definition) are: C<tag>, C<display>, C<search> and
46			C<marc21>.
47	dpavlin	15
48	dpavlin	10	=head1 FUNCTIONS
49
50	dpavlin	538	Functions which start with C<_> are private and used by WebPAC internally.
51			All other functions are available for use within normalisation rules.
52
53	dpavlin	536	=head2 data_structure
54	dpavlin	10
55	dpavlin	536	Return data structure
56	dpavlin	13
57	dpavlin	538	my $ds = WebPAC::Normalize::data_structure(
58	dpavlin	536	lookup => $lookup->lookup_hash,
59			row => $row,
60			rules => $normalize_pl_config,
61	dpavlin	541	marc_encoding => 'utf-8',
62	dpavlin	13	);
63
64	dpavlin	540	Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
65			other are optional.
66
67	dpavlin	536	This function will B<die> if normalizastion can't be evaled.
68	dpavlin	15
69	dpavlin	538	Since this function isn't exported you have to call it with
70			C<WebPAC::Normalize::data_structure>.
71
72	dpavlin	10	=cut
73
74	dpavlin	536	sub data_structure {
75			my $arg = {@_};
76	dpavlin	13
77	dpavlin	536	die "need row argument" unless ($arg->{row});
78			die "need normalisation argument" unless ($arg->{rules});
79	dpavlin	31
80	dpavlin	536	no strict 'subs';
81	dpavlin	538	_set_lookup( $arg->{lookup} );
82			_set_rec( $arg->{row} );
83	dpavlin	541	_clean_ds( %{ $arg } );
84	dpavlin	536	eval "$arg->{rules}";
85			die "error evaling $arg->{rules}: $@\n" if ($@);
86	dpavlin	540
87	dpavlin	538	return _get_ds();
88	dpavlin	10	}
89
90	dpavlin	538	=head2 _set_rec
91	dpavlin	13
92	dpavlin	536	Set current record hash
93	dpavlin	433
94	dpavlin	538	_set_rec( $rec );
95	dpavlin	433
96			=cut
97
98	dpavlin	536	my $rec;
99	dpavlin	433
100	dpavlin	538	sub _set_rec {
101	dpavlin	536	$rec = shift or die "no record hash";
102	dpavlin	433	}
103
104	dpavlin	538	=head2 _get_ds
105
106			Return hash formatted as data structure
107
108			my $ds = _get_ds();
109
110			=cut
111
112			my $out;
113	dpavlin	540	my $marc21;
114	dpavlin	541	my $marc_encoding;
115	dpavlin	538
116			sub _get_ds {
117			return $out;
118			}
119
120			=head2 _clean_ds
121
122			Clean data structure hash for next record
123
124			_clean_ds();
125
126			=cut
127
128			sub _clean_ds {
129	dpavlin	541	my $a = {@_};
130	dpavlin	538	$out = undef;
131	dpavlin	540	$marc21 = undef;
132	dpavlin	541	$marc_encoding = $a->{marc_encoding};
133	dpavlin	538	}
134
135			=head2 _set_lookup
136
137			Set current lookup hash
138
139			_set_lookup( $lookup );
140
141			=cut
142
143			my $lookup;
144
145			sub _set_lookup {
146			$lookup = shift;
147			}
148
149	dpavlin	540	=head2 _get_marc21_fields
150
151			Get all fields defined by calls to C<marc21>
152
153			$marc->add_fields( WebPAC::Normalize:_get_marc21_fields() );
154
155	dpavlin	542
156	dpavlin	543
157			We are using I<magic> which detect repeatable fields only from
158			sequence of field/subfield data generated by normalization.
159
160			Repeatable field is created if there is second occurence of same subfield or
161			if any of indicators are different. This is sane for most cases except for
162			non-repeatable fields with repeatable subfields.
163
164			B<TODO>: implement exceptions to magic
165
166	dpavlin	540	=cut
167
168			sub _get_marc21_fields {
169	dpavlin	542	my @m;
170			my $last;
171			foreach my $row (@{ $marc21 }) {
172	dpavlin	543	if ($last &&
173			$last->[0] eq $row->[0] && # check if field is same
174			$last->[1] eq $row->[1] && # check for i1
175	dpavlin	544	$last->[2] eq $row->[2] && # and for i2
176			$last->[3] ne $row->[3] # and subfield is different
177	dpavlin	543	) {
178			push @$last, ( $row->[3] , $row->[4] );
179			warn "## ++ added $row->[0] ^$row->[3] to $last->[0]\n";
180	dpavlin	542	next;
181			} elsif ($last) {
182			push @m, $last;
183			}
184
185	dpavlin	543	$last = $row;
186	dpavlin	542	}
187
188			push @m, $last if ($last);
189
190			return @m;
191	dpavlin	540	}
192
193			=head1 Functions to create C<data_structure>
194
195			Those functions generally have to first in your normalization file.
196
197	dpavlin	536	=head2 tag
198	dpavlin	433
199	dpavlin	536	Define new tag for I<search> and I<display>.
200	dpavlin	433
201	dpavlin	536	tag('Title', rec('200','a') );
202	dpavlin	13
203
204			=cut
205
206	dpavlin	536	sub tag {
207			my $name = shift or die "tag needs name as first argument";
208			my @o = grep { defined($_) && $_ ne '' } @_;
209			return unless (@o);
210			$out->{$name}->{tag} = $name;
211			$out->{$name}->{search} = \@o;
212			$out->{$name}->{display} = \@o;
213			}
214	dpavlin	13
215	dpavlin	536	=head2 display
216	dpavlin	13
217	dpavlin	536	Define tag just for I<display>
218	dpavlin	125
219	dpavlin	536	@v = display('Title', rec('200','a') );
220	dpavlin	125
221	dpavlin	536	=cut
222	dpavlin	125
223	dpavlin	536	sub display {
224			my $name = shift or die "display needs name as first argument";
225			my @o = grep { defined($_) && $_ ne '' } @_;
226			return unless (@o);
227			$out->{$name}->{tag} = $name;
228			$out->{$name}->{display} = \@o;
229			}
230	dpavlin	13
231	dpavlin	536	=head2 search
232	dpavlin	13
233	dpavlin	536	Prepare values just for I<search>
234	dpavlin	13
235	dpavlin	536	@v = search('Title', rec('200','a') );
236	dpavlin	433
237	dpavlin	536	=cut
238	dpavlin	13
239	dpavlin	536	sub search {
240			my $name = shift or die "search needs name as first argument";
241			my @o = grep { defined($_) && $_ ne '' } @_;
242			return unless (@o);
243			$out->{$name}->{tag} = $name;
244			$out->{$name}->{search} = \@o;
245	dpavlin	13	}
246
247	dpavlin	540	=head2 marc21
248
249			Save value for MARC field
250
251			marc21('900','a', rec('200','a') );
252
253			=cut
254
255			sub marc21 {
256			my $f = shift or die "marc21 needs field";
257			die "marc21 field must be numer" unless ($f =~ /^\d+$/);
258
259			my $sf = shift or die "marc21 needs subfield";
260
261	dpavlin	541	foreach (@_) {
262			my $v = $_; # make var read-write for Encode
263	dpavlin	543	next unless (defined($v) && $v !~ /^\s*$/);
264	dpavlin	541	from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
265	dpavlin	540	push @{ $marc21 }, [ $f, ' ', ' ', $sf => $v ];
266			}
267			}
268
269			=head1 Functions to extract data from input
270
271			This function should be used inside functions to create C<data_structure> described
272			above.
273
274	dpavlin	536	=head2 rec1
275	dpavlin	371
276	dpavlin	536	Return all values in some field
277	dpavlin	371
278	dpavlin	536	@v = rec1('200')
279	dpavlin	15
280	dpavlin	536	TODO: order of values is probably same as in source data, need to investigate that
281	dpavlin	15
282	dpavlin	536	=cut
283	dpavlin	15
284	dpavlin	536	sub rec1 {
285			my $f = shift;
286			return unless (defined($rec) && defined($rec->{$f}));
287			if (ref($rec->{$f}) eq 'ARRAY') {
288			return map {
289			if (ref($_) eq 'HASH') {
290			values %{$_};
291	dpavlin	31	} else {
292	dpavlin	536	$_;
293	dpavlin	31	}
294	dpavlin	536	} @{ $rec->{$f} };
295			} elsif( defined($rec->{$f}) ) {
296			return $rec->{$f};
297	dpavlin	15	}
298			}
299
300	dpavlin	536	=head2 rec2
301	dpavlin	15
302	dpavlin	536	Return all values in specific field and subfield
303	dpavlin	13
304	dpavlin	536	@v = rec2('200','a')
305	dpavlin	13
306			=cut
307
308	dpavlin	536	sub rec2 {
309			my $f = shift;
310			return unless (defined($rec && $rec->{$f}));
311			my $sf = shift;
312			return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
313			}
314	dpavlin	13
315	dpavlin	536	=head2 rec
316	dpavlin	13
317	dpavlin	536	syntaxtic sugar for
318	dpavlin	13
319	dpavlin	536	@v = rec('200')
320			@v = rec('200','a')
321	dpavlin	13
322	dpavlin	536	=cut
323	dpavlin	373
324	dpavlin	536	sub rec {
325			if ($#_ == 0) {
326			return rec1(@_);
327			} elsif ($#_ == 1) {
328			return rec2(@_);
329	dpavlin	13	}
330			}
331
332	dpavlin	536	=head2 regex
333	dpavlin	15
334	dpavlin	536	Apply regex to some or all values
335	dpavlin	15
336	dpavlin	536	@v = regex( 's/foo/bar/g', @v );
337	dpavlin	15
338			=cut
339
340	dpavlin	536	sub regex {
341			my $r = shift;
342			my @out;
343			#warn "r: $r\n",Dumper(\@_);
344			foreach my $t (@_) {
345			next unless ($t);
346			eval "\$t =~ $r";
347			push @out, $t if ($t && $t ne '');
348	dpavlin	368	}
349	dpavlin	536	return @out;
350	dpavlin	15	}
351
352	dpavlin	536	=head2 prefix
353	dpavlin	15
354	dpavlin	536	Prefix all values with a string
355	dpavlin	15
356	dpavlin	536	@v = prefix( 'my_', @v );
357	dpavlin	15
358			=cut
359
360	dpavlin	536	sub prefix {
361			my $p = shift or die "prefix needs string as first argument";
362			return map { $p . $_ } grep { defined($_) } @_;
363			}
364	dpavlin	15
365	dpavlin	536	=head2 suffix
366	dpavlin	15
367	dpavlin	536	suffix all values with a string
368	dpavlin	15
369	dpavlin	536	@v = suffix( '_my', @v );
370	dpavlin	15
371	dpavlin	536	=cut
372	dpavlin	15
373	dpavlin	536	sub suffix {
374			my $s = shift or die "suffix needs string as first argument";
375			return map { $_ . $s } grep { defined($_) } @_;
376	dpavlin	15	}
377
378	dpavlin	536	=head2 surround
379	dpavlin	13
380	dpavlin	536	surround all values with a two strings
381	dpavlin	13
382	dpavlin	536	@v = surround( 'prefix_', '_suffix', @v );
383	dpavlin	13
384			=cut
385
386	dpavlin	536	sub surround {
387			my $p = shift or die "surround need prefix as first argument";
388			my $s = shift or die "surround needs suffix as second argument";
389			return map { $p . $_ . $s } grep { defined($_) } @_;
390	dpavlin	13	}
391
392	dpavlin	536	=head2 first
393	dpavlin	13
394	dpavlin	536	Return first element
395	dpavlin	15
396	dpavlin	536	$v = first( @v );
397	dpavlin	13
398			=cut
399
400	dpavlin	536	sub first {
401			my $r = shift;
402			return $r;
403	dpavlin	13	}
404
405	dpavlin	536	=head2 lookup
406	dpavlin	13
407	dpavlin	536	Consult lookup hashes for some value
408	dpavlin	13
409	dpavlin	536	@v = lookup( $v );
410			@v = lookup( @v );
411	dpavlin	13
412			=cut
413
414	dpavlin	536	sub lookup {
415			my $k = shift or return;
416			return unless (defined($lookup->{$k}));
417			if (ref($lookup->{$k}) eq 'ARRAY') {
418			return @{ $lookup->{$k} };
419			} else {
420			return $lookup->{$k};
421			}
422	dpavlin	13	}
423
424	dpavlin	536	=head2 join_with
425	dpavlin	13
426	dpavlin	536	Joins walues with some delimiter
427	dpavlin	10
428	dpavlin	536	$v = join_with(", ", @v);
429	dpavlin	10
430	dpavlin	536	=cut
431	dpavlin	10
432	dpavlin	536	sub join_with {
433			my $d = shift;
434			return join($d, grep { defined($_) && $_ ne '' } @_);
435			}
436	dpavlin	10
437	dpavlin	536	# END
438			1;