lib/WebPAC/Normalize.pm

package WebPAC::Normalize;
use Exporter 'import';
@EXPORT = qw/
        _set_rec _set_lookup
        _get_ds _clean_ds

        tag search display
        marc marc_indicators marc_repeatable_subfield

        rec1 rec2 rec
        regex prefix suffix surround
        first lookup join_with
/;

use warnings;
use strict;

#use base qw/WebPAC::Common/;
use Data::Dumper;
use Encode qw/from_to/;

=head1 NAME

WebPAC::Normalize - describe normalisaton rules using sets

=head1 VERSION

Version 0.06

=cut

our $VERSION = '0.06';

=head1 SYNOPSIS

This module uses C<conf/normalize/*.pl> files to perform normalisation
from input records using perl functions which are specialized for set
processing.

Sets are implemented as arrays, and normalisation file is valid perl, which
means that you check it's validity before running WebPAC using
C<perl -c normalize.pl>.

Normalisation can generate multiple output normalized data. For now, supported output
types (on the left side of definition) are: C<tag>, C<display>, C<search> and
C<marc>.

=head1 FUNCTIONS

Functions which start with C<_> are private and used by WebPAC internally.
All other functions are available for use within normalisation rules.

=head2 data_structure

Return data structure

  my $ds = WebPAC::Normalize::data_structure(
        lookup => $lookup->lookup_hash,
        row => $row,
        rules => $normalize_pl_config,
        marc_encoding => 'utf-8',
  );

Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
other are optional.

This function will B<die> if normalizastion can't be evaled.

Since this function isn't exported you have to call it with 
C<WebPAC::Normalize::data_structure>.

=cut

sub data_structure {
        my $arg = {@_};

        die "need row argument" unless ($arg->{row});
        die "need normalisation argument" unless ($arg->{rules});

        no strict 'subs';
        _set_lookup( $arg->{lookup} );
        _set_rec( $arg->{row} );
        _clean_ds( %{ $arg } );
        eval "$arg->{rules}";
        die "error evaling $arg->{rules}: $@\n" if ($@);

        return _get_ds();
}

=head2 _set_rec

Set current record hash

  _set_rec( $rec );

=cut

my $rec;

sub _set_rec {
        $rec = shift or die "no record hash";
}

=head2 _get_ds

Return hash formatted as data structure

  my $ds = _get_ds();

=cut

my ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);

sub _get_ds {
        return $out;
}

=head2 _clean_ds

Clean data structure hash for next record

  _clean_ds();

=cut

sub _clean_ds {
        my $a = {@_};
        ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = (undef);
        $marc_encoding = $a->{marc_encoding};
}

=head2 _set_lookup

Set current lookup hash

  _set_lookup( $lookup );

=cut

my $lookup;

sub _set_lookup {
        $lookup = shift;
}

=head2 _get_marc_fields

Get all fields defined by calls to C<marc>

        $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );


We are using I<magic> which detect repeatable fields only from
sequence of field/subfield data generated by normalization.

Repeatable field is created if there is second occurence of same subfield or
if any of indicators are different. This is sane for most cases except for
non-repeatable fields with repeatable subfields.

You can change behaviour of that using C<marc_repeatable_subfield>.

=cut

sub _get_marc_fields {
        my @m;
        my $last;
        foreach my $row (@{ $marc_record }) {
                if ($last &&
                        $last->[0] eq $row->[0] &&              # check if field is same
                        $last->[1] eq $row->[1] &&              # check for i1
                        $last->[2] eq $row->[2] &&              # and for i2
                                ( $last->[3] ne $row->[3] ||                            # and subfield is different
                                $last->[3] eq $row->[3] &&                                      # or subfield is same,
                                $marc_repeatable_subfield->{ $row->[3] }        # but is repeatable
                        )
                ) {
                        push @$last, ( $row->[3] , $row->[4] );
                        next;
                } elsif ($last) {
                        push @m, $last;
                }

                $last = $row;
        }

        push @m, $last if ($last);

        return @m;
}

=head1 Functions to create C<data_structure>

Those functions generally have to first in your normalization file.

=head2 tag

Define new tag for I<search> and I<display>.

  tag('Title', rec('200','a') );


=cut

sub tag {
        my $name = shift or die "tag needs name as first argument";
        my @o = grep { defined($_) && $_ ne '' } @_;
        return unless (@o);
        $out->{$name}->{tag} = $name;
        $out->{$name}->{search} = \@o;
        $out->{$name}->{display} = \@o;
}

=head2 display

Define tag just for I<display>

  @v = display('Title', rec('200','a') );

=cut

sub display {
        my $name = shift or die "display needs name as first argument";
        my @o = grep { defined($_) && $_ ne '' } @_;
        return unless (@o);
        $out->{$name}->{tag} = $name;
        $out->{$name}->{display} = \@o;
}

=head2 search

Prepare values just for I<search>

  @v = search('Title', rec('200','a') );

=cut

sub search {
        my $name = shift or die "search needs name as first argument";
        my @o = grep { defined($_) && $_ ne '' } @_;
        return unless (@o);
        $out->{$name}->{tag} = $name;
        $out->{$name}->{search} = \@o;
}

=head2 marc

Save value for MARC field

  marc('900','a', rec('200','a') );

=cut

sub marc {
        my $f = shift or die "marc needs field";
        die "marc field must be numer" unless ($f =~ /^\d+$/);

        my $sf = shift or die "marc needs subfield";

        foreach (@_) {
                my $v = $_;             # make var read-write for Encode
                next unless (defined($v) && $v !~ /^\s*$/);
                from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
                my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
                push @{ $marc_record }, [ $f, $i1, $i2, $sf => $v ];
        }
}

=head2 marc_repeatable_subfield

Save values for MARC repetable subfield

  marc_repeatable_subfield('910', 'z', rec('909') );

=cut

sub marc_repeatable_subfield {
        die "marc_repeatable_subfield need subfield!\n" unless (defined($_[1]));
        $marc_repeatable_subfield->{ $_[1] }++;
        marc(@_);
}

=head2 marc_indicators

Set both indicators for MARC field

  marc_indicators('900', ' ', 1);

Any indicator value other than C<0-9> will be treated as undefined.

=cut

sub marc_indicators {
        my $f = shift || die "marc_indicators need field!\n";
        my ($i1,$i2) = @_;
        die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
        die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));

        $i1 = ' ' if ($i1 !~ /^\d$/);
        $i2 = ' ' if ($i2 !~ /^\d$/);
        @{ $marc_indicators->{$f} } = ($i1,$i2);
}


=head1 Functions to extract data from input

This function should be used inside functions to create C<data_structure> described
above.

=head2 rec1

Return all values in some field

  @v = rec1('200')

TODO: order of values is probably same as in source data, need to investigate that

=cut

sub rec1 {
        my $f = shift;
        return unless (defined($rec) && defined($rec->{$f}));
        if (ref($rec->{$f}) eq 'ARRAY') {
                return map { 
                        if (ref($_) eq 'HASH') {
                                values %{$_};
                        } else {
                                $_;
                        }
                } @{ $rec->{$f} };
        } elsif( defined($rec->{$f}) ) {
                return $rec->{$f};
        }
}

=head2 rec2

Return all values in specific field and subfield

  @v = rec2('200','a')

=cut

sub rec2 {
        my $f = shift;
        return unless (defined($rec && $rec->{$f}));
        my $sf = shift;
        return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
}

=head2 rec

syntaxtic sugar for

  @v = rec('200')
  @v = rec('200','a')

=cut

sub rec {
        if ($#_ == 0) {
                return rec1(@_);
        } elsif ($#_ == 1) {
                return rec2(@_);
        }
}

=head2 regex

Apply regex to some or all values

  @v = regex( 's/foo/bar/g', @v );

=cut

sub regex {
        my $r = shift;
        my @out;
        #warn "r: $r\n",Dumper(\@_);
        foreach my $t (@_) {
                next unless ($t);
                eval "\$t =~ $r";
                push @out, $t if ($t && $t ne '');
        }
        return @out;
}

=head2 prefix

Prefix all values with a string

  @v = prefix( 'my_', @v );

=cut

sub prefix {
        my $p = shift or die "prefix needs string as first argument";
        return map { $p . $_ } grep { defined($_) } @_;
}

=head2 suffix

suffix all values with a string

  @v = suffix( '_my', @v );

=cut

sub suffix {
        my $s = shift or die "suffix needs string as first argument";
        return map { $_ . $s } grep { defined($_) } @_;
}

=head2 surround

surround all values with a two strings

  @v = surround( 'prefix_', '_suffix', @v );

=cut

sub surround {
        my $p = shift or die "surround need prefix as first argument";
        my $s = shift or die "surround needs suffix as second argument";
        return map { $p . $_ . $s } grep { defined($_) } @_;
}

=head2 first

Return first element

  $v = first( @v );

=cut

sub first {
        my $r = shift;
        return $r;
}

=head2 lookup

Consult lookup hashes for some value

  @v = lookup( $v );
  @v = lookup( @v );

=cut

sub lookup {
        my $k = shift or return;
        return unless (defined($lookup->{$k}));
        if (ref($lookup->{$k}) eq 'ARRAY') {
                return @{ $lookup->{$k} };
        } else {
                return $lookup->{$k};
        }
}

=head2 join_with

Joins walues with some delimiter

  $v = join_with(", ", @v);

=cut

sub join_with {
        my $d = shift;
        return join($d, grep { defined($_) && $_ ne '' } @_);
}

# END
1;
1	package WebPAC::Normalize;
2	use Exporter 'import';
3	@EXPORT = qw/
4	_set_rec _set_lookup
5	_get_ds _clean_ds
6
7	tag search display
8	marc marc_indicators marc_repeatable_subfield
9
10	rec1 rec2 rec
11	regex prefix suffix surround
12	first lookup join_with
13	/;
14
15	use warnings;
16	use strict;
17
18	#use base qw/WebPAC::Common/;
19	use Data::Dumper;
20	use Encode qw/from_to/;
21
22	=head1 NAME
23
24	WebPAC::Normalize - describe normalisaton rules using sets
25
26	=head1 VERSION
27
28	Version 0.06
29
30	=cut
31
32	our $VERSION = '0.06';
33
34	=head1 SYNOPSIS
35
36	This module uses C<conf/normalize/*.pl> files to perform normalisation
37	from input records using perl functions which are specialized for set
38	processing.
39
40	Sets are implemented as arrays, and normalisation file is valid perl, which
41	means that you check it's validity before running WebPAC using
42	C<perl -c normalize.pl>.
43
44	Normalisation can generate multiple output normalized data. For now, supported output
45	types (on the left side of definition) are: C<tag>, C<display>, C<search> and
46	C<marc>.
47
48	=head1 FUNCTIONS
49
50	Functions which start with C<_> are private and used by WebPAC internally.
51	All other functions are available for use within normalisation rules.
52
53	=head2 data_structure
54
55	Return data structure
56
57	my $ds = WebPAC::Normalize::data_structure(
58	lookup => $lookup->lookup_hash,
59	row => $row,
60	rules => $normalize_pl_config,
61	marc_encoding => 'utf-8',
62	);
63
64	Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
65	other are optional.
66
67	This function will B<die> if normalizastion can't be evaled.
68
69	Since this function isn't exported you have to call it with
70	C<WebPAC::Normalize::data_structure>.
71
72	=cut
73
74	sub data_structure {
75	my $arg = {@_};
76
77	die "need row argument" unless ($arg->{row});
78	die "need normalisation argument" unless ($arg->{rules});
79
80	no strict 'subs';
81	_set_lookup( $arg->{lookup} );
82	_set_rec( $arg->{row} );
83	_clean_ds( %{ $arg } );
84	eval "$arg->{rules}";
85	die "error evaling $arg->{rules}: $@\n" if ($@);
86
87	return _get_ds();
88	}
89
90	=head2 _set_rec
91
92	Set current record hash
93
94	_set_rec( $rec );
95
96	=cut
97
98	my $rec;
99
100	sub _set_rec {
101	$rec = shift or die "no record hash";
102	}
103
104	=head2 _get_ds
105
106	Return hash formatted as data structure
107
108	my $ds = _get_ds();
109
110	=cut
111
112	my ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
113
114	sub _get_ds {
115	return $out;
116	}
117
118	=head2 _clean_ds
119
120	Clean data structure hash for next record
121
122	_clean_ds();
123
124	=cut
125
126	sub _clean_ds {
127	my $a = {@_};
128	($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = (undef);
129	$marc_encoding = $a->{marc_encoding};
130	}
131
132	=head2 _set_lookup
133
134	Set current lookup hash
135
136	_set_lookup( $lookup );
137
138	=cut
139
140	my $lookup;
141
142	sub _set_lookup {
143	$lookup = shift;
144	}
145
146	=head2 _get_marc_fields
147
148	Get all fields defined by calls to C<marc>
149
150	$marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
151
152
153
154	We are using I<magic> which detect repeatable fields only from
155	sequence of field/subfield data generated by normalization.
156
157	Repeatable field is created if there is second occurence of same subfield or
158	if any of indicators are different. This is sane for most cases except for
159	non-repeatable fields with repeatable subfields.
160
161	You can change behaviour of that using C<marc_repeatable_subfield>.
162
163	=cut
164
165	sub _get_marc_fields {
166	my @m;
167	my $last;
168	foreach my $row (@{ $marc_record }) {
169	if ($last &&
170	$last->[0] eq $row->[0] && # check if field is same
171	$last->[1] eq $row->[1] && # check for i1
172	$last->[2] eq $row->[2] && # and for i2
173	( $last->[3] ne $row->[3] \|\| # and subfield is different
174	$last->[3] eq $row->[3] && # or subfield is same,
175	$marc_repeatable_subfield->{ $row->[3] } # but is repeatable
176	)
177	) {
178	push @$last, ( $row->[3] , $row->[4] );
179	next;
180	} elsif ($last) {
181	push @m, $last;
182	}
183
184	$last = $row;
185	}
186
187	push @m, $last if ($last);
188
189	return @m;
190	}
191
192	=head1 Functions to create C<data_structure>
193
194	Those functions generally have to first in your normalization file.
195
196	=head2 tag
197
198	Define new tag for I<search> and I<display>.
199
200	tag('Title', rec('200','a') );
201
202
203	=cut
204
205	sub tag {
206	my $name = shift or die "tag needs name as first argument";
207	my @o = grep { defined($_) && $_ ne '' } @_;
208	return unless (@o);
209	$out->{$name}->{tag} = $name;
210	$out->{$name}->{search} = \@o;
211	$out->{$name}->{display} = \@o;
212	}
213
214	=head2 display
215
216	Define tag just for I<display>
217
218	@v = display('Title', rec('200','a') );
219
220	=cut
221
222	sub display {
223	my $name = shift or die "display needs name as first argument";
224	my @o = grep { defined($_) && $_ ne '' } @_;
225	return unless (@o);
226	$out->{$name}->{tag} = $name;
227	$out->{$name}->{display} = \@o;
228	}
229
230	=head2 search
231
232	Prepare values just for I<search>
233
234	@v = search('Title', rec('200','a') );
235
236	=cut
237
238	sub search {
239	my $name = shift or die "search needs name as first argument";
240	my @o = grep { defined($_) && $_ ne '' } @_;
241	return unless (@o);
242	$out->{$name}->{tag} = $name;
243	$out->{$name}->{search} = \@o;
244	}
245
246	=head2 marc
247
248	Save value for MARC field
249
250	marc('900','a', rec('200','a') );
251
252	=cut
253
254	sub marc {
255	my $f = shift or die "marc needs field";
256	die "marc field must be numer" unless ($f =~ /^\d+$/);
257
258	my $sf = shift or die "marc needs subfield";
259
260	foreach (@_) {
261	my $v = $_; # make var read-write for Encode
262	next unless (defined($v) && $v !~ /^\s*$/);
263	from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
264	my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
265	push @{ $marc_record }, [ $f, $i1, $i2, $sf => $v ];
266	}
267	}
268
269	=head2 marc_repeatable_subfield
270
271	Save values for MARC repetable subfield
272
273	marc_repeatable_subfield('910', 'z', rec('909') );
274
275	=cut
276
277	sub marc_repeatable_subfield {
278	die "marc_repeatable_subfield need subfield!\n" unless (defined($_[1]));
279	$marc_repeatable_subfield->{ $_[1] }++;
280	marc(@_);
281	}
282
283	=head2 marc_indicators
284
285	Set both indicators for MARC field
286
287	marc_indicators('900', ' ', 1);
288
289	Any indicator value other than C<0-9> will be treated as undefined.
290
291	=cut
292
293	sub marc_indicators {
294	my $f = shift \|\| die "marc_indicators need field!\n";
295	my ($i1,$i2) = @_;
296	die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
297	die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
298
299	$i1 = ' ' if ($i1 !~ /^\d$/);
300	$i2 = ' ' if ($i2 !~ /^\d$/);
301	@{ $marc_indicators->{$f} } = ($i1,$i2);
302	}
303
304
305	=head1 Functions to extract data from input
306
307	This function should be used inside functions to create C<data_structure> described
308	above.
309
310	=head2 rec1
311
312	Return all values in some field
313
314	@v = rec1('200')
315
316	TODO: order of values is probably same as in source data, need to investigate that
317
318	=cut
319
320	sub rec1 {
321	my $f = shift;
322	return unless (defined($rec) && defined($rec->{$f}));
323	if (ref($rec->{$f}) eq 'ARRAY') {
324	return map {
325	if (ref($_) eq 'HASH') {
326	values %{$_};
327	} else {
328	$_;
329	}
330	} @{ $rec->{$f} };
331	} elsif( defined($rec->{$f}) ) {
332	return $rec->{$f};
333	}
334	}
335
336	=head2 rec2
337
338	Return all values in specific field and subfield
339
340	@v = rec2('200','a')
341
342	=cut
343
344	sub rec2 {
345	my $f = shift;
346	return unless (defined($rec && $rec->{$f}));
347	my $sf = shift;
348	return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
349	}
350
351	=head2 rec
352
353	syntaxtic sugar for
354
355	@v = rec('200')
356	@v = rec('200','a')
357
358	=cut
359
360	sub rec {
361	if ($#_ == 0) {
362	return rec1(@_);
363	} elsif ($#_ == 1) {
364	return rec2(@_);
365	}
366	}
367
368	=head2 regex
369
370	Apply regex to some or all values
371
372	@v = regex( 's/foo/bar/g', @v );
373
374	=cut
375
376	sub regex {
377	my $r = shift;
378	my @out;
379	#warn "r: $r\n",Dumper(\@_);
380	foreach my $t (@_) {
381	next unless ($t);
382	eval "\$t =~ $r";
383	push @out, $t if ($t && $t ne '');
384	}
385	return @out;
386	}
387
388	=head2 prefix
389
390	Prefix all values with a string
391
392	@v = prefix( 'my_', @v );
393
394	=cut
395
396	sub prefix {
397	my $p = shift or die "prefix needs string as first argument";
398	return map { $p . $_ } grep { defined($_) } @_;
399	}
400
401	=head2 suffix
402
403	suffix all values with a string
404
405	@v = suffix( '_my', @v );
406
407	=cut
408
409	sub suffix {
410	my $s = shift or die "suffix needs string as first argument";
411	return map { $_ . $s } grep { defined($_) } @_;
412	}
413
414	=head2 surround
415
416	surround all values with a two strings
417
418	@v = surround( 'prefix_', '_suffix', @v );
419
420	=cut
421
422	sub surround {
423	my $p = shift or die "surround need prefix as first argument";
424	my $s = shift or die "surround needs suffix as second argument";
425	return map { $p . $_ . $s } grep { defined($_) } @_;
426	}
427
428	=head2 first
429
430	Return first element
431
432	$v = first( @v );
433
434	=cut
435
436	sub first {
437	my $r = shift;
438	return $r;
439	}
440
441	=head2 lookup
442
443	Consult lookup hashes for some value
444
445	@v = lookup( $v );
446	@v = lookup( @v );
447
448	=cut
449
450	sub lookup {
451	my $k = shift or return;
452	return unless (defined($lookup->{$k}));
453	if (ref($lookup->{$k}) eq 'ARRAY') {
454	return @{ $lookup->{$k} };
455	} else {
456	return $lookup->{$k};
457	}
458	}
459
460	=head2 join_with
461
462	Joins walues with some delimiter
463
464	$v = join_with(", ", @v);
465
466	=cut
467
468	sub join_with {
469	my $d = shift;
470	return join($d, grep { defined($_) && $_ ne '' } @_);
471	}
472
473	# END
474	1;