lib/WebPAC/Normalize.pm

package WebPAC::Normalize;
use Exporter 'import';
@EXPORT = qw/
        _set_rec _set_lookup
        _get_ds _clean_ds

        tag search display
        marc21

        rec1 rec2 rec
        regex prefix suffix surround
        first lookup join_with
/;

use warnings;
use strict;

#use base qw/WebPAC::Common/;
use Data::Dumper;
use Encode qw/from_to/;

=head1 NAME

WebPAC::Normalize - describe normalisaton rules using sets

=head1 VERSION

Version 0.06

=cut

our $VERSION = '0.06';

=head1 SYNOPSIS

This module uses C<conf/normalize/*.pl> files to perform normalisation
from input records using perl functions which are specialized for set
processing.

Sets are implemented as arrays, and normalisation file is valid perl, which
means that you check it's validity before running WebPAC using
C<perl -c normalize.pl>.

Normalisation can generate multiple output normalized data. For now, supported output
types (on the left side of definition) are: C<tag>, C<display>, C<search> and
C<marc21>.

=head1 FUNCTIONS

Functions which start with C<_> are private and used by WebPAC internally.
All other functions are available for use within normalisation rules.

=head2 data_structure

Return data structure

  my $ds = WebPAC::Normalize::data_structure(
        lookup => $lookup->lookup_hash,
        row => $row,
        rules => $normalize_pl_config,
        marc_encoding => 'utf-8',
  );

Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
other are optional.

This function will B<die> if normalizastion can't be evaled.

Since this function isn't exported you have to call it with 
C<WebPAC::Normalize::data_structure>.

=cut

sub data_structure {
        my $arg = {@_};

        die "need row argument" unless ($arg->{row});
        die "need normalisation argument" unless ($arg->{rules});

        no strict 'subs';
        _set_lookup( $arg->{lookup} );
        _set_rec( $arg->{row} );
        _clean_ds( %{ $arg } );
        eval "$arg->{rules}";
        die "error evaling $arg->{rules}: $@\n" if ($@);

        return _get_ds();
}

=head2 _set_rec

Set current record hash

  _set_rec( $rec );

=cut

my $rec;

sub _set_rec {
        $rec = shift or die "no record hash";
}

=head2 _get_ds

Return hash formatted as data structure

  my $ds = _get_ds();

=cut

my $out;
my $marc21;
my $marc_encoding;

sub _get_ds {
        return $out;
}

=head2 _clean_ds

Clean data structure hash for next record

  _clean_ds();

=cut

sub _clean_ds {
        my $a = {@_};
        $out = undef;
        $marc21 = undef;
        $marc_encoding = $a->{marc_encoding};
}

=head2 _set_lookup

Set current lookup hash

  _set_lookup( $lookup );

=cut

my $lookup;

sub _set_lookup {
        $lookup = shift;
}

=head2 _get_marc21_fields

Get all fields defined by calls to C<marc21>

        $marc->add_fields( WebPAC::Normalize:_get_marc21_fields() );

B<TODO>: implement exceptions to magic which unrolls repeated subfields
as new field with that subfield.

=cut

sub _get_marc21_fields {
        my @m;
        my $last;
        foreach my $row (@{ $marc21 }) {
                if ($last && (
                        $last->[0] eq $row->[0] ||              # check if field is same
                        $last->[1] eq $row->[1] ||              # check for i1
                        $last->[2] eq $row->[2]                 # and for i2
                ) ) {
                        $last->[3]->{ $row->[3] } = $row->[4];
                        warn "## ++ added $row->[0] ^$row->[3]\n";
                        next;
                } elsif ($last) {
                        push @m, $last;
                }

                push @m, $row;
        }

        push @m, $last if ($last);

        return @m;
}

=head1 Functions to create C<data_structure>

Those functions generally have to first in your normalization file.

=head2 tag

Define new tag for I<search> and I<display>.

  tag('Title', rec('200','a') );


=cut

sub tag {
        my $name = shift or die "tag needs name as first argument";
        my @o = grep { defined($_) && $_ ne '' } @_;
        return unless (@o);
        $out->{$name}->{tag} = $name;
        $out->{$name}->{search} = \@o;
        $out->{$name}->{display} = \@o;
}

=head2 display

Define tag just for I<display>

  @v = display('Title', rec('200','a') );

=cut

sub display {
        my $name = shift or die "display needs name as first argument";
        my @o = grep { defined($_) && $_ ne '' } @_;
        return unless (@o);
        $out->{$name}->{tag} = $name;
        $out->{$name}->{display} = \@o;
}

=head2 search

Prepare values just for I<search>

  @v = search('Title', rec('200','a') );

=cut

sub search {
        my $name = shift or die "search needs name as first argument";
        my @o = grep { defined($_) && $_ ne '' } @_;
        return unless (@o);
        $out->{$name}->{tag} = $name;
        $out->{$name}->{search} = \@o;
}

=head2 marc21

Save value for MARC field

  marc21('900','a', rec('200','a') );

=cut

sub marc21 {
        my $f = shift or die "marc21 needs field";
        die "marc21 field must be numer" unless ($f =~ /^\d+$/);

        my $sf = shift or die "marc21 needs subfield";

        foreach (@_) {
                my $v = $_;             # make var read-write for Encode
                next unless (defined($v) && $v !~ /^\s+$/);
                from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
                push @{ $marc21 }, [ $f, ' ', ' ', $sf => $v ];
        }
}

=head1 Functions to extract data from input

This function should be used inside functions to create C<data_structure> described
above.

=head2 rec1

Return all values in some field

  @v = rec1('200')

TODO: order of values is probably same as in source data, need to investigate that

=cut

sub rec1 {
        my $f = shift;
        return unless (defined($rec) && defined($rec->{$f}));
        if (ref($rec->{$f}) eq 'ARRAY') {
                return map { 
                        if (ref($_) eq 'HASH') {
                                values %{$_};
                        } else {
                                $_;
                        }
                } @{ $rec->{$f} };
        } elsif( defined($rec->{$f}) ) {
                return $rec->{$f};
        }
}

=head2 rec2

Return all values in specific field and subfield

  @v = rec2('200','a')

=cut

sub rec2 {
        my $f = shift;
        return unless (defined($rec && $rec->{$f}));
        my $sf = shift;
        return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
}

=head2 rec

syntaxtic sugar for

  @v = rec('200')
  @v = rec('200','a')

=cut

sub rec {
        if ($#_ == 0) {
                return rec1(@_);
        } elsif ($#_ == 1) {
                return rec2(@_);
        }
}

=head2 regex

Apply regex to some or all values

  @v = regex( 's/foo/bar/g', @v );

=cut

sub regex {
        my $r = shift;
        my @out;
        #warn "r: $r\n",Dumper(\@_);
        foreach my $t (@_) {
                next unless ($t);
                eval "\$t =~ $r";
                push @out, $t if ($t && $t ne '');
        }
        return @out;
}

=head2 prefix

Prefix all values with a string

  @v = prefix( 'my_', @v );

=cut

sub prefix {
        my $p = shift or die "prefix needs string as first argument";
        return map { $p . $_ } grep { defined($_) } @_;
}

=head2 suffix

suffix all values with a string

  @v = suffix( '_my', @v );

=cut

sub suffix {
        my $s = shift or die "suffix needs string as first argument";
        return map { $_ . $s } grep { defined($_) } @_;
}

=head2 surround

surround all values with a two strings

  @v = surround( 'prefix_', '_suffix', @v );

=cut

sub surround {
        my $p = shift or die "surround need prefix as first argument";
        my $s = shift or die "surround needs suffix as second argument";
        return map { $p . $_ . $s } grep { defined($_) } @_;
}

=head2 first

Return first element

  $v = first( @v );

=cut

sub first {
        my $r = shift;
        return $r;
}

=head2 lookup

Consult lookup hashes for some value

  @v = lookup( $v );
  @v = lookup( @v );

=cut

sub lookup {
        my $k = shift or return;
        return unless (defined($lookup->{$k}));
        if (ref($lookup->{$k}) eq 'ARRAY') {
                return @{ $lookup->{$k} };
        } else {
                return $lookup->{$k};
        }
}

=head2 join_with

Joins walues with some delimiter

  $v = join_with(", ", @v);

=cut

sub join_with {
        my $d = shift;
        return join($d, grep { defined($_) && $_ ne '' } @_);
}

# END
1;
1	package WebPAC::Normalize;
2	use Exporter 'import';
3	@EXPORT = qw/
4	_set_rec _set_lookup
5	_get_ds _clean_ds
6
7	tag search display
8	marc21
9
10	rec1 rec2 rec
11	regex prefix suffix surround
12	first lookup join_with
13	/;
14
15	use warnings;
16	use strict;
17
18	#use base qw/WebPAC::Common/;
19	use Data::Dumper;
20	use Encode qw/from_to/;
21
22	=head1 NAME
23
24	WebPAC::Normalize - describe normalisaton rules using sets
25
26	=head1 VERSION
27
28	Version 0.06
29
30	=cut
31
32	our $VERSION = '0.06';
33
34	=head1 SYNOPSIS
35
36	This module uses C<conf/normalize/*.pl> files to perform normalisation
37	from input records using perl functions which are specialized for set
38	processing.
39
40	Sets are implemented as arrays, and normalisation file is valid perl, which
41	means that you check it's validity before running WebPAC using
42	C<perl -c normalize.pl>.
43
44	Normalisation can generate multiple output normalized data. For now, supported output
45	types (on the left side of definition) are: C<tag>, C<display>, C<search> and
46	C<marc21>.
47
48	=head1 FUNCTIONS
49
50	Functions which start with C<_> are private and used by WebPAC internally.
51	All other functions are available for use within normalisation rules.
52
53	=head2 data_structure
54
55	Return data structure
56
57	my $ds = WebPAC::Normalize::data_structure(
58	lookup => $lookup->lookup_hash,
59	row => $row,
60	rules => $normalize_pl_config,
61	marc_encoding => 'utf-8',
62	);
63
64	Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
65	other are optional.
66
67	This function will B<die> if normalizastion can't be evaled.
68
69	Since this function isn't exported you have to call it with
70	C<WebPAC::Normalize::data_structure>.
71
72	=cut
73
74	sub data_structure {
75	my $arg = {@_};
76
77	die "need row argument" unless ($arg->{row});
78	die "need normalisation argument" unless ($arg->{rules});
79
80	no strict 'subs';
81	_set_lookup( $arg->{lookup} );
82	_set_rec( $arg->{row} );
83	_clean_ds( %{ $arg } );
84	eval "$arg->{rules}";
85	die "error evaling $arg->{rules}: $@\n" if ($@);
86
87	return _get_ds();
88	}
89
90	=head2 _set_rec
91
92	Set current record hash
93
94	_set_rec( $rec );
95
96	=cut
97
98	my $rec;
99
100	sub _set_rec {
101	$rec = shift or die "no record hash";
102	}
103
104	=head2 _get_ds
105
106	Return hash formatted as data structure
107
108	my $ds = _get_ds();
109
110	=cut
111
112	my $out;
113	my $marc21;
114	my $marc_encoding;
115
116	sub _get_ds {
117	return $out;
118	}
119
120	=head2 _clean_ds
121
122	Clean data structure hash for next record
123
124	_clean_ds();
125
126	=cut
127
128	sub _clean_ds {
129	my $a = {@_};
130	$out = undef;
131	$marc21 = undef;
132	$marc_encoding = $a->{marc_encoding};
133	}
134
135	=head2 _set_lookup
136
137	Set current lookup hash
138
139	_set_lookup( $lookup );
140
141	=cut
142
143	my $lookup;
144
145	sub _set_lookup {
146	$lookup = shift;
147	}
148
149	=head2 _get_marc21_fields
150
151	Get all fields defined by calls to C<marc21>
152
153	$marc->add_fields( WebPAC::Normalize:_get_marc21_fields() );
154
155	B<TODO>: implement exceptions to magic which unrolls repeated subfields
156	as new field with that subfield.
157
158	=cut
159
160	sub _get_marc21_fields {
161	my @m;
162	my $last;
163	foreach my $row (@{ $marc21 }) {
164	if ($last && (
165	$last->[0] eq $row->[0] \|\| # check if field is same
166	$last->[1] eq $row->[1] \|\| # check for i1
167	$last->[2] eq $row->[2] # and for i2
168	) ) {
169	$last->[3]->{ $row->[3] } = $row->[4];
170	warn "## ++ added $row->[0] ^$row->[3]\n";
171	next;
172	} elsif ($last) {
173	push @m, $last;
174	}
175
176	push @m, $row;
177	}
178
179	push @m, $last if ($last);
180
181	return @m;
182	}
183
184	=head1 Functions to create C<data_structure>
185
186	Those functions generally have to first in your normalization file.
187
188	=head2 tag
189
190	Define new tag for I<search> and I<display>.
191
192	tag('Title', rec('200','a') );
193
194
195	=cut
196
197	sub tag {
198	my $name = shift or die "tag needs name as first argument";
199	my @o = grep { defined($_) && $_ ne '' } @_;
200	return unless (@o);
201	$out->{$name}->{tag} = $name;
202	$out->{$name}->{search} = \@o;
203	$out->{$name}->{display} = \@o;
204	}
205
206	=head2 display
207
208	Define tag just for I<display>
209
210	@v = display('Title', rec('200','a') );
211
212	=cut
213
214	sub display {
215	my $name = shift or die "display needs name as first argument";
216	my @o = grep { defined($_) && $_ ne '' } @_;
217	return unless (@o);
218	$out->{$name}->{tag} = $name;
219	$out->{$name}->{display} = \@o;
220	}
221
222	=head2 search
223
224	Prepare values just for I<search>
225
226	@v = search('Title', rec('200','a') );
227
228	=cut
229
230	sub search {
231	my $name = shift or die "search needs name as first argument";
232	my @o = grep { defined($_) && $_ ne '' } @_;
233	return unless (@o);
234	$out->{$name}->{tag} = $name;
235	$out->{$name}->{search} = \@o;
236	}
237
238	=head2 marc21
239
240	Save value for MARC field
241
242	marc21('900','a', rec('200','a') );
243
244	=cut
245
246	sub marc21 {
247	my $f = shift or die "marc21 needs field";
248	die "marc21 field must be numer" unless ($f =~ /^\d+$/);
249
250	my $sf = shift or die "marc21 needs subfield";
251
252	foreach (@_) {
253	my $v = $_; # make var read-write for Encode
254	next unless (defined($v) && $v !~ /^\s+$/);
255	from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
256	push @{ $marc21 }, [ $f, ' ', ' ', $sf => $v ];
257	}
258	}
259
260	=head1 Functions to extract data from input
261
262	This function should be used inside functions to create C<data_structure> described
263	above.
264
265	=head2 rec1
266
267	Return all values in some field
268
269	@v = rec1('200')
270
271	TODO: order of values is probably same as in source data, need to investigate that
272
273	=cut
274
275	sub rec1 {
276	my $f = shift;
277	return unless (defined($rec) && defined($rec->{$f}));
278	if (ref($rec->{$f}) eq 'ARRAY') {
279	return map {
280	if (ref($_) eq 'HASH') {
281	values %{$_};
282	} else {
283	$_;
284	}
285	} @{ $rec->{$f} };
286	} elsif( defined($rec->{$f}) ) {
287	return $rec->{$f};
288	}
289	}
290
291	=head2 rec2
292
293	Return all values in specific field and subfield
294
295	@v = rec2('200','a')
296
297	=cut
298
299	sub rec2 {
300	my $f = shift;
301	return unless (defined($rec && $rec->{$f}));
302	my $sf = shift;
303	return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
304	}
305
306	=head2 rec
307
308	syntaxtic sugar for
309
310	@v = rec('200')
311	@v = rec('200','a')
312
313	=cut
314
315	sub rec {
316	if ($#_ == 0) {
317	return rec1(@_);
318	} elsif ($#_ == 1) {
319	return rec2(@_);
320	}
321	}
322
323	=head2 regex
324
325	Apply regex to some or all values
326
327	@v = regex( 's/foo/bar/g', @v );
328
329	=cut
330
331	sub regex {
332	my $r = shift;
333	my @out;
334	#warn "r: $r\n",Dumper(\@_);
335	foreach my $t (@_) {
336	next unless ($t);
337	eval "\$t =~ $r";
338	push @out, $t if ($t && $t ne '');
339	}
340	return @out;
341	}
342
343	=head2 prefix
344
345	Prefix all values with a string
346
347	@v = prefix( 'my_', @v );
348
349	=cut
350
351	sub prefix {
352	my $p = shift or die "prefix needs string as first argument";
353	return map { $p . $_ } grep { defined($_) } @_;
354	}
355
356	=head2 suffix
357
358	suffix all values with a string
359
360	@v = suffix( '_my', @v );
361
362	=cut
363
364	sub suffix {
365	my $s = shift or die "suffix needs string as first argument";
366	return map { $_ . $s } grep { defined($_) } @_;
367	}
368
369	=head2 surround
370
371	surround all values with a two strings
372
373	@v = surround( 'prefix_', '_suffix', @v );
374
375	=cut
376
377	sub surround {
378	my $p = shift or die "surround need prefix as first argument";
379	my $s = shift or die "surround needs suffix as second argument";
380	return map { $p . $_ . $s } grep { defined($_) } @_;
381	}
382
383	=head2 first
384
385	Return first element
386
387	$v = first( @v );
388
389	=cut
390
391	sub first {
392	my $r = shift;
393	return $r;
394	}
395
396	=head2 lookup
397
398	Consult lookup hashes for some value
399
400	@v = lookup( $v );
401	@v = lookup( @v );
402
403	=cut
404
405	sub lookup {
406	my $k = shift or return;
407	return unless (defined($lookup->{$k}));
408	if (ref($lookup->{$k}) eq 'ARRAY') {
409	return @{ $lookup->{$k} };
410	} else {
411	return $lookup->{$k};
412	}
413	}
414
415	=head2 join_with
416
417	Joins walues with some delimiter
418
419	$v = join_with(", ", @v);
420
421	=cut
422
423	sub join_with {
424	my $d = shift;
425	return join($d, grep { defined($_) && $_ ne '' } @_);
426	}
427
428	# END
429	1;