lib/WebPAC/Normalize.pm

package WebPAC::Normalize;
use Exporter 'import';
@EXPORT = qw/
        _set_rec _set_lookup
        _get_ds _clean_ds

        tag search display
        marc21

        rec1 rec2 rec
        regex prefix suffix surround
        first lookup join_with
/;

use warnings;
use strict;

#use base qw/WebPAC::Common/;
use Data::Dumper;

=head1 NAME

WebPAC::Normalize - describe normalisaton rules using sets

=head1 VERSION

Version 0.05

=cut

our $VERSION = '0.05';

=head1 SYNOPSIS

This module uses C<conf/normalize/*.pl> files to perform normalisation
from input records using perl functions which are specialized for set
processing.

Sets are implemented as arrays, and normalisation file is valid perl, which
means that you check it's validity before running WebPAC using
C<perl -c normalize.pl>.

Normalisation can generate multiple output normalized data. For now, supported output
types (on the left side of definition) are: C<tag>, C<display>, C<search> and
C<marc21>.

=head1 FUNCTIONS

Functions which start with C<_> are private and used by WebPAC internally.
All other functions are available for use within normalisation rules.

=head2 data_structure

Return data structure

  my $ds = WebPAC::Normalize::data_structure(
        lookup => $lookup->lookup_hash,
        row => $row,
        rules => $normalize_pl_config,
  );

Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
other are optional.

This function will B<die> if normalizastion can't be evaled.

Since this function isn't exported you have to call it with 
C<WebPAC::Normalize::data_structure>.

=cut

sub data_structure {
        my $arg = {@_};

        die "need row argument" unless ($arg->{row});
        die "need normalisation argument" unless ($arg->{rules});

        no strict 'subs';
        _set_lookup( $arg->{lookup} );
        _set_rec( $arg->{row} );
        _clean_ds();

        eval "$arg->{rules}";
        die "error evaling $arg->{rules}: $@\n" if ($@);

        return _get_ds();
}

=head2 _set_rec

Set current record hash

  _set_rec( $rec );

=cut

my $rec;

sub _set_rec {
        $rec = shift or die "no record hash";
}

=head2 _get_ds

Return hash formatted as data structure

  my $ds = _get_ds();

=cut

my $out;
my $marc21;

sub _get_ds {
        return $out;
}

=head2 _clean_ds

Clean data structure hash for next record

  _clean_ds();

=cut

sub _clean_ds {
        $out = undef;
        $marc21 = undef;
}

=head2 _set_lookup

Set current lookup hash

  _set_lookup( $lookup );

=cut

my $lookup;

sub _set_lookup {
        $lookup = shift;
}

=head2 _get_marc21_fields

Get all fields defined by calls to C<marc21>

        $marc->add_fields( WebPAC::Normalize:_get_marc21_fields() );

=cut

sub _get_marc21_fields {
        return @{$marc21};
}

=head1 Functions to create C<data_structure>

Those functions generally have to first in your normalization file.

=head2 tag

Define new tag for I<search> and I<display>.

  tag('Title', rec('200','a') );


=cut

sub tag {
        my $name = shift or die "tag needs name as first argument";
        my @o = grep { defined($_) && $_ ne '' } @_;
        return unless (@o);
        $out->{$name}->{tag} = $name;
        $out->{$name}->{search} = \@o;
        $out->{$name}->{display} = \@o;
}

=head2 display

Define tag just for I<display>

  @v = display('Title', rec('200','a') );

=cut

sub display {
        my $name = shift or die "display needs name as first argument";
        my @o = grep { defined($_) && $_ ne '' } @_;
        return unless (@o);
        $out->{$name}->{tag} = $name;
        $out->{$name}->{display} = \@o;
}

=head2 search

Prepare values just for I<search>

  @v = search('Title', rec('200','a') );

=cut

sub search {
        my $name = shift or die "search needs name as first argument";
        my @o = grep { defined($_) && $_ ne '' } @_;
        return unless (@o);
        $out->{$name}->{tag} = $name;
        $out->{$name}->{search} = \@o;
}

=head2 marc21

Save value for MARC field

  marc21('900','a', rec('200','a') );

=cut

sub marc21 {
        my $f = shift or die "marc21 needs field";
        die "marc21 field must be numer" unless ($f =~ /^\d+$/);

        my $sf = shift or die "marc21 needs subfield";

        foreach my $v (@_) {
                push @{ $marc21 }, [ $f, ' ', ' ', $sf => $v ];
        }
}

=head1 Functions to extract data from input

This function should be used inside functions to create C<data_structure> described
above.

=head2 rec1

Return all values in some field

  @v = rec1('200')

TODO: order of values is probably same as in source data, need to investigate that

=cut

sub rec1 {
        my $f = shift;
        return unless (defined($rec) && defined($rec->{$f}));
        if (ref($rec->{$f}) eq 'ARRAY') {
                return map { 
                        if (ref($_) eq 'HASH') {
                                values %{$_};
                        } else {
                                $_;
                        }
                } @{ $rec->{$f} };
        } elsif( defined($rec->{$f}) ) {
                return $rec->{$f};
        }
}

=head2 rec2

Return all values in specific field and subfield

  @v = rec2('200','a')

=cut

sub rec2 {
        my $f = shift;
        return unless (defined($rec && $rec->{$f}));
        my $sf = shift;
        return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
}

=head2 rec

syntaxtic sugar for

  @v = rec('200')
  @v = rec('200','a')

=cut

sub rec {
        if ($#_ == 0) {
                return rec1(@_);
        } elsif ($#_ == 1) {
                return rec2(@_);
        }
}

=head2 regex

Apply regex to some or all values

  @v = regex( 's/foo/bar/g', @v );

=cut

sub regex {
        my $r = shift;
        my @out;
        #warn "r: $r\n",Dumper(\@_);
        foreach my $t (@_) {
                next unless ($t);
                eval "\$t =~ $r";
                push @out, $t if ($t && $t ne '');
        }
        return @out;
}

=head2 prefix

Prefix all values with a string

  @v = prefix( 'my_', @v );

=cut

sub prefix {
        my $p = shift or die "prefix needs string as first argument";
        return map { $p . $_ } grep { defined($_) } @_;
}

=head2 suffix

suffix all values with a string

  @v = suffix( '_my', @v );

=cut

sub suffix {
        my $s = shift or die "suffix needs string as first argument";
        return map { $_ . $s } grep { defined($_) } @_;
}

=head2 surround

surround all values with a two strings

  @v = surround( 'prefix_', '_suffix', @v );

=cut

sub surround {
        my $p = shift or die "surround need prefix as first argument";
        my $s = shift or die "surround needs suffix as second argument";
        return map { $p . $_ . $s } grep { defined($_) } @_;
}

=head2 first

Return first element

  $v = first( @v );

=cut

sub first {
        my $r = shift;
        return $r;
}

=head2 lookup

Consult lookup hashes for some value

  @v = lookup( $v );
  @v = lookup( @v );

=cut

sub lookup {
        my $k = shift or return;
        return unless (defined($lookup->{$k}));
        if (ref($lookup->{$k}) eq 'ARRAY') {
                return @{ $lookup->{$k} };
        } else {
                return $lookup->{$k};
        }
}

=head2 join_with

Joins walues with some delimiter

  $v = join_with(", ", @v);

=cut

sub join_with {
        my $d = shift;
        return join($d, grep { defined($_) && $_ ne '' } @_);
}

# END
1;
1	package WebPAC::Normalize;
2	use Exporter 'import';
3	@EXPORT = qw/
4	_set_rec _set_lookup
5	_get_ds _clean_ds
6
7	tag search display
8	marc21
9
10	rec1 rec2 rec
11	regex prefix suffix surround
12	first lookup join_with
13	/;
14
15	use warnings;
16	use strict;
17
18	#use base qw/WebPAC::Common/;
19	use Data::Dumper;
20
21	=head1 NAME
22
23	WebPAC::Normalize - describe normalisaton rules using sets
24
25	=head1 VERSION
26
27	Version 0.05
28
29	=cut
30
31	our $VERSION = '0.05';
32
33	=head1 SYNOPSIS
34
35	This module uses C<conf/normalize/*.pl> files to perform normalisation
36	from input records using perl functions which are specialized for set
37	processing.
38
39	Sets are implemented as arrays, and normalisation file is valid perl, which
40	means that you check it's validity before running WebPAC using
41	C<perl -c normalize.pl>.
42
43	Normalisation can generate multiple output normalized data. For now, supported output
44	types (on the left side of definition) are: C<tag>, C<display>, C<search> and
45	C<marc21>.
46
47	=head1 FUNCTIONS
48
49	Functions which start with C<_> are private and used by WebPAC internally.
50	All other functions are available for use within normalisation rules.
51
52	=head2 data_structure
53
54	Return data structure
55
56	my $ds = WebPAC::Normalize::data_structure(
57	lookup => $lookup->lookup_hash,
58	row => $row,
59	rules => $normalize_pl_config,
60	);
61
62	Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
63	other are optional.
64
65	This function will B<die> if normalizastion can't be evaled.
66
67	Since this function isn't exported you have to call it with
68	C<WebPAC::Normalize::data_structure>.
69
70	=cut
71
72	sub data_structure {
73	my $arg = {@_};
74
75	die "need row argument" unless ($arg->{row});
76	die "need normalisation argument" unless ($arg->{rules});
77
78	no strict 'subs';
79	_set_lookup( $arg->{lookup} );
80	_set_rec( $arg->{row} );
81	_clean_ds();
82
83	eval "$arg->{rules}";
84	die "error evaling $arg->{rules}: $@\n" if ($@);
85
86	return _get_ds();
87	}
88
89	=head2 _set_rec
90
91	Set current record hash
92
93	_set_rec( $rec );
94
95	=cut
96
97	my $rec;
98
99	sub _set_rec {
100	$rec = shift or die "no record hash";
101	}
102
103	=head2 _get_ds
104
105	Return hash formatted as data structure
106
107	my $ds = _get_ds();
108
109	=cut
110
111	my $out;
112	my $marc21;
113
114	sub _get_ds {
115	return $out;
116	}
117
118	=head2 _clean_ds
119
120	Clean data structure hash for next record
121
122	_clean_ds();
123
124	=cut
125
126	sub _clean_ds {
127	$out = undef;
128	$marc21 = undef;
129	}
130
131	=head2 _set_lookup
132
133	Set current lookup hash
134
135	_set_lookup( $lookup );
136
137	=cut
138
139	my $lookup;
140
141	sub _set_lookup {
142	$lookup = shift;
143	}
144
145	=head2 _get_marc21_fields
146
147	Get all fields defined by calls to C<marc21>
148
149	$marc->add_fields( WebPAC::Normalize:_get_marc21_fields() );
150
151	=cut
152
153	sub _get_marc21_fields {
154	return @{$marc21};
155	}
156
157	=head1 Functions to create C<data_structure>
158
159	Those functions generally have to first in your normalization file.
160
161	=head2 tag
162
163	Define new tag for I<search> and I<display>.
164
165	tag('Title', rec('200','a') );
166
167
168	=cut
169
170	sub tag {
171	my $name = shift or die "tag needs name as first argument";
172	my @o = grep { defined($_) && $_ ne '' } @_;
173	return unless (@o);
174	$out->{$name}->{tag} = $name;
175	$out->{$name}->{search} = \@o;
176	$out->{$name}->{display} = \@o;
177	}
178
179	=head2 display
180
181	Define tag just for I<display>
182
183	@v = display('Title', rec('200','a') );
184
185	=cut
186
187	sub display {
188	my $name = shift or die "display needs name as first argument";
189	my @o = grep { defined($_) && $_ ne '' } @_;
190	return unless (@o);
191	$out->{$name}->{tag} = $name;
192	$out->{$name}->{display} = \@o;
193	}
194
195	=head2 search
196
197	Prepare values just for I<search>
198
199	@v = search('Title', rec('200','a') );
200
201	=cut
202
203	sub search {
204	my $name = shift or die "search needs name as first argument";
205	my @o = grep { defined($_) && $_ ne '' } @_;
206	return unless (@o);
207	$out->{$name}->{tag} = $name;
208	$out->{$name}->{search} = \@o;
209	}
210
211	=head2 marc21
212
213	Save value for MARC field
214
215	marc21('900','a', rec('200','a') );
216
217	=cut
218
219	sub marc21 {
220	my $f = shift or die "marc21 needs field";
221	die "marc21 field must be numer" unless ($f =~ /^\d+$/);
222
223	my $sf = shift or die "marc21 needs subfield";
224
225	foreach my $v (@_) {
226	push @{ $marc21 }, [ $f, ' ', ' ', $sf => $v ];
227	}
228	}
229
230	=head1 Functions to extract data from input
231
232	This function should be used inside functions to create C<data_structure> described
233	above.
234
235	=head2 rec1
236
237	Return all values in some field
238
239	@v = rec1('200')
240
241	TODO: order of values is probably same as in source data, need to investigate that
242
243	=cut
244
245	sub rec1 {
246	my $f = shift;
247	return unless (defined($rec) && defined($rec->{$f}));
248	if (ref($rec->{$f}) eq 'ARRAY') {
249	return map {
250	if (ref($_) eq 'HASH') {
251	values %{$_};
252	} else {
253	$_;
254	}
255	} @{ $rec->{$f} };
256	} elsif( defined($rec->{$f}) ) {
257	return $rec->{$f};
258	}
259	}
260
261	=head2 rec2
262
263	Return all values in specific field and subfield
264
265	@v = rec2('200','a')
266
267	=cut
268
269	sub rec2 {
270	my $f = shift;
271	return unless (defined($rec && $rec->{$f}));
272	my $sf = shift;
273	return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
274	}
275
276	=head2 rec
277
278	syntaxtic sugar for
279
280	@v = rec('200')
281	@v = rec('200','a')
282
283	=cut
284
285	sub rec {
286	if ($#_ == 0) {
287	return rec1(@_);
288	} elsif ($#_ == 1) {
289	return rec2(@_);
290	}
291	}
292
293	=head2 regex
294
295	Apply regex to some or all values
296
297	@v = regex( 's/foo/bar/g', @v );
298
299	=cut
300
301	sub regex {
302	my $r = shift;
303	my @out;
304	#warn "r: $r\n",Dumper(\@_);
305	foreach my $t (@_) {
306	next unless ($t);
307	eval "\$t =~ $r";
308	push @out, $t if ($t && $t ne '');
309	}
310	return @out;
311	}
312
313	=head2 prefix
314
315	Prefix all values with a string
316
317	@v = prefix( 'my_', @v );
318
319	=cut
320
321	sub prefix {
322	my $p = shift or die "prefix needs string as first argument";
323	return map { $p . $_ } grep { defined($_) } @_;
324	}
325
326	=head2 suffix
327
328	suffix all values with a string
329
330	@v = suffix( '_my', @v );
331
332	=cut
333
334	sub suffix {
335	my $s = shift or die "suffix needs string as first argument";
336	return map { $_ . $s } grep { defined($_) } @_;
337	}
338
339	=head2 surround
340
341	surround all values with a two strings
342
343	@v = surround( 'prefix_', '_suffix', @v );
344
345	=cut
346
347	sub surround {
348	my $p = shift or die "surround need prefix as first argument";
349	my $s = shift or die "surround needs suffix as second argument";
350	return map { $p . $_ . $s } grep { defined($_) } @_;
351	}
352
353	=head2 first
354
355	Return first element
356
357	$v = first( @v );
358
359	=cut
360
361	sub first {
362	my $r = shift;
363	return $r;
364	}
365
366	=head2 lookup
367
368	Consult lookup hashes for some value
369
370	@v = lookup( $v );
371	@v = lookup( @v );
372
373	=cut
374
375	sub lookup {
376	my $k = shift or return;
377	return unless (defined($lookup->{$k}));
378	if (ref($lookup->{$k}) eq 'ARRAY') {
379	return @{ $lookup->{$k} };
380	} else {
381	return $lookup->{$k};
382	}
383	}
384
385	=head2 join_with
386
387	Joins walues with some delimiter
388
389	$v = join_with(", ", @v);
390
391	=cut
392
393	sub join_with {
394	my $d = shift;
395	return join($d, grep { defined($_) && $_ ne '' } @_);
396	}
397
398	# END
399	1;