lib/WebPAC/Normalize.pm

package WebPAC::Normalize;
use Exporter 'import';
@EXPORT = qw/
        _set_rec _set_lookup
        _get_ds _clean_ds

        tag search display
        marc21

        rec1 rec2 rec
        regex prefix suffix surround
        first lookup join_with
/;

use warnings;
use strict;

#use base qw/WebPAC::Common/;
use Data::Dumper;

=head1 NAME

WebPAC::Normalize - describe normalisaton rules using sets

=head1 VERSION

Version 0.05

=cut

our $VERSION = '0.05';

=head1 SYNOPSIS

This module uses C<conf/normalize/*.pl> files to perform normalisation
from input records using perl functions which are specialized for set
processing.

Sets are implemented as arrays, and normalisation file is valid perl, which
means that you check it's validity before running WebPAC using
C<perl -c normalize.pl>.

Normalisation can generate multiple output normalized data. For now, supported output
types (on the left side of definition) are: C<tag>, C<display>, C<search> and
C<marc21>.

=head1 FUNCTIONS

Functions which start with C<_> are private and used by WebPAC internally.
All other functions are available for use within normalisation rules.

=head2 data_structure

Return data structure

  my $ds = WebPAC::Normalize::data_structure(
        lookup => $lookup->lookup_hash,
        row => $row,
        rules => $normalize_pl_config,
  );

Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
other are optional.

This function will B<die> if normalizastion can't be evaled.

Since this function isn't exported you have to call it with 
C<WebPAC::Normalize::data_structure>.

=cut

sub data_structure {
        my $arg = {@_};

        die "need row argument" unless ($arg->{row});
        die "need normalisation argument" unless ($arg->{rules});

        no strict 'subs';
        _set_lookup( $arg->{lookup} );
        _set_rec( $arg->{row} );
        _clean_ds();

        eval "$arg->{rules}";
        die "error evaling $arg->{rules}: $@\n" if ($@);

        return _get_ds();
}

=head2 _set_rec

Set current record hash

  _set_rec( $rec );

=cut

my $rec;

sub _set_rec {
        $rec = shift or die "no record hash";
}

=head2 _get_ds

Return hash formatted as data structure

  my $ds = _get_ds();

=cut

my $out;
my $marc21;

sub _get_ds {
        return $out;
}

=head2 _clean_ds

Clean data structure hash for next record

  _clean_ds();

=cut

sub _clean_ds {
        $out = undef;
        $marc21 = undef;
}

=head2 _set_lookup

Set current lookup hash

  _set_lookup( $lookup );

=cut

my $lookup;

sub _set_lookup {
        $lookup = shift;
}

=head2 _get_marc21_fields

Get all fields defined by calls to C<marc21>

        $marc->add_fields( WebPAC::Normalize:_get_marc21_fields() );

=cut

sub _get_marc21_fields {
        return @{$marc21};
}

=head1 Functions to create C<data_structure>

Those functions generally have to first in your normalization file.

=head2 tag

Define new tag for I<search> and I<display>.

  tag('Title', rec('200','a') );


=cut

sub tag {
        my $name = shift or die "tag needs name as first argument";
        my @o = grep { defined($_) && $_ ne '' } @_;
        return unless (@o);
        $out->{$name}->{tag} = $name;
        $out->{$name}->{search} = \@o;
        $out->{$name}->{display} = \@o;
}

=head2 display

Define tag just for I<display>

  @v = display('Title', rec('200','a') );

=cut

sub display {
        my $name = shift or die "display needs name as first argument";
        my @o = grep { defined($_) && $_ ne '' } @_;
        return unless (@o);
        $out->{$name}->{tag} = $name;
        $out->{$name}->{display} = \@o;
}

=head2 search

Prepare values just for I<search>

  @v = search('Title', rec('200','a') );

=cut

sub search {
        my $name = shift or die "search needs name as first argument";
        my @o = grep { defined($_) && $_ ne '' } @_;
        return unless (@o);
        $out->{$name}->{tag} = $name;
        $out->{$name}->{search} = \@o;
}

=head2 marc21

Save value for MARC field

  marc21('900','a', rec('200','a') );

=cut

sub marc21 {
        my $f = shift or die "marc21 needs field";
        die "marc21 field must be numer" unless ($f =~ /^\d+$/);

        my $sf = shift or die "marc21 needs subfield";

        foreach my $v (@_) {
                push @{ $marc21 }, [ $f, ' ', ' ', $sf => $v ];
        }
}

=head1 Functions to extract data from input

This function should be used inside functions to create C<data_structure> described
above.

=head2 rec1

Return all values in some field

  @v = rec1('200')

TODO: order of values is probably same as in source data, need to investigate that

=cut

sub rec1 {
        my $f = shift;
        return unless (defined($rec) && defined($rec->{$f}));
        if (ref($rec->{$f}) eq 'ARRAY') {
                return map { 
                        if (ref($_) eq 'HASH') {
                                values %{$_};
                        } else {
                                $_;
                        }
                } @{ $rec->{$f} };
        } elsif( defined($rec->{$f}) ) {
                return $rec->{$f};
        }
}

=head2 rec2

Return all values in specific field and subfield

  @v = rec2('200','a')

=cut

sub rec2 {
        my $f = shift;
        return unless (defined($rec && $rec->{$f}));
        my $sf = shift;
        return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
}

=head2 rec

syntaxtic sugar for

  @v = rec('200')
  @v = rec('200','a')

=cut

sub rec {
        if ($#_ == 0) {
                return rec1(@_);
        } elsif ($#_ == 1) {
                return rec2(@_);
        }
}

=head2 regex

Apply regex to some or all values

  @v = regex( 's/foo/bar/g', @v );

=cut

sub regex {
        my $r = shift;
        my @out;
        #warn "r: $r\n",Dumper(\@_);
        foreach my $t (@_) {
                next unless ($t);
                eval "\$t =~ $r";
                push @out, $t if ($t && $t ne '');
        }
        return @out;
}

=head2 prefix

Prefix all values with a string

  @v = prefix( 'my_', @v );

=cut

sub prefix {
        my $p = shift or die "prefix needs string as first argument";
        return map { $p . $_ } grep { defined($_) } @_;
}

=head2 suffix

suffix all values with a string

  @v = suffix( '_my', @v );

=cut

sub suffix {
        my $s = shift or die "suffix needs string as first argument";
        return map { $_ . $s } grep { defined($_) } @_;
}

=head2 surround

surround all values with a two strings

  @v = surround( 'prefix_', '_suffix', @v );

=cut

sub surround {
        my $p = shift or die "surround need prefix as first argument";
        my $s = shift or die "surround needs suffix as second argument";
        return map { $p . $_ . $s } grep { defined($_) } @_;
}

=head2 first

Return first element

  $v = first( @v );

=cut

sub first {
        my $r = shift;
        return $r;
}

=head2 lookup

Consult lookup hashes for some value

  @v = lookup( $v );
  @v = lookup( @v );

=cut

sub lookup {
        my $k = shift or return;
        return unless (defined($lookup->{$k}));
        if (ref($lookup->{$k}) eq 'ARRAY') {
                return @{ $lookup->{$k} };
        } else {
                return $lookup->{$k};
        }
}

=head2 join_with

Joins walues with some delimiter

  $v = join_with(", ", @v);

=cut

sub join_with {
        my $d = shift;
        return join($d, grep { defined($_) && $_ ne '' } @_);
}

# END
1;
1	dpavlin	10	package WebPAC::Normalize;
2	dpavlin	536	use Exporter 'import';
3			@EXPORT = qw/
4	dpavlin	538	_set_rec _set_lookup
5			_get_ds _clean_ds
6
7	dpavlin	536	tag search display
8	dpavlin	540	marc21
9
10	dpavlin	536	rec1 rec2 rec
11			regex prefix suffix surround
12			first lookup join_with
13			/;
14	dpavlin	10
15			use warnings;
16			use strict;
17	dpavlin	536
18			#use base qw/WebPAC::Common/;
19	dpavlin	13	use Data::Dumper;
20	dpavlin	10
21			=head1 NAME
22
23	dpavlin	536	WebPAC::Normalize - describe normalisaton rules using sets
24	dpavlin	10
25			=head1 VERSION
26
27	dpavlin	538	Version 0.05
28	dpavlin	10
29			=cut
30
31	dpavlin	538	our $VERSION = '0.05';
32	dpavlin	10
33			=head1 SYNOPSIS
34
35	dpavlin	536	This module uses C<conf/normalize/*.pl> files to perform normalisation
36			from input records using perl functions which are specialized for set
37			processing.
38	dpavlin	10
39	dpavlin	536	Sets are implemented as arrays, and normalisation file is valid perl, which
40			means that you check it's validity before running WebPAC using
41			C<perl -c normalize.pl>.
42	dpavlin	15
43	dpavlin	536	Normalisation can generate multiple output normalized data. For now, supported output
44	dpavlin	540	types (on the left side of definition) are: C<tag>, C<display>, C<search> and
45			C<marc21>.
46	dpavlin	15
47	dpavlin	10	=head1 FUNCTIONS
48
49	dpavlin	538	Functions which start with C<_> are private and used by WebPAC internally.
50			All other functions are available for use within normalisation rules.
51
52	dpavlin	536	=head2 data_structure
53	dpavlin	10
54	dpavlin	536	Return data structure
55	dpavlin	13
56	dpavlin	538	my $ds = WebPAC::Normalize::data_structure(
57	dpavlin	536	lookup => $lookup->lookup_hash,
58			row => $row,
59			rules => $normalize_pl_config,
60	dpavlin	13	);
61
62	dpavlin	540	Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
63			other are optional.
64
65	dpavlin	536	This function will B<die> if normalizastion can't be evaled.
66	dpavlin	15
67	dpavlin	538	Since this function isn't exported you have to call it with
68			C<WebPAC::Normalize::data_structure>.
69
70	dpavlin	10	=cut
71
72	dpavlin	536	sub data_structure {
73			my $arg = {@_};
74	dpavlin	13
75	dpavlin	536	die "need row argument" unless ($arg->{row});
76			die "need normalisation argument" unless ($arg->{rules});
77	dpavlin	31
78	dpavlin	536	no strict 'subs';
79	dpavlin	538	_set_lookup( $arg->{lookup} );
80			_set_rec( $arg->{row} );
81			_clean_ds();
82	dpavlin	540
83	dpavlin	536	eval "$arg->{rules}";
84			die "error evaling $arg->{rules}: $@\n" if ($@);
85	dpavlin	540
86	dpavlin	538	return _get_ds();
87	dpavlin	10	}
88
89	dpavlin	538	=head2 _set_rec
90	dpavlin	13
91	dpavlin	536	Set current record hash
92	dpavlin	433
93	dpavlin	538	_set_rec( $rec );
94	dpavlin	433
95			=cut
96
97	dpavlin	536	my $rec;
98	dpavlin	433
99	dpavlin	538	sub _set_rec {
100	dpavlin	536	$rec = shift or die "no record hash";
101	dpavlin	433	}
102
103	dpavlin	538	=head2 _get_ds
104
105			Return hash formatted as data structure
106
107			my $ds = _get_ds();
108
109			=cut
110
111			my $out;
112	dpavlin	540	my $marc21;
113	dpavlin	538
114			sub _get_ds {
115			return $out;
116			}
117
118			=head2 _clean_ds
119
120			Clean data structure hash for next record
121
122			_clean_ds();
123
124			=cut
125
126			sub _clean_ds {
127			$out = undef;
128	dpavlin	540	$marc21 = undef;
129	dpavlin	538	}
130
131			=head2 _set_lookup
132
133			Set current lookup hash
134
135			_set_lookup( $lookup );
136
137			=cut
138
139			my $lookup;
140
141			sub _set_lookup {
142			$lookup = shift;
143			}
144
145	dpavlin	540	=head2 _get_marc21_fields
146
147			Get all fields defined by calls to C<marc21>
148
149			$marc->add_fields( WebPAC::Normalize:_get_marc21_fields() );
150
151			=cut
152
153			sub _get_marc21_fields {
154			return @{$marc21};
155			}
156
157			=head1 Functions to create C<data_structure>
158
159			Those functions generally have to first in your normalization file.
160
161	dpavlin	536	=head2 tag
162	dpavlin	433
163	dpavlin	536	Define new tag for I<search> and I<display>.
164	dpavlin	433
165	dpavlin	536	tag('Title', rec('200','a') );
166	dpavlin	13
167
168			=cut
169
170	dpavlin	536	sub tag {
171			my $name = shift or die "tag needs name as first argument";
172			my @o = grep { defined($_) && $_ ne '' } @_;
173			return unless (@o);
174			$out->{$name}->{tag} = $name;
175			$out->{$name}->{search} = \@o;
176			$out->{$name}->{display} = \@o;
177			}
178	dpavlin	13
179	dpavlin	536	=head2 display
180	dpavlin	13
181	dpavlin	536	Define tag just for I<display>
182	dpavlin	125
183	dpavlin	536	@v = display('Title', rec('200','a') );
184	dpavlin	125
185	dpavlin	536	=cut
186	dpavlin	125
187	dpavlin	536	sub display {
188			my $name = shift or die "display needs name as first argument";
189			my @o = grep { defined($_) && $_ ne '' } @_;
190			return unless (@o);
191			$out->{$name}->{tag} = $name;
192			$out->{$name}->{display} = \@o;
193			}
194	dpavlin	13
195	dpavlin	536	=head2 search
196	dpavlin	13
197	dpavlin	536	Prepare values just for I<search>
198	dpavlin	13
199	dpavlin	536	@v = search('Title', rec('200','a') );
200	dpavlin	433
201	dpavlin	536	=cut
202	dpavlin	13
203	dpavlin	536	sub search {
204			my $name = shift or die "search needs name as first argument";
205			my @o = grep { defined($_) && $_ ne '' } @_;
206			return unless (@o);
207			$out->{$name}->{tag} = $name;
208			$out->{$name}->{search} = \@o;
209	dpavlin	13	}
210
211	dpavlin	540	=head2 marc21
212
213			Save value for MARC field
214
215			marc21('900','a', rec('200','a') );
216
217			=cut
218
219			sub marc21 {
220			my $f = shift or die "marc21 needs field";
221			die "marc21 field must be numer" unless ($f =~ /^\d+$/);
222
223			my $sf = shift or die "marc21 needs subfield";
224
225			foreach my $v (@_) {
226			push @{ $marc21 }, [ $f, ' ', ' ', $sf => $v ];
227			}
228			}
229
230			=head1 Functions to extract data from input
231
232			This function should be used inside functions to create C<data_structure> described
233			above.
234
235	dpavlin	536	=head2 rec1
236	dpavlin	371
237	dpavlin	536	Return all values in some field
238	dpavlin	371
239	dpavlin	536	@v = rec1('200')
240	dpavlin	15
241	dpavlin	536	TODO: order of values is probably same as in source data, need to investigate that
242	dpavlin	15
243	dpavlin	536	=cut
244	dpavlin	15
245	dpavlin	536	sub rec1 {
246			my $f = shift;
247			return unless (defined($rec) && defined($rec->{$f}));
248			if (ref($rec->{$f}) eq 'ARRAY') {
249			return map {
250			if (ref($_) eq 'HASH') {
251			values %{$_};
252	dpavlin	31	} else {
253	dpavlin	536	$_;
254	dpavlin	31	}
255	dpavlin	536	} @{ $rec->{$f} };
256			} elsif( defined($rec->{$f}) ) {
257			return $rec->{$f};
258	dpavlin	15	}
259			}
260
261	dpavlin	536	=head2 rec2
262	dpavlin	15
263	dpavlin	536	Return all values in specific field and subfield
264	dpavlin	13
265	dpavlin	536	@v = rec2('200','a')
266	dpavlin	13
267			=cut
268
269	dpavlin	536	sub rec2 {
270			my $f = shift;
271			return unless (defined($rec && $rec->{$f}));
272			my $sf = shift;
273			return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
274			}
275	dpavlin	13
276	dpavlin	536	=head2 rec
277	dpavlin	13
278	dpavlin	536	syntaxtic sugar for
279	dpavlin	13
280	dpavlin	536	@v = rec('200')
281			@v = rec('200','a')
282	dpavlin	13
283	dpavlin	536	=cut
284	dpavlin	373
285	dpavlin	536	sub rec {
286			if ($#_ == 0) {
287			return rec1(@_);
288			} elsif ($#_ == 1) {
289			return rec2(@_);
290	dpavlin	13	}
291			}
292
293	dpavlin	536	=head2 regex
294	dpavlin	15
295	dpavlin	536	Apply regex to some or all values
296	dpavlin	15
297	dpavlin	536	@v = regex( 's/foo/bar/g', @v );
298	dpavlin	15
299			=cut
300
301	dpavlin	536	sub regex {
302			my $r = shift;
303			my @out;
304			#warn "r: $r\n",Dumper(\@_);
305			foreach my $t (@_) {
306			next unless ($t);
307			eval "\$t =~ $r";
308			push @out, $t if ($t && $t ne '');
309	dpavlin	368	}
310	dpavlin	536	return @out;
311	dpavlin	15	}
312
313	dpavlin	536	=head2 prefix
314	dpavlin	15
315	dpavlin	536	Prefix all values with a string
316	dpavlin	15
317	dpavlin	536	@v = prefix( 'my_', @v );
318	dpavlin	15
319			=cut
320
321	dpavlin	536	sub prefix {
322			my $p = shift or die "prefix needs string as first argument";
323			return map { $p . $_ } grep { defined($_) } @_;
324			}
325	dpavlin	15
326	dpavlin	536	=head2 suffix
327	dpavlin	15
328	dpavlin	536	suffix all values with a string
329	dpavlin	15
330	dpavlin	536	@v = suffix( '_my', @v );
331	dpavlin	15
332	dpavlin	536	=cut
333	dpavlin	15
334	dpavlin	536	sub suffix {
335			my $s = shift or die "suffix needs string as first argument";
336			return map { $_ . $s } grep { defined($_) } @_;
337	dpavlin	15	}
338
339	dpavlin	536	=head2 surround
340	dpavlin	13
341	dpavlin	536	surround all values with a two strings
342	dpavlin	13
343	dpavlin	536	@v = surround( 'prefix_', '_suffix', @v );
344	dpavlin	13
345			=cut
346
347	dpavlin	536	sub surround {
348			my $p = shift or die "surround need prefix as first argument";
349			my $s = shift or die "surround needs suffix as second argument";
350			return map { $p . $_ . $s } grep { defined($_) } @_;
351	dpavlin	13	}
352
353	dpavlin	536	=head2 first
354	dpavlin	13
355	dpavlin	536	Return first element
356	dpavlin	15
357	dpavlin	536	$v = first( @v );
358	dpavlin	13
359			=cut
360
361	dpavlin	536	sub first {
362			my $r = shift;
363			return $r;
364	dpavlin	13	}
365
366	dpavlin	536	=head2 lookup
367	dpavlin	13
368	dpavlin	536	Consult lookup hashes for some value
369	dpavlin	13
370	dpavlin	536	@v = lookup( $v );
371			@v = lookup( @v );
372	dpavlin	13
373			=cut
374
375	dpavlin	536	sub lookup {
376			my $k = shift or return;
377			return unless (defined($lookup->{$k}));
378			if (ref($lookup->{$k}) eq 'ARRAY') {
379			return @{ $lookup->{$k} };
380			} else {
381			return $lookup->{$k};
382			}
383	dpavlin	13	}
384
385	dpavlin	536	=head2 join_with
386	dpavlin	13
387	dpavlin	536	Joins walues with some delimiter
388	dpavlin	10
389	dpavlin	536	$v = join_with(", ", @v);
390	dpavlin	10
391	dpavlin	536	=cut
392	dpavlin	10
393	dpavlin	536	sub join_with {
394			my $d = shift;
395			return join($d, grep { defined($_) && $_ ne '' } @_);
396			}
397	dpavlin	10
398	dpavlin	536	# END
399			1;