/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 436 - (hide annotations)
Sun Apr 30 12:17:19 2006 UTC (16 years, 9 months ago) by dpavlin
File size: 18199 byte(s)
 r531@llin:  dpavlin | 2006-04-30 14:18:00 +0200
 fix warning on undef vars

1 dpavlin 10 package WebPAC::Normalize;
2    
3     use warnings;
4     use strict;
5 dpavlin 368 use blib;
6     use WebPAC::Common;
7 dpavlin 29 use base 'WebPAC::Common';
8 dpavlin 13 use Data::Dumper;
9 dpavlin 10
10     =head1 NAME
11    
12 dpavlin 15 WebPAC::Normalize - data mungling for normalisation
13 dpavlin 10
14     =head1 VERSION
15    
16 dpavlin 433 Version 0.09
17 dpavlin 10
18     =cut
19    
20 dpavlin 433 our $VERSION = '0.09';
21 dpavlin 10
22     =head1 SYNOPSIS
23    
24 dpavlin 15 This package contains code that mungle data to produce normalized format.
25 dpavlin 10
26 dpavlin 15 It contains several assumptions:
27    
28     =over
29    
30     =item *
31    
32     format of fields is defined using C<v123^a> notation for repeatable fields
33     or C<s123^a> for single (or first) value, where C<123> is field number and
34     C<a> is subfield.
35    
36     =item *
37    
38     source data records (C<$rec>) have unique identifiers in field C<000>
39    
40     =item *
41    
42     optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
43     perl code that is evaluated before producing output (value of field will be
44     interpolated before that)
45    
46     =item *
47    
48     optional C<filter{filter_name}> at B<begining of format> will apply perl
49     code defined as code ref on format after field substitution to producing
50     output
51    
52 dpavlin 260 There is one built-in filter called C<regex> which can be use like this:
53    
54     filter{regex(s/foo/bar/)}
55    
56 dpavlin 15 =item *
57    
58     optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
59    
60     =item *
61    
62     at end, optional C<format>s rules are resolved. Format rules are similar to
63     C<sprintf> and can also contain C<lookup{...}> which is performed after
64     values are inserted in format.
65    
66     =back
67    
68     This also describes order in which transformations are applied (eval,
69     filter, lookup, format) which is important to undestand when deciding how to
70     solve your data mungling and normalisation process.
71    
72    
73    
74    
75 dpavlin 10 =head1 FUNCTIONS
76    
77 dpavlin 13 =head2 new
78 dpavlin 10
79 dpavlin 13 Create new normalisation object
80    
81     my $n = new WebPAC::Normalize::Something(
82 dpavlin 15 filter => {
83     'filter_name_1' => sub {
84     # filter code
85     return length($_);
86     }, ...
87     },
88 dpavlin 29 db => $db_obj,
89 dpavlin 13 lookup_regex => $lookup->regex,
90 dpavlin 31 lookup => $lookup_obj,
91 dpavlin 219 prefix => 'foobar',
92 dpavlin 13 );
93    
94 dpavlin 15 Parametar C<filter> defines user supplied snippets of perl code which can
95     be use with C<filter{...}> notation.
96    
97 dpavlin 219 C<prefix> is used to form filename for database record (to support multiple
98     source files which are joined in one database).
99    
100 dpavlin 13 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
101 dpavlin 31 in structures. If you pass this parametar, you must also pass C<lookup>
102     which is C<WebPAC::Lookup> object.
103 dpavlin 13
104 dpavlin 10 =cut
105    
106 dpavlin 13 sub new {
107     my $class = shift;
108     my $self = {@_};
109     bless($self, $class);
110    
111 dpavlin 31 my $r = $self->{'lookup_regex'} ? 1 : 0;
112     my $l = $self->{'lookup'} ? 1 : 0;
113    
114     my $log = $self->_get_logger();
115    
116     # those two must be in pair
117     if ( ($r & $l) != ($r || $l) ) {
118     my $log = $self->_get_logger();
119     $log->logdie("lookup_regex and lookup must be in pair");
120     }
121    
122     $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
123    
124 dpavlin 219 $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
125    
126 dpavlin 252 $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
127    
128 dpavlin 295 if (! $self->{filter} || ! $self->{filter}->{regex}) {
129 dpavlin 260 $log->debug("adding built-in filter regex");
130     $self->{filter}->{regex} = sub {
131     my ($val, $regex) = @_;
132     eval "\$val =~ $regex";
133     return $val;
134     };
135     }
136    
137 dpavlin 13 $self ? return $self : return undef;
138 dpavlin 10 }
139    
140 dpavlin 433 =head2 all_tags
141 dpavlin 13
142 dpavlin 433 Returns all tags in document in specified order
143    
144     my $sorted_tags = $self->all_tags();
145    
146     =cut
147    
148     sub all_tags {
149     my $self = shift;
150    
151     if (! $self->{_tags_by_order}) {
152    
153     my $log = $self->_get_logger;
154     # sanity check
155     $log->logdie("can't find self->{inport_xml}->{indexer}") unless ($self->{import_xml}->{indexer});
156    
157     my @tags = keys %{ $self->{'import_xml'}->{'indexer'}};
158     $log->debug("unsorted tags: " . join(", ", @tags));
159    
160     @tags = sort { $self->_sort_by_order } @tags;
161    
162     $log->debug("sorted tags: " . join(",", @tags) );
163    
164     $self->{_tags_by_order} = \@tags;
165     }
166    
167     return $self->{_tags_by_order};
168     }
169    
170    
171    
172 dpavlin 13 =head2 data_structure
173    
174     Create in-memory data structure which represents normalized layout from
175     C<conf/normalize/*.xml>.
176    
177     This structures are used to produce output.
178    
179 dpavlin 70 my $ds = $webpac->data_structure($rec);
180 dpavlin 13
181     =cut
182    
183     sub data_structure {
184     my $self = shift;
185    
186     my $log = $self->_get_logger();
187    
188     my $rec = shift;
189     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
190    
191 dpavlin 125 $log->debug("data_structure rec = ", sub { Dumper($rec) });
192    
193 dpavlin 312 $log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));
194 dpavlin 125
195 dpavlin 219 my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
196 dpavlin 125
197 dpavlin 13 my $cache_file;
198    
199 dpavlin 18 if ($self->{'db'}) {
200 dpavlin 219 my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
201 dpavlin 70 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
202     return $ds if ($ds);
203 dpavlin 29 $log->debug("cache miss, creating");
204 dpavlin 13 }
205    
206 dpavlin 433 my $tags = $self->all_tags();
207 dpavlin 13
208 dpavlin 433 $log->debug("tags: ",sub { join(", ",@{ $tags }) });
209    
210 dpavlin 70 my $ds;
211 dpavlin 13
212 dpavlin 433 foreach my $field (@{ $tags }) {
213 dpavlin 13
214     my $row;
215    
216     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
217    
218     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
219 dpavlin 38 my $format;
220 dpavlin 13
221 dpavlin 38 $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
222     $format = $tag->{'value'} || $tag->{'content'};
223    
224 dpavlin 13 my @v;
225     if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
226 dpavlin 372 @v = $self->_rec_to_arr($rec,$format,'fill_in');
227 dpavlin 13 } else {
228 dpavlin 372 @v = $self->_rec_to_arr($rec,$format,'parse');
229 dpavlin 13 }
230 dpavlin 364 if (! @v) {
231     $log->debug("$field <",$self->{tag},"> format: $format no values");
232 dpavlin 373 next;
233 dpavlin 364 } else {
234     $log->debug("$field <",$self->{tag},"> format: $format values: ", join(",", @v));
235     }
236 dpavlin 13
237     if ($tag->{'sort'}) {
238     @v = $self->sort_arr(@v);
239     }
240    
241     # use format?
242     if ($tag->{'format_name'}) {
243     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
244     }
245    
246     # delimiter will join repeatable fields
247     if ($tag->{'delimiter'}) {
248     @v = ( join($tag->{'delimiter'}, @v) );
249     }
250    
251     # default types
252 dpavlin 74 my @types = qw(display search);
253 dpavlin 13 # override by type attribute
254     @types = ( $tag->{'type'} ) if ($tag->{'type'});
255    
256     foreach my $type (@types) {
257     # append to previous line?
258 dpavlin 364 $log->debug("tag $field / $type [",sub { join(",",@v) }, "] ", $row->{'append'} || 'no append');
259 dpavlin 13 if ($tag->{'append'}) {
260    
261     # I will delimit appended part with
262     # delimiter (or ,)
263     my $d = $tag->{'delimiter'};
264     # default delimiter
265     $d ||= " ";
266    
267     my $last = pop @{$row->{$type}};
268     $d = "" if (! $last);
269     $last .= $d . join($d, @v);
270     push @{$row->{$type}}, $last;
271    
272     } else {
273     push @{$row->{$type}}, @v;
274     }
275     }
276    
277    
278     }
279    
280     if ($row) {
281     $row->{'tag'} = $field;
282    
283     # TODO: name_sigular, name_plural
284     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
285 dpavlin 70 my $row_name = $name ? $self->_x($name) : $field;
286 dpavlin 13
287     # post-sort all values in field
288     if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
289     $log->warn("sort at field tag not implemented");
290     }
291    
292 dpavlin 70 $ds->{$row_name} = $row;
293 dpavlin 13
294     $log->debug("row $field: ",sub { Dumper($row) });
295     }
296    
297     }
298    
299 dpavlin 22 $self->{'db'}->save_ds(
300 dpavlin 219 id => $id,
301 dpavlin 70 ds => $ds,
302 dpavlin 219 prefix => $self->{prefix},
303 dpavlin 18 ) if ($self->{'db'});
304 dpavlin 13
305 dpavlin 70 $log->debug("ds: ", sub { Dumper($ds) });
306 dpavlin 29
307 dpavlin 70 $log->logconfess("data structure returned is not array any more!") if wantarray;
308 dpavlin 13
309 dpavlin 70 return $ds;
310    
311 dpavlin 13 }
312    
313     =head2 parse
314    
315     Perform smart parsing of string, skipping delimiters for fields which aren't
316     defined. It can also eval code in format starting with C<eval{...}> and
317     return output or nothing depending on eval code.
318    
319     my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
320    
321 dpavlin 260 Filters are implemented here. While simple form of filters looks like this:
322    
323     filter{name_of_filter}
324    
325     but, filters can also have variable number of parametars like this:
326    
327     filter{name_of_filter(param,param,param)}
328    
329 dpavlin 13 =cut
330    
331 dpavlin 261 my $warn_once;
332    
333 dpavlin 13 sub parse {
334     my $self = shift;
335    
336 dpavlin 371 my ($rec, $format_utf8, $i, $rec_size) = @_;
337 dpavlin 13
338     return if (! $format_utf8);
339    
340     my $log = $self->_get_logger();
341    
342     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
343    
344     $i = 0 if (! $i);
345    
346     my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
347    
348     my @out;
349    
350 dpavlin 340 $log->debug("format: $format [$i]");
351 dpavlin 13
352     my $eval_code;
353     # remove eval{...} from beginning
354     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
355    
356     my $filter_name;
357     # remove filter{...} from beginning
358     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
359    
360 dpavlin 317 # did we found any (att all) field from format in row?
361 dpavlin 340 my $found_any;
362 dpavlin 317 # prefix before first field which we preserve it $found_any
363 dpavlin 13 my $prefix;
364    
365 dpavlin 317 my $f_step = 1;
366    
367 dpavlin 13 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
368    
369     my $del = $1 || '';
370 dpavlin 317 $prefix = $del if ($f_step == 1);
371 dpavlin 13
372 dpavlin 340 my $fld_type = lc($2);
373    
374 dpavlin 13 # repeatable index
375     my $r = $i;
376 dpavlin 340 if ($fld_type eq 's') {
377     if ($found_any->{'v'}) {
378     $r = 0;
379     } else {
380     return;
381     }
382     }
383 dpavlin 13
384     my $found = 0;
385 dpavlin 371 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found,$rec_size);
386 dpavlin 13
387     if ($found) {
388 dpavlin 340 $found_any->{$fld_type} += $found;
389 dpavlin 317
390     # we will skip delimiter before first occurence of field!
391 dpavlin 344 push @out, $del unless($found_any->{$fld_type} == 1);
392 dpavlin 436 push @out, $tmp if ($tmp);
393 dpavlin 13 }
394 dpavlin 317 $f_step++;
395 dpavlin 13 }
396    
397 dpavlin 340 # test if any fields found?
398     return if (! $found_any->{'v'} && ! $found_any->{'s'});
399 dpavlin 13
400     my $out = join('',@out);
401    
402     if ($out) {
403     # add rest of format (suffix)
404     $out .= $format;
405    
406     # add prefix if not there
407     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
408    
409     $log->debug("result: $out");
410     }
411    
412     if ($eval_code) {
413     my $eval = $self->fill_in($rec,$eval_code,$i) || return;
414     $log->debug("about to eval{$eval} format: $out");
415     return if (! $self->_eval($eval));
416     }
417    
418 dpavlin 260 if ($filter_name) {
419     my @filter_args;
420     if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
421     @filter_args = split(/,/, $2);
422     }
423     if ($self->{'filter'}->{$filter_name}) {
424     $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
425     unshift @filter_args, $out;
426     $out = $self->{'filter'}->{$filter_name}->(@filter_args);
427     return unless(defined($out));
428     $log->debug("filter result: $out");
429 dpavlin 261 } elsif (! $warn_once->{$filter_name}) {
430 dpavlin 260 $log->warn("trying to use undefined filter $filter_name");
431 dpavlin 261 $warn_once->{$filter_name}++;
432 dpavlin 260 }
433 dpavlin 13 }
434    
435     return $out;
436     }
437    
438 dpavlin 15 =head2 fill_in
439    
440     Workhourse of all: takes record from in-memory structure of database and
441     strings with placeholders and returns string or array of with substituted
442     values from record.
443    
444     my $text = $webpac->fill_in($rec,'v250^a');
445    
446     Optional argument is ordinal number for repeatable fields. By default,
447     it's assume to be first repeatable field (fields are perl array, so first
448     element is 0).
449     Following example will read second value from repeatable field.
450    
451     my $text = $webpac->fill_in($rec,'Title: v250^a',1);
452    
453     This function B<does not> perform parsing of format to inteligenty skip
454     delimiters before fields which aren't used.
455    
456     This method will automatically decode UTF-8 string to local code page
457     if needed.
458    
459 dpavlin 371 There is optional parametar C<$record_size> which can be used to get sizes of
460     all C<field^subfield> combinations in this format.
461    
462     my $text = $webpac->fill_in($rec,'got: v900^a v900^x',0,\$rec_size);
463    
464 dpavlin 15 =cut
465    
466     sub fill_in {
467     my $self = shift;
468    
469     my $log = $self->_get_logger();
470    
471 dpavlin 371 my ($rec,$format,$i,$rec_size) = @_;
472    
473     $log->logconfess("need data record") unless ($rec);
474     $log->logconfess("need format to parse") unless($format);
475    
476 dpavlin 15 # iteration (for repeatable fields)
477 dpavlin 371 $i ||= 0;
478 dpavlin 15
479     $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
480    
481     # FIXME remove for speedup?
482     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
483    
484     if (utf8::is_utf8($format)) {
485     $format = $self->_x($format);
486     }
487    
488     my $found = 0;
489 dpavlin 333 my $just_single = 1;
490 dpavlin 15
491     my $eval_code;
492     # remove eval{...} from beginning
493     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
494    
495     my $filter_name;
496     # remove filter{...} from beginning
497     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
498    
499 dpavlin 397 {
500     # fix warnings
501     no warnings 'uninitialized';
502 dpavlin 364
503 dpavlin 397 # do actual replacement of placeholders
504     # repeatable fields
505     if ($format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found,$rec_size)/ges) {
506     $just_single = 0;
507     }
508    
509     # non-repeatable fields
510     if ($format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found,$rec_size)/ges) {
511     return if ($i > 0 && $just_single);
512     }
513 dpavlin 333 }
514 dpavlin 15
515     if ($found) {
516     $log->debug("format: $format");
517     if ($eval_code) {
518     my $eval = $self->fill_in($rec,$eval_code,$i);
519     return if (! $self->_eval($eval));
520     }
521     if ($filter_name && $self->{'filter'}->{$filter_name}) {
522     $log->debug("filter '$filter_name' for $format");
523     $format = $self->{'filter'}->{$filter_name}->($format);
524     return unless(defined($format));
525     $log->debug("filter result: $format");
526     }
527     # do we have lookups?
528     if ($self->{'lookup'}) {
529 dpavlin 31 if ($self->{'lookup'}->can('lookup')) {
530 dpavlin 252 my @lookup = $self->{lookup}->lookup($format);
531 dpavlin 253 $log->debug("lookup $format", join(", ", @lookup));
532 dpavlin 252 return @lookup;
533 dpavlin 31 } else {
534     $log->warn("Have lookup object but can't invoke lookup method");
535     }
536 dpavlin 15 } else {
537     return $format;
538     }
539     } else {
540     return;
541     }
542     }
543    
544    
545 dpavlin 372 =head2 _rec_to_arr
546 dpavlin 13
547 dpavlin 372 Similar to C<parse> and C<fill_in>, but returns array of all repeatable fields. Usable
548 dpavlin 13 for fields which have lookups, so they shouldn't be parsed but rather
549 dpavlin 372 C<paste>d or C<fill_id>ed. Last argument is name of operation: C<paste> or C<fill_in>.
550 dpavlin 13
551 dpavlin 372 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]','paste');
552 dpavlin 13
553     =cut
554    
555 dpavlin 372 sub _rec_to_arr {
556 dpavlin 13 my $self = shift;
557    
558 dpavlin 372 my ($rec, $format_utf8, $code) = @_;
559 dpavlin 13
560     my $log = $self->_get_logger();
561    
562     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
563     return if (! $format_utf8);
564    
565 dpavlin 373 $log->debug("using $code on $format_utf8");
566    
567 dpavlin 13 my $i = 0;
568 dpavlin 372 my $max = 0;
569 dpavlin 13 my @arr;
570 dpavlin 372 my $rec_size = {};
571 dpavlin 13
572 dpavlin 372 while ($i <= $max) {
573 dpavlin 373 my @v = $self->$code($rec,$format_utf8,$i++,\$rec_size);
574 dpavlin 372 if ($rec_size) {
575     foreach my $f (keys %{ $rec_size }) {
576     $max = $rec_size->{$f} if ($rec_size->{$f} > $max);
577     }
578 dpavlin 373 $log->debug("max set to $max");
579 dpavlin 372 undef $rec_size;
580     }
581 dpavlin 373 if (@v) {
582     push @arr, @v;
583     } else {
584 dpavlin 375 push @arr, '' if ($max > $i);
585 dpavlin 373 }
586 dpavlin 13 }
587    
588     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
589    
590     return @arr;
591     }
592    
593 dpavlin 15
594     =head2 get_data
595    
596     Returns value from record.
597    
598 dpavlin 371 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found,\$rec_size);
599 dpavlin 15
600 dpavlin 368 Required arguments are:
601 dpavlin 15
602 dpavlin 368 =over 8
603 dpavlin 15
604 dpavlin 368 =item C<$rec>
605 dpavlin 15
606 dpavlin 368 record reference
607    
608     =item C<$f>
609    
610     field
611    
612     =item C<$sf>
613    
614     optional subfield
615    
616     =item C<$i>
617    
618 dpavlin 371 index offset for repeatable values ( 0 ... $rec_size->{'400^a'} )
619 dpavlin 368
620     =item C<$found>
621    
622     optional variable that will be incremeted if preset
623    
624 dpavlin 371 =item C<$rec_size>
625 dpavlin 368
626 dpavlin 371 hash to hold maximum occurances of C<field^subfield> combinations
627 dpavlin 368 (which can be accessed using keys in same format)
628    
629     =back
630    
631 dpavlin 371 Returns value or empty string, updates C<$found> and C<rec_size>
632 dpavlin 368 if present.
633    
634 dpavlin 15 =cut
635    
636     sub get_data {
637     my $self = shift;
638    
639 dpavlin 368 my ($rec,$f,$sf,$i,$found,$cache) = @_;
640 dpavlin 15
641 dpavlin 368 return '' unless ($$rec->{$f} && ref($$rec->{$f}) eq 'ARRAY');
642    
643     if (defined($$cache)) {
644 dpavlin 371 $$cache->{ $f . ( $sf ? '^' . $sf : '' ) } ||= scalar @{ $$rec->{$f} };
645 dpavlin 368 }
646    
647     return '' unless ($$rec->{$f}->[$i]);
648    
649     {
650 dpavlin 15 no strict 'refs';
651 dpavlin 368 if (defined($sf)) {
652     $$found++ if (defined($$found) && $$rec->{$f}->[$i]->{$sf});
653 dpavlin 15 return $$rec->{$f}->[$i]->{$sf};
654 dpavlin 368 } else {
655 dpavlin 15 $$found++ if (defined($$found));
656 dpavlin 368 # it still might have subfields, just
657     # not specified, so we'll dump some debug info
658 dpavlin 15 if ($$rec->{$f}->[$i] =~ /HASH/o) {
659     my $out;
660     foreach my $k (keys %{$$rec->{$f}->[$i]}) {
661 dpavlin 436 my $v = $$rec->{$f}->[$i]->{$k};
662     $out .= '$' . $k .':' . $v if ($v);
663 dpavlin 15 }
664     return $out;
665     } else {
666     return $$rec->{$f}->[$i];
667     }
668     }
669     }
670     }
671    
672    
673     =head2 apply_format
674    
675     Apply format specified in tag with C<format_name="name"> and
676     C<format_delimiter=";;">.
677    
678     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
679    
680     Formats can contain C<lookup{...}> if you need them.
681    
682     =cut
683    
684     sub apply_format {
685     my $self = shift;
686    
687     my ($name,$delimiter,$data) = @_;
688    
689     my $log = $self->_get_logger();
690    
691     if (! $self->{'import_xml'}->{'format'}->{$name}) {
692     $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
693     return $data;
694     }
695    
696     $log->warn("no delimiter for format $name") if (! $delimiter);
697    
698     my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
699    
700     my @data = split(/\Q$delimiter\E/, $data);
701    
702     my $out = sprintf($format, @data);
703     $log->debug("using format $name [$format] on $data to produce: $out");
704    
705     if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
706 dpavlin 31 return $self->{'lookup'}->lookup($out);
707 dpavlin 15 } else {
708     return $out;
709     }
710    
711     }
712    
713 dpavlin 13 =head2 sort_arr
714    
715     Sort array ignoring case and html in data
716    
717     my @sorted = $webpac->sort_arr(@unsorted);
718    
719     =cut
720    
721     sub sort_arr {
722     my $self = shift;
723    
724     my $log = $self->_get_logger();
725    
726     # FIXME add Schwartzian Transformation?
727    
728     my @sorted = sort {
729     $a =~ s#<[^>]+/*>##;
730     $b =~ s#<[^>]+/*>##;
731     lc($b) cmp lc($a)
732     } @_;
733     $log->debug("sorted values: ",sub { join(", ",@sorted) });
734    
735     return @sorted;
736     }
737    
738    
739 dpavlin 15 =head1 INTERNAL METHODS
740    
741 dpavlin 13 =head2 _sort_by_order
742    
743     Sort xml tags data structure accoding to C<order=""> attribute.
744    
745     =cut
746    
747     sub _sort_by_order {
748     my $self = shift;
749    
750     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
751     $self->{'import_xml'}->{'indexer'}->{$a};
752     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
753     $self->{'import_xml'}->{'indexer'}->{$b};
754    
755     return $va <=> $vb;
756     }
757    
758     =head2 _x
759    
760 dpavlin 15 Convert strings from C<conf/normalize/*.xml> encoding into application
761     specific encoding (optinally specified using C<code_page> to C<new>
762     constructor).
763 dpavlin 13
764     my $text = $n->_x('normalize text string');
765    
766     This is a stub so that other modules doesn't have to implement it.
767    
768     =cut
769    
770     sub _x {
771     my $self = shift;
772     return shift;
773     }
774    
775    
776 dpavlin 10 =head1 AUTHOR
777    
778     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
779    
780     =head1 COPYRIGHT & LICENSE
781    
782     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
783    
784     This program is free software; you can redistribute it and/or modify it
785     under the same terms as Perl itself.
786    
787     =cut
788    
789 dpavlin 209 1; # End of WebPAC::Normalize

  ViewVC Help
Powered by ViewVC 1.1.26