/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 433 - (hide annotations)
Mon Apr 17 16:01:12 2006 UTC (18 years ago) by dpavlin
File size: 18168 byte(s)
 r524@llin:  dpavlin | 2006-04-17 18:01:04 +0200
 added all_tags() to get sorted list of all tags in input xml

1 dpavlin 10 package WebPAC::Normalize;
2    
3     use warnings;
4     use strict;
5 dpavlin 368 use blib;
6     use WebPAC::Common;
7 dpavlin 29 use base 'WebPAC::Common';
8 dpavlin 13 use Data::Dumper;
9 dpavlin 10
10     =head1 NAME
11    
12 dpavlin 15 WebPAC::Normalize - data mungling for normalisation
13 dpavlin 10
14     =head1 VERSION
15    
16 dpavlin 433 Version 0.09
17 dpavlin 10
18     =cut
19    
20 dpavlin 433 our $VERSION = '0.09';
21 dpavlin 10
22     =head1 SYNOPSIS
23    
24 dpavlin 15 This package contains code that mungle data to produce normalized format.
25 dpavlin 10
26 dpavlin 15 It contains several assumptions:
27    
28     =over
29    
30     =item *
31    
32     format of fields is defined using C<v123^a> notation for repeatable fields
33     or C<s123^a> for single (or first) value, where C<123> is field number and
34     C<a> is subfield.
35    
36     =item *
37    
38     source data records (C<$rec>) have unique identifiers in field C<000>
39    
40     =item *
41    
42     optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
43     perl code that is evaluated before producing output (value of field will be
44     interpolated before that)
45    
46     =item *
47    
48     optional C<filter{filter_name}> at B<begining of format> will apply perl
49     code defined as code ref on format after field substitution to producing
50     output
51    
52 dpavlin 260 There is one built-in filter called C<regex> which can be use like this:
53    
54     filter{regex(s/foo/bar/)}
55    
56 dpavlin 15 =item *
57    
58     optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
59    
60     =item *
61    
62     at end, optional C<format>s rules are resolved. Format rules are similar to
63     C<sprintf> and can also contain C<lookup{...}> which is performed after
64     values are inserted in format.
65    
66     =back
67    
68     This also describes order in which transformations are applied (eval,
69     filter, lookup, format) which is important to undestand when deciding how to
70     solve your data mungling and normalisation process.
71    
72    
73    
74    
75 dpavlin 10 =head1 FUNCTIONS
76    
77 dpavlin 13 =head2 new
78 dpavlin 10
79 dpavlin 13 Create new normalisation object
80    
81     my $n = new WebPAC::Normalize::Something(
82 dpavlin 15 filter => {
83     'filter_name_1' => sub {
84     # filter code
85     return length($_);
86     }, ...
87     },
88 dpavlin 29 db => $db_obj,
89 dpavlin 13 lookup_regex => $lookup->regex,
90 dpavlin 31 lookup => $lookup_obj,
91 dpavlin 219 prefix => 'foobar',
92 dpavlin 13 );
93    
94 dpavlin 15 Parametar C<filter> defines user supplied snippets of perl code which can
95     be use with C<filter{...}> notation.
96    
97 dpavlin 219 C<prefix> is used to form filename for database record (to support multiple
98     source files which are joined in one database).
99    
100 dpavlin 13 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
101 dpavlin 31 in structures. If you pass this parametar, you must also pass C<lookup>
102     which is C<WebPAC::Lookup> object.
103 dpavlin 13
104 dpavlin 10 =cut
105    
106 dpavlin 13 sub new {
107     my $class = shift;
108     my $self = {@_};
109     bless($self, $class);
110    
111 dpavlin 31 my $r = $self->{'lookup_regex'} ? 1 : 0;
112     my $l = $self->{'lookup'} ? 1 : 0;
113    
114     my $log = $self->_get_logger();
115    
116     # those two must be in pair
117     if ( ($r & $l) != ($r || $l) ) {
118     my $log = $self->_get_logger();
119     $log->logdie("lookup_regex and lookup must be in pair");
120     }
121    
122     $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
123    
124 dpavlin 219 $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
125    
126 dpavlin 252 $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
127    
128 dpavlin 295 if (! $self->{filter} || ! $self->{filter}->{regex}) {
129 dpavlin 260 $log->debug("adding built-in filter regex");
130     $self->{filter}->{regex} = sub {
131     my ($val, $regex) = @_;
132     eval "\$val =~ $regex";
133     return $val;
134     };
135     }
136    
137 dpavlin 13 $self ? return $self : return undef;
138 dpavlin 10 }
139    
140 dpavlin 433 =head2 all_tags
141 dpavlin 13
142 dpavlin 433 Returns all tags in document in specified order
143    
144     my $sorted_tags = $self->all_tags();
145    
146     =cut
147    
148     sub all_tags {
149     my $self = shift;
150    
151     if (! $self->{_tags_by_order}) {
152    
153     my $log = $self->_get_logger;
154     # sanity check
155     $log->logdie("can't find self->{inport_xml}->{indexer}") unless ($self->{import_xml}->{indexer});
156    
157     my @tags = keys %{ $self->{'import_xml'}->{'indexer'}};
158     $log->debug("unsorted tags: " . join(", ", @tags));
159    
160     @tags = sort { $self->_sort_by_order } @tags;
161    
162     $log->debug("sorted tags: " . join(",", @tags) );
163    
164     $self->{_tags_by_order} = \@tags;
165     }
166    
167     return $self->{_tags_by_order};
168     }
169    
170    
171    
172 dpavlin 13 =head2 data_structure
173    
174     Create in-memory data structure which represents normalized layout from
175     C<conf/normalize/*.xml>.
176    
177     This structures are used to produce output.
178    
179 dpavlin 70 my $ds = $webpac->data_structure($rec);
180 dpavlin 13
181     =cut
182    
183     sub data_structure {
184     my $self = shift;
185    
186     my $log = $self->_get_logger();
187    
188     my $rec = shift;
189     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
190    
191 dpavlin 125 $log->debug("data_structure rec = ", sub { Dumper($rec) });
192    
193 dpavlin 312 $log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));
194 dpavlin 125
195 dpavlin 219 my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
196 dpavlin 125
197 dpavlin 13 my $cache_file;
198    
199 dpavlin 18 if ($self->{'db'}) {
200 dpavlin 219 my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
201 dpavlin 70 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
202     return $ds if ($ds);
203 dpavlin 29 $log->debug("cache miss, creating");
204 dpavlin 13 }
205    
206 dpavlin 433 my $tags = $self->all_tags();
207 dpavlin 13
208 dpavlin 433 $log->debug("tags: ",sub { join(", ",@{ $tags }) });
209    
210 dpavlin 70 my $ds;
211 dpavlin 13
212 dpavlin 433 foreach my $field (@{ $tags }) {
213 dpavlin 13
214     my $row;
215    
216     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
217    
218     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
219 dpavlin 38 my $format;
220 dpavlin 13
221 dpavlin 38 $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
222     $format = $tag->{'value'} || $tag->{'content'};
223    
224 dpavlin 13 my @v;
225     if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
226 dpavlin 372 @v = $self->_rec_to_arr($rec,$format,'fill_in');
227 dpavlin 13 } else {
228 dpavlin 372 @v = $self->_rec_to_arr($rec,$format,'parse');
229 dpavlin 13 }
230 dpavlin 364 if (! @v) {
231     $log->debug("$field <",$self->{tag},"> format: $format no values");
232 dpavlin 373 next;
233 dpavlin 364 } else {
234     $log->debug("$field <",$self->{tag},"> format: $format values: ", join(",", @v));
235     }
236 dpavlin 13
237     if ($tag->{'sort'}) {
238     @v = $self->sort_arr(@v);
239     }
240    
241     # use format?
242     if ($tag->{'format_name'}) {
243     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
244     }
245    
246     # delimiter will join repeatable fields
247     if ($tag->{'delimiter'}) {
248     @v = ( join($tag->{'delimiter'}, @v) );
249     }
250    
251     # default types
252 dpavlin 74 my @types = qw(display search);
253 dpavlin 13 # override by type attribute
254     @types = ( $tag->{'type'} ) if ($tag->{'type'});
255    
256     foreach my $type (@types) {
257     # append to previous line?
258 dpavlin 364 $log->debug("tag $field / $type [",sub { join(",",@v) }, "] ", $row->{'append'} || 'no append');
259 dpavlin 13 if ($tag->{'append'}) {
260    
261     # I will delimit appended part with
262     # delimiter (or ,)
263     my $d = $tag->{'delimiter'};
264     # default delimiter
265     $d ||= " ";
266    
267     my $last = pop @{$row->{$type}};
268     $d = "" if (! $last);
269     $last .= $d . join($d, @v);
270     push @{$row->{$type}}, $last;
271    
272     } else {
273     push @{$row->{$type}}, @v;
274     }
275     }
276    
277    
278     }
279    
280     if ($row) {
281     $row->{'tag'} = $field;
282    
283     # TODO: name_sigular, name_plural
284     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
285 dpavlin 70 my $row_name = $name ? $self->_x($name) : $field;
286 dpavlin 13
287     # post-sort all values in field
288     if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
289     $log->warn("sort at field tag not implemented");
290     }
291    
292 dpavlin 70 $ds->{$row_name} = $row;
293 dpavlin 13
294     $log->debug("row $field: ",sub { Dumper($row) });
295     }
296    
297     }
298    
299 dpavlin 22 $self->{'db'}->save_ds(
300 dpavlin 219 id => $id,
301 dpavlin 70 ds => $ds,
302 dpavlin 219 prefix => $self->{prefix},
303 dpavlin 18 ) if ($self->{'db'});
304 dpavlin 13
305 dpavlin 70 $log->debug("ds: ", sub { Dumper($ds) });
306 dpavlin 29
307 dpavlin 70 $log->logconfess("data structure returned is not array any more!") if wantarray;
308 dpavlin 13
309 dpavlin 70 return $ds;
310    
311 dpavlin 13 }
312    
313     =head2 parse
314    
315     Perform smart parsing of string, skipping delimiters for fields which aren't
316     defined. It can also eval code in format starting with C<eval{...}> and
317     return output or nothing depending on eval code.
318    
319     my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
320    
321 dpavlin 260 Filters are implemented here. While simple form of filters looks like this:
322    
323     filter{name_of_filter}
324    
325     but, filters can also have variable number of parametars like this:
326    
327     filter{name_of_filter(param,param,param)}
328    
329 dpavlin 13 =cut
330    
331 dpavlin 261 my $warn_once;
332    
333 dpavlin 13 sub parse {
334     my $self = shift;
335    
336 dpavlin 371 my ($rec, $format_utf8, $i, $rec_size) = @_;
337 dpavlin 13
338     return if (! $format_utf8);
339    
340     my $log = $self->_get_logger();
341    
342     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
343    
344     $i = 0 if (! $i);
345    
346     my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
347    
348     my @out;
349    
350 dpavlin 340 $log->debug("format: $format [$i]");
351 dpavlin 13
352     my $eval_code;
353     # remove eval{...} from beginning
354     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
355    
356     my $filter_name;
357     # remove filter{...} from beginning
358     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
359    
360 dpavlin 317 # did we found any (att all) field from format in row?
361 dpavlin 340 my $found_any;
362 dpavlin 317 # prefix before first field which we preserve it $found_any
363 dpavlin 13 my $prefix;
364    
365 dpavlin 317 my $f_step = 1;
366    
367 dpavlin 13 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
368    
369     my $del = $1 || '';
370 dpavlin 317 $prefix = $del if ($f_step == 1);
371 dpavlin 13
372 dpavlin 340 my $fld_type = lc($2);
373    
374 dpavlin 13 # repeatable index
375     my $r = $i;
376 dpavlin 340 if ($fld_type eq 's') {
377     if ($found_any->{'v'}) {
378     $r = 0;
379     } else {
380     return;
381     }
382     }
383 dpavlin 13
384     my $found = 0;
385 dpavlin 371 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found,$rec_size);
386 dpavlin 13
387     if ($found) {
388 dpavlin 340 $found_any->{$fld_type} += $found;
389 dpavlin 317
390     # we will skip delimiter before first occurence of field!
391 dpavlin 344 push @out, $del unless($found_any->{$fld_type} == 1);
392 dpavlin 13 push @out, $tmp;
393     }
394 dpavlin 317 $f_step++;
395 dpavlin 13 }
396    
397 dpavlin 340 # test if any fields found?
398     return if (! $found_any->{'v'} && ! $found_any->{'s'});
399 dpavlin 13
400     my $out = join('',@out);
401    
402     if ($out) {
403     # add rest of format (suffix)
404     $out .= $format;
405    
406     # add prefix if not there
407     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
408    
409     $log->debug("result: $out");
410     }
411    
412     if ($eval_code) {
413     my $eval = $self->fill_in($rec,$eval_code,$i) || return;
414     $log->debug("about to eval{$eval} format: $out");
415     return if (! $self->_eval($eval));
416     }
417    
418 dpavlin 260 if ($filter_name) {
419     my @filter_args;
420     if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
421     @filter_args = split(/,/, $2);
422     }
423     if ($self->{'filter'}->{$filter_name}) {
424     $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
425     unshift @filter_args, $out;
426     $out = $self->{'filter'}->{$filter_name}->(@filter_args);
427     return unless(defined($out));
428     $log->debug("filter result: $out");
429 dpavlin 261 } elsif (! $warn_once->{$filter_name}) {
430 dpavlin 260 $log->warn("trying to use undefined filter $filter_name");
431 dpavlin 261 $warn_once->{$filter_name}++;
432 dpavlin 260 }
433 dpavlin 13 }
434    
435     return $out;
436     }
437    
438 dpavlin 15 =head2 fill_in
439    
440     Workhourse of all: takes record from in-memory structure of database and
441     strings with placeholders and returns string or array of with substituted
442     values from record.
443    
444     my $text = $webpac->fill_in($rec,'v250^a');
445    
446     Optional argument is ordinal number for repeatable fields. By default,
447     it's assume to be first repeatable field (fields are perl array, so first
448     element is 0).
449     Following example will read second value from repeatable field.
450    
451     my $text = $webpac->fill_in($rec,'Title: v250^a',1);
452    
453     This function B<does not> perform parsing of format to inteligenty skip
454     delimiters before fields which aren't used.
455    
456     This method will automatically decode UTF-8 string to local code page
457     if needed.
458    
459 dpavlin 371 There is optional parametar C<$record_size> which can be used to get sizes of
460     all C<field^subfield> combinations in this format.
461    
462     my $text = $webpac->fill_in($rec,'got: v900^a v900^x',0,\$rec_size);
463    
464 dpavlin 15 =cut
465    
466     sub fill_in {
467     my $self = shift;
468    
469     my $log = $self->_get_logger();
470    
471 dpavlin 371 my ($rec,$format,$i,$rec_size) = @_;
472    
473     $log->logconfess("need data record") unless ($rec);
474     $log->logconfess("need format to parse") unless($format);
475    
476 dpavlin 15 # iteration (for repeatable fields)
477 dpavlin 371 $i ||= 0;
478 dpavlin 15
479     $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
480    
481     # FIXME remove for speedup?
482     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
483    
484     if (utf8::is_utf8($format)) {
485     $format = $self->_x($format);
486     }
487    
488     my $found = 0;
489 dpavlin 333 my $just_single = 1;
490 dpavlin 15
491     my $eval_code;
492     # remove eval{...} from beginning
493     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
494    
495     my $filter_name;
496     # remove filter{...} from beginning
497     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
498    
499 dpavlin 397 {
500     # fix warnings
501     no warnings 'uninitialized';
502 dpavlin 364
503 dpavlin 397 # do actual replacement of placeholders
504     # repeatable fields
505     if ($format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found,$rec_size)/ges) {
506     $just_single = 0;
507     }
508    
509     # non-repeatable fields
510     if ($format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found,$rec_size)/ges) {
511     return if ($i > 0 && $just_single);
512     }
513 dpavlin 333 }
514 dpavlin 15
515     if ($found) {
516     $log->debug("format: $format");
517     if ($eval_code) {
518     my $eval = $self->fill_in($rec,$eval_code,$i);
519     return if (! $self->_eval($eval));
520     }
521     if ($filter_name && $self->{'filter'}->{$filter_name}) {
522     $log->debug("filter '$filter_name' for $format");
523     $format = $self->{'filter'}->{$filter_name}->($format);
524     return unless(defined($format));
525     $log->debug("filter result: $format");
526     }
527     # do we have lookups?
528     if ($self->{'lookup'}) {
529 dpavlin 31 if ($self->{'lookup'}->can('lookup')) {
530 dpavlin 252 my @lookup = $self->{lookup}->lookup($format);
531 dpavlin 253 $log->debug("lookup $format", join(", ", @lookup));
532 dpavlin 252 return @lookup;
533 dpavlin 31 } else {
534     $log->warn("Have lookup object but can't invoke lookup method");
535     }
536 dpavlin 15 } else {
537     return $format;
538     }
539     } else {
540     return;
541     }
542     }
543    
544    
545 dpavlin 372 =head2 _rec_to_arr
546 dpavlin 13
547 dpavlin 372 Similar to C<parse> and C<fill_in>, but returns array of all repeatable fields. Usable
548 dpavlin 13 for fields which have lookups, so they shouldn't be parsed but rather
549 dpavlin 372 C<paste>d or C<fill_id>ed. Last argument is name of operation: C<paste> or C<fill_in>.
550 dpavlin 13
551 dpavlin 372 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]','paste');
552 dpavlin 13
553     =cut
554    
555 dpavlin 372 sub _rec_to_arr {
556 dpavlin 13 my $self = shift;
557    
558 dpavlin 372 my ($rec, $format_utf8, $code) = @_;
559 dpavlin 13
560     my $log = $self->_get_logger();
561    
562     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
563     return if (! $format_utf8);
564    
565 dpavlin 373 $log->debug("using $code on $format_utf8");
566    
567 dpavlin 13 my $i = 0;
568 dpavlin 372 my $max = 0;
569 dpavlin 13 my @arr;
570 dpavlin 372 my $rec_size = {};
571 dpavlin 13
572 dpavlin 372 while ($i <= $max) {
573 dpavlin 373 my @v = $self->$code($rec,$format_utf8,$i++,\$rec_size);
574 dpavlin 372 if ($rec_size) {
575     foreach my $f (keys %{ $rec_size }) {
576     $max = $rec_size->{$f} if ($rec_size->{$f} > $max);
577     }
578 dpavlin 373 $log->debug("max set to $max");
579 dpavlin 372 undef $rec_size;
580     }
581 dpavlin 373 if (@v) {
582     push @arr, @v;
583     } else {
584 dpavlin 375 push @arr, '' if ($max > $i);
585 dpavlin 373 }
586 dpavlin 13 }
587    
588     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
589    
590     return @arr;
591     }
592    
593 dpavlin 15
594     =head2 get_data
595    
596     Returns value from record.
597    
598 dpavlin 371 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found,\$rec_size);
599 dpavlin 15
600 dpavlin 368 Required arguments are:
601 dpavlin 15
602 dpavlin 368 =over 8
603 dpavlin 15
604 dpavlin 368 =item C<$rec>
605 dpavlin 15
606 dpavlin 368 record reference
607    
608     =item C<$f>
609    
610     field
611    
612     =item C<$sf>
613    
614     optional subfield
615    
616     =item C<$i>
617    
618 dpavlin 371 index offset for repeatable values ( 0 ... $rec_size->{'400^a'} )
619 dpavlin 368
620     =item C<$found>
621    
622     optional variable that will be incremeted if preset
623    
624 dpavlin 371 =item C<$rec_size>
625 dpavlin 368
626 dpavlin 371 hash to hold maximum occurances of C<field^subfield> combinations
627 dpavlin 368 (which can be accessed using keys in same format)
628    
629     =back
630    
631 dpavlin 371 Returns value or empty string, updates C<$found> and C<rec_size>
632 dpavlin 368 if present.
633    
634 dpavlin 15 =cut
635    
636     sub get_data {
637     my $self = shift;
638    
639 dpavlin 368 my ($rec,$f,$sf,$i,$found,$cache) = @_;
640 dpavlin 15
641 dpavlin 368 return '' unless ($$rec->{$f} && ref($$rec->{$f}) eq 'ARRAY');
642    
643     if (defined($$cache)) {
644 dpavlin 371 $$cache->{ $f . ( $sf ? '^' . $sf : '' ) } ||= scalar @{ $$rec->{$f} };
645 dpavlin 368 }
646    
647     return '' unless ($$rec->{$f}->[$i]);
648    
649     {
650 dpavlin 15 no strict 'refs';
651 dpavlin 368 if (defined($sf)) {
652     $$found++ if (defined($$found) && $$rec->{$f}->[$i]->{$sf});
653 dpavlin 15 return $$rec->{$f}->[$i]->{$sf};
654 dpavlin 368 } else {
655 dpavlin 15 $$found++ if (defined($$found));
656 dpavlin 368 # it still might have subfields, just
657     # not specified, so we'll dump some debug info
658 dpavlin 15 if ($$rec->{$f}->[$i] =~ /HASH/o) {
659     my $out;
660     foreach my $k (keys %{$$rec->{$f}->[$i]}) {
661 dpavlin 368 $out .= '$' . $k .':' . $$rec->{$f}->[$i]->{$k}." ";
662 dpavlin 15 }
663     return $out;
664     } else {
665     return $$rec->{$f}->[$i];
666     }
667     }
668     }
669     }
670    
671    
672     =head2 apply_format
673    
674     Apply format specified in tag with C<format_name="name"> and
675     C<format_delimiter=";;">.
676    
677     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
678    
679     Formats can contain C<lookup{...}> if you need them.
680    
681     =cut
682    
683     sub apply_format {
684     my $self = shift;
685    
686     my ($name,$delimiter,$data) = @_;
687    
688     my $log = $self->_get_logger();
689    
690     if (! $self->{'import_xml'}->{'format'}->{$name}) {
691     $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
692     return $data;
693     }
694    
695     $log->warn("no delimiter for format $name") if (! $delimiter);
696    
697     my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
698    
699     my @data = split(/\Q$delimiter\E/, $data);
700    
701     my $out = sprintf($format, @data);
702     $log->debug("using format $name [$format] on $data to produce: $out");
703    
704     if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
705 dpavlin 31 return $self->{'lookup'}->lookup($out);
706 dpavlin 15 } else {
707     return $out;
708     }
709    
710     }
711    
712 dpavlin 13 =head2 sort_arr
713    
714     Sort array ignoring case and html in data
715    
716     my @sorted = $webpac->sort_arr(@unsorted);
717    
718     =cut
719    
720     sub sort_arr {
721     my $self = shift;
722    
723     my $log = $self->_get_logger();
724    
725     # FIXME add Schwartzian Transformation?
726    
727     my @sorted = sort {
728     $a =~ s#<[^>]+/*>##;
729     $b =~ s#<[^>]+/*>##;
730     lc($b) cmp lc($a)
731     } @_;
732     $log->debug("sorted values: ",sub { join(", ",@sorted) });
733    
734     return @sorted;
735     }
736    
737    
738 dpavlin 15 =head1 INTERNAL METHODS
739    
740 dpavlin 13 =head2 _sort_by_order
741    
742     Sort xml tags data structure accoding to C<order=""> attribute.
743    
744     =cut
745    
746     sub _sort_by_order {
747     my $self = shift;
748    
749     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
750     $self->{'import_xml'}->{'indexer'}->{$a};
751     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
752     $self->{'import_xml'}->{'indexer'}->{$b};
753    
754     return $va <=> $vb;
755     }
756    
757     =head2 _x
758    
759 dpavlin 15 Convert strings from C<conf/normalize/*.xml> encoding into application
760     specific encoding (optinally specified using C<code_page> to C<new>
761     constructor).
762 dpavlin 13
763     my $text = $n->_x('normalize text string');
764    
765     This is a stub so that other modules doesn't have to implement it.
766    
767     =cut
768    
769     sub _x {
770     my $self = shift;
771     return shift;
772     }
773    
774    
775 dpavlin 10 =head1 AUTHOR
776    
777     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
778    
779     =head1 COPYRIGHT & LICENSE
780    
781     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
782    
783     This program is free software; you can redistribute it and/or modify it
784     under the same terms as Perl itself.
785    
786     =cut
787    
788 dpavlin 209 1; # End of WebPAC::Normalize

  ViewVC Help
Powered by ViewVC 1.1.26