/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 375 - (hide annotations)
Sun Jan 8 22:21:24 2006 UTC (17 years ago) by dpavlin
File size: 17693 byte(s)
 r417@llin:  dpavlin | 2006-01-08 23:21:35 +0100
 fixed another corner-case

1 dpavlin 10 package WebPAC::Normalize;
2    
3     use warnings;
4     use strict;
5 dpavlin 368 use blib;
6     use WebPAC::Common;
7 dpavlin 29 use base 'WebPAC::Common';
8 dpavlin 13 use Data::Dumper;
9 dpavlin 10
10     =head1 NAME
11    
12 dpavlin 15 WebPAC::Normalize - data mungling for normalisation
13 dpavlin 10
14     =head1 VERSION
15    
16 dpavlin 317 Version 0.08
17 dpavlin 10
18     =cut
19    
20 dpavlin 317 our $VERSION = '0.08';
21 dpavlin 10
22     =head1 SYNOPSIS
23    
24 dpavlin 15 This package contains code that mungle data to produce normalized format.
25 dpavlin 10
26 dpavlin 15 It contains several assumptions:
27    
28     =over
29    
30     =item *
31    
32     format of fields is defined using C<v123^a> notation for repeatable fields
33     or C<s123^a> for single (or first) value, where C<123> is field number and
34     C<a> is subfield.
35    
36     =item *
37    
38     source data records (C<$rec>) have unique identifiers in field C<000>
39    
40     =item *
41    
42     optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
43     perl code that is evaluated before producing output (value of field will be
44     interpolated before that)
45    
46     =item *
47    
48     optional C<filter{filter_name}> at B<begining of format> will apply perl
49     code defined as code ref on format after field substitution to producing
50     output
51    
52 dpavlin 260 There is one built-in filter called C<regex> which can be use like this:
53    
54     filter{regex(s/foo/bar/)}
55    
56 dpavlin 15 =item *
57    
58     optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
59    
60     =item *
61    
62     at end, optional C<format>s rules are resolved. Format rules are similar to
63     C<sprintf> and can also contain C<lookup{...}> which is performed after
64     values are inserted in format.
65    
66     =back
67    
68     This also describes order in which transformations are applied (eval,
69     filter, lookup, format) which is important to undestand when deciding how to
70     solve your data mungling and normalisation process.
71    
72    
73    
74    
75 dpavlin 10 =head1 FUNCTIONS
76    
77 dpavlin 13 =head2 new
78 dpavlin 10
79 dpavlin 13 Create new normalisation object
80    
81     my $n = new WebPAC::Normalize::Something(
82 dpavlin 15 filter => {
83     'filter_name_1' => sub {
84     # filter code
85     return length($_);
86     }, ...
87     },
88 dpavlin 29 db => $db_obj,
89 dpavlin 13 lookup_regex => $lookup->regex,
90 dpavlin 31 lookup => $lookup_obj,
91 dpavlin 219 prefix => 'foobar',
92 dpavlin 13 );
93    
94 dpavlin 15 Parametar C<filter> defines user supplied snippets of perl code which can
95     be use with C<filter{...}> notation.
96    
97 dpavlin 219 C<prefix> is used to form filename for database record (to support multiple
98     source files which are joined in one database).
99    
100 dpavlin 13 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
101 dpavlin 31 in structures. If you pass this parametar, you must also pass C<lookup>
102     which is C<WebPAC::Lookup> object.
103 dpavlin 13
104 dpavlin 10 =cut
105    
106 dpavlin 13 sub new {
107     my $class = shift;
108     my $self = {@_};
109     bless($self, $class);
110    
111 dpavlin 31 my $r = $self->{'lookup_regex'} ? 1 : 0;
112     my $l = $self->{'lookup'} ? 1 : 0;
113    
114     my $log = $self->_get_logger();
115    
116     # those two must be in pair
117     if ( ($r & $l) != ($r || $l) ) {
118     my $log = $self->_get_logger();
119     $log->logdie("lookup_regex and lookup must be in pair");
120     }
121    
122     $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
123    
124 dpavlin 219 $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
125    
126 dpavlin 252 $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
127    
128 dpavlin 295 if (! $self->{filter} || ! $self->{filter}->{regex}) {
129 dpavlin 260 $log->debug("adding built-in filter regex");
130     $self->{filter}->{regex} = sub {
131     my ($val, $regex) = @_;
132     eval "\$val =~ $regex";
133     return $val;
134     };
135     }
136    
137 dpavlin 13 $self ? return $self : return undef;
138 dpavlin 10 }
139    
140 dpavlin 13
141     =head2 data_structure
142    
143     Create in-memory data structure which represents normalized layout from
144     C<conf/normalize/*.xml>.
145    
146     This structures are used to produce output.
147    
148 dpavlin 70 my $ds = $webpac->data_structure($rec);
149 dpavlin 13
150     =cut
151    
152     sub data_structure {
153     my $self = shift;
154    
155     my $log = $self->_get_logger();
156    
157     my $rec = shift;
158     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
159    
160 dpavlin 125 $log->debug("data_structure rec = ", sub { Dumper($rec) });
161    
162 dpavlin 312 $log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));
163 dpavlin 125
164 dpavlin 219 my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
165 dpavlin 125
166 dpavlin 13 my $cache_file;
167    
168 dpavlin 18 if ($self->{'db'}) {
169 dpavlin 219 my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
170 dpavlin 70 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
171     return $ds if ($ds);
172 dpavlin 29 $log->debug("cache miss, creating");
173 dpavlin 13 }
174    
175     my @sorted_tags;
176     if ($self->{tags_by_order}) {
177     @sorted_tags = @{$self->{tags_by_order}};
178     } else {
179     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
180     $self->{tags_by_order} = \@sorted_tags;
181     }
182    
183 dpavlin 70 my $ds;
184 dpavlin 13
185     $log->debug("tags: ",sub { join(", ",@sorted_tags) });
186    
187     foreach my $field (@sorted_tags) {
188    
189     my $row;
190    
191     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
192    
193     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
194 dpavlin 38 my $format;
195 dpavlin 13
196 dpavlin 38 $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
197     $format = $tag->{'value'} || $tag->{'content'};
198    
199 dpavlin 13 my @v;
200     if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
201 dpavlin 372 @v = $self->_rec_to_arr($rec,$format,'fill_in');
202 dpavlin 13 } else {
203 dpavlin 372 @v = $self->_rec_to_arr($rec,$format,'parse');
204 dpavlin 13 }
205 dpavlin 364 if (! @v) {
206     $log->debug("$field <",$self->{tag},"> format: $format no values");
207 dpavlin 373 next;
208 dpavlin 364 } else {
209     $log->debug("$field <",$self->{tag},"> format: $format values: ", join(",", @v));
210     }
211 dpavlin 13
212     if ($tag->{'sort'}) {
213     @v = $self->sort_arr(@v);
214     }
215    
216     # use format?
217     if ($tag->{'format_name'}) {
218     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
219     }
220    
221     # delimiter will join repeatable fields
222     if ($tag->{'delimiter'}) {
223     @v = ( join($tag->{'delimiter'}, @v) );
224     }
225    
226     # default types
227 dpavlin 74 my @types = qw(display search);
228 dpavlin 13 # override by type attribute
229     @types = ( $tag->{'type'} ) if ($tag->{'type'});
230    
231     foreach my $type (@types) {
232     # append to previous line?
233 dpavlin 364 $log->debug("tag $field / $type [",sub { join(",",@v) }, "] ", $row->{'append'} || 'no append');
234 dpavlin 13 if ($tag->{'append'}) {
235    
236     # I will delimit appended part with
237     # delimiter (or ,)
238     my $d = $tag->{'delimiter'};
239     # default delimiter
240     $d ||= " ";
241    
242     my $last = pop @{$row->{$type}};
243     $d = "" if (! $last);
244     $last .= $d . join($d, @v);
245     push @{$row->{$type}}, $last;
246    
247     } else {
248     push @{$row->{$type}}, @v;
249     }
250     }
251    
252    
253     }
254    
255     if ($row) {
256     $row->{'tag'} = $field;
257    
258     # TODO: name_sigular, name_plural
259     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
260 dpavlin 70 my $row_name = $name ? $self->_x($name) : $field;
261 dpavlin 13
262     # post-sort all values in field
263     if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
264     $log->warn("sort at field tag not implemented");
265     }
266    
267 dpavlin 70 $ds->{$row_name} = $row;
268 dpavlin 13
269     $log->debug("row $field: ",sub { Dumper($row) });
270     }
271    
272     }
273    
274 dpavlin 22 $self->{'db'}->save_ds(
275 dpavlin 219 id => $id,
276 dpavlin 70 ds => $ds,
277 dpavlin 219 prefix => $self->{prefix},
278 dpavlin 18 ) if ($self->{'db'});
279 dpavlin 13
280 dpavlin 70 $log->debug("ds: ", sub { Dumper($ds) });
281 dpavlin 29
282 dpavlin 70 $log->logconfess("data structure returned is not array any more!") if wantarray;
283 dpavlin 13
284 dpavlin 70 return $ds;
285    
286 dpavlin 13 }
287    
288     =head2 parse
289    
290     Perform smart parsing of string, skipping delimiters for fields which aren't
291     defined. It can also eval code in format starting with C<eval{...}> and
292     return output or nothing depending on eval code.
293    
294     my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
295    
296 dpavlin 260 Filters are implemented here. While simple form of filters looks like this:
297    
298     filter{name_of_filter}
299    
300     but, filters can also have variable number of parametars like this:
301    
302     filter{name_of_filter(param,param,param)}
303    
304 dpavlin 13 =cut
305    
306 dpavlin 261 my $warn_once;
307    
308 dpavlin 13 sub parse {
309     my $self = shift;
310    
311 dpavlin 371 my ($rec, $format_utf8, $i, $rec_size) = @_;
312 dpavlin 13
313     return if (! $format_utf8);
314    
315     my $log = $self->_get_logger();
316    
317     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
318    
319     $i = 0 if (! $i);
320    
321     my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
322    
323     my @out;
324    
325 dpavlin 340 $log->debug("format: $format [$i]");
326 dpavlin 13
327     my $eval_code;
328     # remove eval{...} from beginning
329     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
330    
331     my $filter_name;
332     # remove filter{...} from beginning
333     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
334    
335 dpavlin 317 # did we found any (att all) field from format in row?
336 dpavlin 340 my $found_any;
337 dpavlin 317 # prefix before first field which we preserve it $found_any
338 dpavlin 13 my $prefix;
339    
340 dpavlin 317 my $f_step = 1;
341    
342 dpavlin 13 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
343    
344     my $del = $1 || '';
345 dpavlin 317 $prefix = $del if ($f_step == 1);
346 dpavlin 13
347 dpavlin 340 my $fld_type = lc($2);
348    
349 dpavlin 13 # repeatable index
350     my $r = $i;
351 dpavlin 340 if ($fld_type eq 's') {
352     if ($found_any->{'v'}) {
353     $r = 0;
354     } else {
355     return;
356     }
357     }
358 dpavlin 13
359     my $found = 0;
360 dpavlin 371 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found,$rec_size);
361 dpavlin 13
362     if ($found) {
363 dpavlin 340 $found_any->{$fld_type} += $found;
364 dpavlin 317
365     # we will skip delimiter before first occurence of field!
366 dpavlin 344 push @out, $del unless($found_any->{$fld_type} == 1);
367 dpavlin 13 push @out, $tmp;
368     }
369 dpavlin 317 $f_step++;
370 dpavlin 13 }
371    
372 dpavlin 340 # test if any fields found?
373     return if (! $found_any->{'v'} && ! $found_any->{'s'});
374 dpavlin 13
375     my $out = join('',@out);
376    
377     if ($out) {
378     # add rest of format (suffix)
379     $out .= $format;
380    
381     # add prefix if not there
382     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
383    
384     $log->debug("result: $out");
385     }
386    
387     if ($eval_code) {
388     my $eval = $self->fill_in($rec,$eval_code,$i) || return;
389     $log->debug("about to eval{$eval} format: $out");
390     return if (! $self->_eval($eval));
391     }
392    
393 dpavlin 260 if ($filter_name) {
394     my @filter_args;
395     if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
396     @filter_args = split(/,/, $2);
397     }
398     if ($self->{'filter'}->{$filter_name}) {
399     $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
400     unshift @filter_args, $out;
401     $out = $self->{'filter'}->{$filter_name}->(@filter_args);
402     return unless(defined($out));
403     $log->debug("filter result: $out");
404 dpavlin 261 } elsif (! $warn_once->{$filter_name}) {
405 dpavlin 260 $log->warn("trying to use undefined filter $filter_name");
406 dpavlin 261 $warn_once->{$filter_name}++;
407 dpavlin 260 }
408 dpavlin 13 }
409    
410     return $out;
411     }
412    
413 dpavlin 15 =head2 fill_in
414    
415     Workhourse of all: takes record from in-memory structure of database and
416     strings with placeholders and returns string or array of with substituted
417     values from record.
418    
419     my $text = $webpac->fill_in($rec,'v250^a');
420    
421     Optional argument is ordinal number for repeatable fields. By default,
422     it's assume to be first repeatable field (fields are perl array, so first
423     element is 0).
424     Following example will read second value from repeatable field.
425    
426     my $text = $webpac->fill_in($rec,'Title: v250^a',1);
427    
428     This function B<does not> perform parsing of format to inteligenty skip
429     delimiters before fields which aren't used.
430    
431     This method will automatically decode UTF-8 string to local code page
432     if needed.
433    
434 dpavlin 371 There is optional parametar C<$record_size> which can be used to get sizes of
435     all C<field^subfield> combinations in this format.
436    
437     my $text = $webpac->fill_in($rec,'got: v900^a v900^x',0,\$rec_size);
438    
439 dpavlin 15 =cut
440    
441     sub fill_in {
442     my $self = shift;
443    
444     my $log = $self->_get_logger();
445    
446 dpavlin 371 my ($rec,$format,$i,$rec_size) = @_;
447    
448     $log->logconfess("need data record") unless ($rec);
449     $log->logconfess("need format to parse") unless($format);
450    
451 dpavlin 15 # iteration (for repeatable fields)
452 dpavlin 371 $i ||= 0;
453 dpavlin 15
454     $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
455    
456     # FIXME remove for speedup?
457     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
458    
459     if (utf8::is_utf8($format)) {
460     $format = $self->_x($format);
461     }
462    
463     my $found = 0;
464 dpavlin 333 my $just_single = 1;
465 dpavlin 15
466     my $eval_code;
467     # remove eval{...} from beginning
468     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
469    
470     my $filter_name;
471     # remove filter{...} from beginning
472     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
473    
474     # do actual replacement of placeholders
475     # repeatable fields
476 dpavlin 371 if ($format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found,$rec_size)/ges) {
477 dpavlin 333 $just_single = 0;
478     }
479 dpavlin 364
480 dpavlin 15 # non-repeatable fields
481 dpavlin 371 if ($format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found,$rec_size)/ges) {
482 dpavlin 333 return if ($i > 0 && $just_single);
483     }
484 dpavlin 15
485     if ($found) {
486     $log->debug("format: $format");
487     if ($eval_code) {
488     my $eval = $self->fill_in($rec,$eval_code,$i);
489     return if (! $self->_eval($eval));
490     }
491     if ($filter_name && $self->{'filter'}->{$filter_name}) {
492     $log->debug("filter '$filter_name' for $format");
493     $format = $self->{'filter'}->{$filter_name}->($format);
494     return unless(defined($format));
495     $log->debug("filter result: $format");
496     }
497     # do we have lookups?
498     if ($self->{'lookup'}) {
499 dpavlin 31 if ($self->{'lookup'}->can('lookup')) {
500 dpavlin 252 my @lookup = $self->{lookup}->lookup($format);
501 dpavlin 253 $log->debug("lookup $format", join(", ", @lookup));
502 dpavlin 252 return @lookup;
503 dpavlin 31 } else {
504     $log->warn("Have lookup object but can't invoke lookup method");
505     }
506 dpavlin 15 } else {
507     return $format;
508     }
509     } else {
510     return;
511     }
512     }
513    
514    
515 dpavlin 372 =head2 _rec_to_arr
516 dpavlin 13
517 dpavlin 372 Similar to C<parse> and C<fill_in>, but returns array of all repeatable fields. Usable
518 dpavlin 13 for fields which have lookups, so they shouldn't be parsed but rather
519 dpavlin 372 C<paste>d or C<fill_id>ed. Last argument is name of operation: C<paste> or C<fill_in>.
520 dpavlin 13
521 dpavlin 372 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]','paste');
522 dpavlin 13
523     =cut
524    
525 dpavlin 372 sub _rec_to_arr {
526 dpavlin 13 my $self = shift;
527    
528 dpavlin 372 my ($rec, $format_utf8, $code) = @_;
529 dpavlin 13
530     my $log = $self->_get_logger();
531    
532     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
533     return if (! $format_utf8);
534    
535 dpavlin 373 $log->debug("using $code on $format_utf8");
536    
537 dpavlin 13 my $i = 0;
538 dpavlin 372 my $max = 0;
539 dpavlin 13 my @arr;
540 dpavlin 372 my $rec_size = {};
541 dpavlin 13
542 dpavlin 372 while ($i <= $max) {
543 dpavlin 373 my @v = $self->$code($rec,$format_utf8,$i++,\$rec_size);
544 dpavlin 372 if ($rec_size) {
545     foreach my $f (keys %{ $rec_size }) {
546     $max = $rec_size->{$f} if ($rec_size->{$f} > $max);
547     }
548 dpavlin 373 $log->debug("max set to $max");
549 dpavlin 372 undef $rec_size;
550     }
551 dpavlin 373 if (@v) {
552     push @arr, @v;
553     } else {
554 dpavlin 375 push @arr, '' if ($max > $i);
555 dpavlin 373 }
556 dpavlin 13 }
557    
558     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
559    
560     return @arr;
561     }
562    
563 dpavlin 15
564     =head2 get_data
565    
566     Returns value from record.
567    
568 dpavlin 371 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found,\$rec_size);
569 dpavlin 15
570 dpavlin 368 Required arguments are:
571 dpavlin 15
572 dpavlin 368 =over 8
573 dpavlin 15
574 dpavlin 368 =item C<$rec>
575 dpavlin 15
576 dpavlin 368 record reference
577    
578     =item C<$f>
579    
580     field
581    
582     =item C<$sf>
583    
584     optional subfield
585    
586     =item C<$i>
587    
588 dpavlin 371 index offset for repeatable values ( 0 ... $rec_size->{'400^a'} )
589 dpavlin 368
590     =item C<$found>
591    
592     optional variable that will be incremeted if preset
593    
594 dpavlin 371 =item C<$rec_size>
595 dpavlin 368
596 dpavlin 371 hash to hold maximum occurances of C<field^subfield> combinations
597 dpavlin 368 (which can be accessed using keys in same format)
598    
599     =back
600    
601 dpavlin 371 Returns value or empty string, updates C<$found> and C<rec_size>
602 dpavlin 368 if present.
603    
604 dpavlin 15 =cut
605    
606     sub get_data {
607     my $self = shift;
608    
609 dpavlin 368 my ($rec,$f,$sf,$i,$found,$cache) = @_;
610 dpavlin 15
611 dpavlin 368 return '' unless ($$rec->{$f} && ref($$rec->{$f}) eq 'ARRAY');
612    
613     if (defined($$cache)) {
614 dpavlin 371 $$cache->{ $f . ( $sf ? '^' . $sf : '' ) } ||= scalar @{ $$rec->{$f} };
615 dpavlin 368 }
616    
617     return '' unless ($$rec->{$f}->[$i]);
618    
619     {
620 dpavlin 15 no strict 'refs';
621 dpavlin 368 if (defined($sf)) {
622     $$found++ if (defined($$found) && $$rec->{$f}->[$i]->{$sf});
623 dpavlin 15 return $$rec->{$f}->[$i]->{$sf};
624 dpavlin 368 } else {
625 dpavlin 15 $$found++ if (defined($$found));
626 dpavlin 368 # it still might have subfields, just
627     # not specified, so we'll dump some debug info
628 dpavlin 15 if ($$rec->{$f}->[$i] =~ /HASH/o) {
629     my $out;
630     foreach my $k (keys %{$$rec->{$f}->[$i]}) {
631 dpavlin 368 $out .= '$' . $k .':' . $$rec->{$f}->[$i]->{$k}." ";
632 dpavlin 15 }
633     return $out;
634     } else {
635     return $$rec->{$f}->[$i];
636     }
637     }
638     }
639     }
640    
641    
642     =head2 apply_format
643    
644     Apply format specified in tag with C<format_name="name"> and
645     C<format_delimiter=";;">.
646    
647     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
648    
649     Formats can contain C<lookup{...}> if you need them.
650    
651     =cut
652    
653     sub apply_format {
654     my $self = shift;
655    
656     my ($name,$delimiter,$data) = @_;
657    
658     my $log = $self->_get_logger();
659    
660     if (! $self->{'import_xml'}->{'format'}->{$name}) {
661     $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
662     return $data;
663     }
664    
665     $log->warn("no delimiter for format $name") if (! $delimiter);
666    
667     my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
668    
669     my @data = split(/\Q$delimiter\E/, $data);
670    
671     my $out = sprintf($format, @data);
672     $log->debug("using format $name [$format] on $data to produce: $out");
673    
674     if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
675 dpavlin 31 return $self->{'lookup'}->lookup($out);
676 dpavlin 15 } else {
677     return $out;
678     }
679    
680     }
681    
682 dpavlin 13 =head2 sort_arr
683    
684     Sort array ignoring case and html in data
685    
686     my @sorted = $webpac->sort_arr(@unsorted);
687    
688     =cut
689    
690     sub sort_arr {
691     my $self = shift;
692    
693     my $log = $self->_get_logger();
694    
695     # FIXME add Schwartzian Transformation?
696    
697     my @sorted = sort {
698     $a =~ s#<[^>]+/*>##;
699     $b =~ s#<[^>]+/*>##;
700     lc($b) cmp lc($a)
701     } @_;
702     $log->debug("sorted values: ",sub { join(", ",@sorted) });
703    
704     return @sorted;
705     }
706    
707    
708 dpavlin 15 =head1 INTERNAL METHODS
709    
710 dpavlin 13 =head2 _sort_by_order
711    
712     Sort xml tags data structure accoding to C<order=""> attribute.
713    
714     =cut
715    
716     sub _sort_by_order {
717     my $self = shift;
718    
719     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
720     $self->{'import_xml'}->{'indexer'}->{$a};
721     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
722     $self->{'import_xml'}->{'indexer'}->{$b};
723    
724     return $va <=> $vb;
725     }
726    
727     =head2 _x
728    
729 dpavlin 15 Convert strings from C<conf/normalize/*.xml> encoding into application
730     specific encoding (optinally specified using C<code_page> to C<new>
731     constructor).
732 dpavlin 13
733     my $text = $n->_x('normalize text string');
734    
735     This is a stub so that other modules doesn't have to implement it.
736    
737     =cut
738    
739     sub _x {
740     my $self = shift;
741     return shift;
742     }
743    
744    
745 dpavlin 10 =head1 AUTHOR
746    
747     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
748    
749     =head1 COPYRIGHT & LICENSE
750    
751     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
752    
753     This program is free software; you can redistribute it and/or modify it
754     under the same terms as Perl itself.
755    
756     =cut
757    
758 dpavlin 209 1; # End of WebPAC::Normalize

  ViewVC Help
Powered by ViewVC 1.1.26