/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 372 - (hide annotations)
Sun Jan 8 21:50:34 2006 UTC (18 years, 3 months ago) by dpavlin
File size: 17619 byte(s)
 r412@llin:  dpavlin | 2006-01-08 22:50:49 +0100
 more refactoring: joined paste_to_arr and fill_in_to_arr to _rec_to_arr

1 dpavlin 10 package WebPAC::Normalize;
2    
3     use warnings;
4     use strict;
5 dpavlin 368 use blib;
6     use WebPAC::Common;
7 dpavlin 29 use base 'WebPAC::Common';
8 dpavlin 13 use Data::Dumper;
9 dpavlin 10
10     =head1 NAME
11    
12 dpavlin 15 WebPAC::Normalize - data mungling for normalisation
13 dpavlin 10
14     =head1 VERSION
15    
16 dpavlin 317 Version 0.08
17 dpavlin 10
18     =cut
19    
20 dpavlin 317 our $VERSION = '0.08';
21 dpavlin 10
22     =head1 SYNOPSIS
23    
24 dpavlin 15 This package contains code that mungle data to produce normalized format.
25 dpavlin 10
26 dpavlin 15 It contains several assumptions:
27    
28     =over
29    
30     =item *
31    
32     format of fields is defined using C<v123^a> notation for repeatable fields
33     or C<s123^a> for single (or first) value, where C<123> is field number and
34     C<a> is subfield.
35    
36     =item *
37    
38     source data records (C<$rec>) have unique identifiers in field C<000>
39    
40     =item *
41    
42     optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
43     perl code that is evaluated before producing output (value of field will be
44     interpolated before that)
45    
46     =item *
47    
48     optional C<filter{filter_name}> at B<begining of format> will apply perl
49     code defined as code ref on format after field substitution to producing
50     output
51    
52 dpavlin 260 There is one built-in filter called C<regex> which can be use like this:
53    
54     filter{regex(s/foo/bar/)}
55    
56 dpavlin 15 =item *
57    
58     optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
59    
60     =item *
61    
62     at end, optional C<format>s rules are resolved. Format rules are similar to
63     C<sprintf> and can also contain C<lookup{...}> which is performed after
64     values are inserted in format.
65    
66     =back
67    
68     This also describes order in which transformations are applied (eval,
69     filter, lookup, format) which is important to undestand when deciding how to
70     solve your data mungling and normalisation process.
71    
72    
73    
74    
75 dpavlin 10 =head1 FUNCTIONS
76    
77 dpavlin 13 =head2 new
78 dpavlin 10
79 dpavlin 13 Create new normalisation object
80    
81     my $n = new WebPAC::Normalize::Something(
82 dpavlin 15 filter => {
83     'filter_name_1' => sub {
84     # filter code
85     return length($_);
86     }, ...
87     },
88 dpavlin 29 db => $db_obj,
89 dpavlin 13 lookup_regex => $lookup->regex,
90 dpavlin 31 lookup => $lookup_obj,
91 dpavlin 219 prefix => 'foobar',
92 dpavlin 13 );
93    
94 dpavlin 15 Parametar C<filter> defines user supplied snippets of perl code which can
95     be use with C<filter{...}> notation.
96    
97 dpavlin 219 C<prefix> is used to form filename for database record (to support multiple
98     source files which are joined in one database).
99    
100 dpavlin 13 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
101 dpavlin 31 in structures. If you pass this parametar, you must also pass C<lookup>
102     which is C<WebPAC::Lookup> object.
103 dpavlin 13
104 dpavlin 10 =cut
105    
106 dpavlin 13 sub new {
107     my $class = shift;
108     my $self = {@_};
109     bless($self, $class);
110    
111 dpavlin 31 my $r = $self->{'lookup_regex'} ? 1 : 0;
112     my $l = $self->{'lookup'} ? 1 : 0;
113    
114     my $log = $self->_get_logger();
115    
116     # those two must be in pair
117     if ( ($r & $l) != ($r || $l) ) {
118     my $log = $self->_get_logger();
119     $log->logdie("lookup_regex and lookup must be in pair");
120     }
121    
122     $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
123    
124 dpavlin 219 $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
125    
126 dpavlin 252 $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
127    
128 dpavlin 295 if (! $self->{filter} || ! $self->{filter}->{regex}) {
129 dpavlin 260 $log->debug("adding built-in filter regex");
130     $self->{filter}->{regex} = sub {
131     my ($val, $regex) = @_;
132     eval "\$val =~ $regex";
133     return $val;
134     };
135     }
136    
137 dpavlin 13 $self ? return $self : return undef;
138 dpavlin 10 }
139    
140 dpavlin 13
141     =head2 data_structure
142    
143     Create in-memory data structure which represents normalized layout from
144     C<conf/normalize/*.xml>.
145    
146     This structures are used to produce output.
147    
148 dpavlin 70 my $ds = $webpac->data_structure($rec);
149 dpavlin 13
150     =cut
151    
152     sub data_structure {
153     my $self = shift;
154    
155     my $log = $self->_get_logger();
156    
157     my $rec = shift;
158     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
159    
160 dpavlin 125 $log->debug("data_structure rec = ", sub { Dumper($rec) });
161    
162 dpavlin 312 $log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));
163 dpavlin 125
164 dpavlin 219 my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
165 dpavlin 125
166 dpavlin 13 my $cache_file;
167    
168 dpavlin 18 if ($self->{'db'}) {
169 dpavlin 219 my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
170 dpavlin 70 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
171     return $ds if ($ds);
172 dpavlin 29 $log->debug("cache miss, creating");
173 dpavlin 13 }
174    
175     my @sorted_tags;
176     if ($self->{tags_by_order}) {
177     @sorted_tags = @{$self->{tags_by_order}};
178     } else {
179     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
180     $self->{tags_by_order} = \@sorted_tags;
181     }
182    
183 dpavlin 70 my $ds;
184 dpavlin 13
185     $log->debug("tags: ",sub { join(", ",@sorted_tags) });
186    
187     foreach my $field (@sorted_tags) {
188    
189     my $row;
190    
191     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
192    
193     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
194 dpavlin 38 my $format;
195 dpavlin 13
196 dpavlin 38 $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
197     $format = $tag->{'value'} || $tag->{'content'};
198    
199 dpavlin 13 my @v;
200     if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
201 dpavlin 372 @v = $self->_rec_to_arr($rec,$format,'fill_in');
202 dpavlin 13 } else {
203 dpavlin 372 @v = $self->_rec_to_arr($rec,$format,'parse');
204 dpavlin 13 }
205 dpavlin 364 if (! @v) {
206     $log->debug("$field <",$self->{tag},"> format: $format no values");
207     # next;
208     } else {
209     $log->debug("$field <",$self->{tag},"> format: $format values: ", join(",", @v));
210     }
211 dpavlin 13
212     if ($tag->{'sort'}) {
213     @v = $self->sort_arr(@v);
214     }
215    
216     # use format?
217     if ($tag->{'format_name'}) {
218     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
219     }
220    
221     # delimiter will join repeatable fields
222     if ($tag->{'delimiter'}) {
223     @v = ( join($tag->{'delimiter'}, @v) );
224     }
225    
226     # default types
227 dpavlin 74 my @types = qw(display search);
228 dpavlin 13 # override by type attribute
229     @types = ( $tag->{'type'} ) if ($tag->{'type'});
230    
231     foreach my $type (@types) {
232     # append to previous line?
233 dpavlin 364 $log->debug("tag $field / $type [",sub { join(",",@v) }, "] ", $row->{'append'} || 'no append');
234 dpavlin 13 if ($tag->{'append'}) {
235    
236     # I will delimit appended part with
237     # delimiter (or ,)
238     my $d = $tag->{'delimiter'};
239     # default delimiter
240     $d ||= " ";
241    
242     my $last = pop @{$row->{$type}};
243     $d = "" if (! $last);
244     $last .= $d . join($d, @v);
245     push @{$row->{$type}}, $last;
246    
247     } else {
248     push @{$row->{$type}}, @v;
249     }
250     }
251    
252    
253     }
254    
255     if ($row) {
256     $row->{'tag'} = $field;
257    
258     # TODO: name_sigular, name_plural
259     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
260 dpavlin 70 my $row_name = $name ? $self->_x($name) : $field;
261 dpavlin 13
262     # post-sort all values in field
263     if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
264     $log->warn("sort at field tag not implemented");
265     }
266    
267 dpavlin 70 $ds->{$row_name} = $row;
268 dpavlin 13
269     $log->debug("row $field: ",sub { Dumper($row) });
270     }
271    
272     }
273    
274 dpavlin 22 $self->{'db'}->save_ds(
275 dpavlin 219 id => $id,
276 dpavlin 70 ds => $ds,
277 dpavlin 219 prefix => $self->{prefix},
278 dpavlin 18 ) if ($self->{'db'});
279 dpavlin 13
280 dpavlin 70 $log->debug("ds: ", sub { Dumper($ds) });
281 dpavlin 29
282 dpavlin 70 $log->logconfess("data structure returned is not array any more!") if wantarray;
283 dpavlin 13
284 dpavlin 70 return $ds;
285    
286 dpavlin 13 }
287    
288     =head2 parse
289    
290     Perform smart parsing of string, skipping delimiters for fields which aren't
291     defined. It can also eval code in format starting with C<eval{...}> and
292     return output or nothing depending on eval code.
293    
294     my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
295    
296 dpavlin 260 Filters are implemented here. While simple form of filters looks like this:
297    
298     filter{name_of_filter}
299    
300     but, filters can also have variable number of parametars like this:
301    
302     filter{name_of_filter(param,param,param)}
303    
304 dpavlin 13 =cut
305    
306 dpavlin 261 my $warn_once;
307    
308 dpavlin 13 sub parse {
309     my $self = shift;
310    
311 dpavlin 371 my ($rec, $format_utf8, $i, $rec_size) = @_;
312 dpavlin 13
313     return if (! $format_utf8);
314    
315     my $log = $self->_get_logger();
316    
317     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
318    
319     $i = 0 if (! $i);
320    
321     my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
322    
323     my @out;
324    
325 dpavlin 340 $log->debug("format: $format [$i]");
326 dpavlin 13
327     my $eval_code;
328     # remove eval{...} from beginning
329     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
330    
331     my $filter_name;
332     # remove filter{...} from beginning
333     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
334    
335 dpavlin 317 # did we found any (att all) field from format in row?
336 dpavlin 340 my $found_any;
337 dpavlin 317 # prefix before first field which we preserve it $found_any
338 dpavlin 13 my $prefix;
339    
340 dpavlin 317 my $f_step = 1;
341    
342 dpavlin 13 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
343    
344     my $del = $1 || '';
345 dpavlin 317 $prefix = $del if ($f_step == 1);
346 dpavlin 13
347 dpavlin 340 my $fld_type = lc($2);
348    
349 dpavlin 13 # repeatable index
350     my $r = $i;
351 dpavlin 340 if ($fld_type eq 's') {
352     if ($found_any->{'v'}) {
353     $r = 0;
354     } else {
355     return;
356     }
357     }
358 dpavlin 13
359     my $found = 0;
360 dpavlin 371 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found,$rec_size);
361 dpavlin 13
362     if ($found) {
363 dpavlin 340 $found_any->{$fld_type} += $found;
364 dpavlin 317
365     # we will skip delimiter before first occurence of field!
366 dpavlin 344 push @out, $del unless($found_any->{$fld_type} == 1);
367 dpavlin 13 push @out, $tmp;
368     }
369 dpavlin 317 $f_step++;
370 dpavlin 13 }
371    
372 dpavlin 340 # test if any fields found?
373     return if (! $found_any->{'v'} && ! $found_any->{'s'});
374 dpavlin 13
375     my $out = join('',@out);
376    
377     if ($out) {
378     # add rest of format (suffix)
379     $out .= $format;
380    
381     # add prefix if not there
382     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
383    
384     $log->debug("result: $out");
385     }
386    
387     if ($eval_code) {
388     my $eval = $self->fill_in($rec,$eval_code,$i) || return;
389     $log->debug("about to eval{$eval} format: $out");
390     return if (! $self->_eval($eval));
391     }
392    
393 dpavlin 260 if ($filter_name) {
394     my @filter_args;
395     if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
396     @filter_args = split(/,/, $2);
397     }
398     if ($self->{'filter'}->{$filter_name}) {
399     $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
400     unshift @filter_args, $out;
401     $out = $self->{'filter'}->{$filter_name}->(@filter_args);
402     return unless(defined($out));
403     $log->debug("filter result: $out");
404 dpavlin 261 } elsif (! $warn_once->{$filter_name}) {
405 dpavlin 260 $log->warn("trying to use undefined filter $filter_name");
406 dpavlin 261 $warn_once->{$filter_name}++;
407 dpavlin 260 }
408 dpavlin 13 }
409    
410     return $out;
411     }
412    
413 dpavlin 15 =head2 fill_in
414    
415     Workhourse of all: takes record from in-memory structure of database and
416     strings with placeholders and returns string or array of with substituted
417     values from record.
418    
419     my $text = $webpac->fill_in($rec,'v250^a');
420    
421     Optional argument is ordinal number for repeatable fields. By default,
422     it's assume to be first repeatable field (fields are perl array, so first
423     element is 0).
424     Following example will read second value from repeatable field.
425    
426     my $text = $webpac->fill_in($rec,'Title: v250^a',1);
427    
428     This function B<does not> perform parsing of format to inteligenty skip
429     delimiters before fields which aren't used.
430    
431     This method will automatically decode UTF-8 string to local code page
432     if needed.
433    
434 dpavlin 371 There is optional parametar C<$record_size> which can be used to get sizes of
435     all C<field^subfield> combinations in this format.
436    
437     my $text = $webpac->fill_in($rec,'got: v900^a v900^x',0,\$rec_size);
438    
439 dpavlin 15 =cut
440    
441     sub fill_in {
442     my $self = shift;
443    
444     my $log = $self->_get_logger();
445    
446 dpavlin 371 my ($rec,$format,$i,$rec_size) = @_;
447    
448     $log->logconfess("need data record") unless ($rec);
449     $log->logconfess("need format to parse") unless($format);
450    
451 dpavlin 15 # iteration (for repeatable fields)
452 dpavlin 371 $i ||= 0;
453 dpavlin 15
454     $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
455    
456     # FIXME remove for speedup?
457     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
458    
459     if (utf8::is_utf8($format)) {
460     $format = $self->_x($format);
461     }
462    
463     my $found = 0;
464 dpavlin 333 my $just_single = 1;
465 dpavlin 15
466     my $eval_code;
467     # remove eval{...} from beginning
468     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
469    
470     my $filter_name;
471     # remove filter{...} from beginning
472     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
473    
474     # do actual replacement of placeholders
475     # repeatable fields
476 dpavlin 371 if ($format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found,$rec_size)/ges) {
477 dpavlin 333 $just_single = 0;
478     }
479 dpavlin 364
480 dpavlin 15 # non-repeatable fields
481 dpavlin 371 if ($format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found,$rec_size)/ges) {
482 dpavlin 333 return if ($i > 0 && $just_single);
483     }
484 dpavlin 15
485     if ($found) {
486     $log->debug("format: $format");
487     if ($eval_code) {
488     my $eval = $self->fill_in($rec,$eval_code,$i);
489     return if (! $self->_eval($eval));
490     }
491     if ($filter_name && $self->{'filter'}->{$filter_name}) {
492     $log->debug("filter '$filter_name' for $format");
493     $format = $self->{'filter'}->{$filter_name}->($format);
494     return unless(defined($format));
495     $log->debug("filter result: $format");
496     }
497     # do we have lookups?
498     if ($self->{'lookup'}) {
499 dpavlin 31 if ($self->{'lookup'}->can('lookup')) {
500 dpavlin 252 my @lookup = $self->{lookup}->lookup($format);
501 dpavlin 253 $log->debug("lookup $format", join(", ", @lookup));
502 dpavlin 252 return @lookup;
503 dpavlin 31 } else {
504     $log->warn("Have lookup object but can't invoke lookup method");
505     }
506 dpavlin 15 } else {
507     return $format;
508     }
509     } else {
510     return;
511     }
512     }
513    
514    
515 dpavlin 372 =head2 _rec_to_arr
516 dpavlin 13
517 dpavlin 372 Similar to C<parse> and C<fill_in>, but returns array of all repeatable fields. Usable
518 dpavlin 13 for fields which have lookups, so they shouldn't be parsed but rather
519 dpavlin 372 C<paste>d or C<fill_id>ed. Last argument is name of operation: C<paste> or C<fill_in>.
520 dpavlin 13
521 dpavlin 372 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]','paste');
522 dpavlin 13
523     =cut
524    
525 dpavlin 372 sub _rec_to_arr {
526 dpavlin 13 my $self = shift;
527    
528 dpavlin 372 my ($rec, $format_utf8, $code) = @_;
529 dpavlin 13
530     my $log = $self->_get_logger();
531    
532     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
533     return if (! $format_utf8);
534    
535     my $i = 0;
536 dpavlin 372 my $max = 0;
537 dpavlin 13 my @arr;
538 dpavlin 372 my $rec_size = {};
539 dpavlin 13
540 dpavlin 372 while ($i <= $max) {
541     my $v = $self->$code($rec,$format_utf8,$i++,\$rec_size) || next;
542 dpavlin 364 push @arr, $v;
543 dpavlin 372 if ($rec_size) {
544     foreach my $f (keys %{ $rec_size }) {
545     $max = $rec_size->{$f} if ($rec_size->{$f} > $max);
546     }
547     warn "max set to $max, rec_size = ", Dumper($rec_size);
548     undef $rec_size;
549     }
550 dpavlin 13 }
551    
552     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
553    
554     return @arr;
555     }
556    
557 dpavlin 15
558     =head2 get_data
559    
560     Returns value from record.
561    
562 dpavlin 371 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found,\$rec_size);
563 dpavlin 15
564 dpavlin 368 Required arguments are:
565 dpavlin 15
566 dpavlin 368 =over 8
567 dpavlin 15
568 dpavlin 368 =item C<$rec>
569 dpavlin 15
570 dpavlin 368 record reference
571    
572     =item C<$f>
573    
574     field
575    
576     =item C<$sf>
577    
578     optional subfield
579    
580     =item C<$i>
581    
582 dpavlin 371 index offset for repeatable values ( 0 ... $rec_size->{'400^a'} )
583 dpavlin 368
584     =item C<$found>
585    
586     optional variable that will be incremeted if preset
587    
588 dpavlin 371 =item C<$rec_size>
589 dpavlin 368
590 dpavlin 371 hash to hold maximum occurances of C<field^subfield> combinations
591 dpavlin 368 (which can be accessed using keys in same format)
592    
593     =back
594    
595 dpavlin 371 Returns value or empty string, updates C<$found> and C<rec_size>
596 dpavlin 368 if present.
597    
598 dpavlin 15 =cut
599    
600     sub get_data {
601     my $self = shift;
602    
603 dpavlin 368 my ($rec,$f,$sf,$i,$found,$cache) = @_;
604 dpavlin 15
605 dpavlin 368 return '' unless ($$rec->{$f} && ref($$rec->{$f}) eq 'ARRAY');
606    
607     if (defined($$cache)) {
608 dpavlin 371 $$cache->{ $f . ( $sf ? '^' . $sf : '' ) } ||= scalar @{ $$rec->{$f} };
609 dpavlin 368 }
610    
611     return '' unless ($$rec->{$f}->[$i]);
612    
613     {
614 dpavlin 15 no strict 'refs';
615 dpavlin 368 if (defined($sf)) {
616     $$found++ if (defined($$found) && $$rec->{$f}->[$i]->{$sf});
617 dpavlin 15 return $$rec->{$f}->[$i]->{$sf};
618 dpavlin 368 } else {
619 dpavlin 15 $$found++ if (defined($$found));
620 dpavlin 368 # it still might have subfields, just
621     # not specified, so we'll dump some debug info
622 dpavlin 15 if ($$rec->{$f}->[$i] =~ /HASH/o) {
623     my $out;
624     foreach my $k (keys %{$$rec->{$f}->[$i]}) {
625 dpavlin 368 $out .= '$' . $k .':' . $$rec->{$f}->[$i]->{$k}." ";
626 dpavlin 15 }
627     return $out;
628     } else {
629     return $$rec->{$f}->[$i];
630     }
631     }
632     }
633     }
634    
635    
636     =head2 apply_format
637    
638     Apply format specified in tag with C<format_name="name"> and
639     C<format_delimiter=";;">.
640    
641     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
642    
643     Formats can contain C<lookup{...}> if you need them.
644    
645     =cut
646    
647     sub apply_format {
648     my $self = shift;
649    
650     my ($name,$delimiter,$data) = @_;
651    
652     my $log = $self->_get_logger();
653    
654     if (! $self->{'import_xml'}->{'format'}->{$name}) {
655     $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
656     return $data;
657     }
658    
659     $log->warn("no delimiter for format $name") if (! $delimiter);
660    
661     my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
662    
663     my @data = split(/\Q$delimiter\E/, $data);
664    
665     my $out = sprintf($format, @data);
666     $log->debug("using format $name [$format] on $data to produce: $out");
667    
668     if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
669 dpavlin 31 return $self->{'lookup'}->lookup($out);
670 dpavlin 15 } else {
671     return $out;
672     }
673    
674     }
675    
676 dpavlin 13 =head2 sort_arr
677    
678     Sort array ignoring case and html in data
679    
680     my @sorted = $webpac->sort_arr(@unsorted);
681    
682     =cut
683    
684     sub sort_arr {
685     my $self = shift;
686    
687     my $log = $self->_get_logger();
688    
689     # FIXME add Schwartzian Transformation?
690    
691     my @sorted = sort {
692     $a =~ s#<[^>]+/*>##;
693     $b =~ s#<[^>]+/*>##;
694     lc($b) cmp lc($a)
695     } @_;
696     $log->debug("sorted values: ",sub { join(", ",@sorted) });
697    
698     return @sorted;
699     }
700    
701    
702 dpavlin 15 =head1 INTERNAL METHODS
703    
704 dpavlin 13 =head2 _sort_by_order
705    
706     Sort xml tags data structure accoding to C<order=""> attribute.
707    
708     =cut
709    
710     sub _sort_by_order {
711     my $self = shift;
712    
713     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
714     $self->{'import_xml'}->{'indexer'}->{$a};
715     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
716     $self->{'import_xml'}->{'indexer'}->{$b};
717    
718     return $va <=> $vb;
719     }
720    
721     =head2 _x
722    
723 dpavlin 15 Convert strings from C<conf/normalize/*.xml> encoding into application
724     specific encoding (optinally specified using C<code_page> to C<new>
725     constructor).
726 dpavlin 13
727     my $text = $n->_x('normalize text string');
728    
729     This is a stub so that other modules doesn't have to implement it.
730    
731     =cut
732    
733     sub _x {
734     my $self = shift;
735     return shift;
736     }
737    
738    
739 dpavlin 10 =head1 AUTHOR
740    
741     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
742    
743     =head1 COPYRIGHT & LICENSE
744    
745     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
746    
747     This program is free software; you can redistribute it and/or modify it
748     under the same terms as Perl itself.
749    
750     =cut
751    
752 dpavlin 209 1; # End of WebPAC::Normalize

  ViewVC Help
Powered by ViewVC 1.1.26