/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 397 - (hide annotations)
Wed Feb 15 15:54:12 2006 UTC (18 years, 2 months ago) by dpavlin
File size: 17757 byte(s)
 r458@llin:  dpavlin | 2006-02-15 17:01:53 +0100
 fix warnings

1 dpavlin 10 package WebPAC::Normalize;
2    
3     use warnings;
4     use strict;
5 dpavlin 368 use blib;
6     use WebPAC::Common;
7 dpavlin 29 use base 'WebPAC::Common';
8 dpavlin 13 use Data::Dumper;
9 dpavlin 10
10     =head1 NAME
11    
12 dpavlin 15 WebPAC::Normalize - data mungling for normalisation
13 dpavlin 10
14     =head1 VERSION
15    
16 dpavlin 317 Version 0.08
17 dpavlin 10
18     =cut
19    
20 dpavlin 317 our $VERSION = '0.08';
21 dpavlin 10
22     =head1 SYNOPSIS
23    
24 dpavlin 15 This package contains code that mungle data to produce normalized format.
25 dpavlin 10
26 dpavlin 15 It contains several assumptions:
27    
28     =over
29    
30     =item *
31    
32     format of fields is defined using C<v123^a> notation for repeatable fields
33     or C<s123^a> for single (or first) value, where C<123> is field number and
34     C<a> is subfield.
35    
36     =item *
37    
38     source data records (C<$rec>) have unique identifiers in field C<000>
39    
40     =item *
41    
42     optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
43     perl code that is evaluated before producing output (value of field will be
44     interpolated before that)
45    
46     =item *
47    
48     optional C<filter{filter_name}> at B<begining of format> will apply perl
49     code defined as code ref on format after field substitution to producing
50     output
51    
52 dpavlin 260 There is one built-in filter called C<regex> which can be use like this:
53    
54     filter{regex(s/foo/bar/)}
55    
56 dpavlin 15 =item *
57    
58     optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
59    
60     =item *
61    
62     at end, optional C<format>s rules are resolved. Format rules are similar to
63     C<sprintf> and can also contain C<lookup{...}> which is performed after
64     values are inserted in format.
65    
66     =back
67    
68     This also describes order in which transformations are applied (eval,
69     filter, lookup, format) which is important to undestand when deciding how to
70     solve your data mungling and normalisation process.
71    
72    
73    
74    
75 dpavlin 10 =head1 FUNCTIONS
76    
77 dpavlin 13 =head2 new
78 dpavlin 10
79 dpavlin 13 Create new normalisation object
80    
81     my $n = new WebPAC::Normalize::Something(
82 dpavlin 15 filter => {
83     'filter_name_1' => sub {
84     # filter code
85     return length($_);
86     }, ...
87     },
88 dpavlin 29 db => $db_obj,
89 dpavlin 13 lookup_regex => $lookup->regex,
90 dpavlin 31 lookup => $lookup_obj,
91 dpavlin 219 prefix => 'foobar',
92 dpavlin 13 );
93    
94 dpavlin 15 Parametar C<filter> defines user supplied snippets of perl code which can
95     be use with C<filter{...}> notation.
96    
97 dpavlin 219 C<prefix> is used to form filename for database record (to support multiple
98     source files which are joined in one database).
99    
100 dpavlin 13 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
101 dpavlin 31 in structures. If you pass this parametar, you must also pass C<lookup>
102     which is C<WebPAC::Lookup> object.
103 dpavlin 13
104 dpavlin 10 =cut
105    
106 dpavlin 13 sub new {
107     my $class = shift;
108     my $self = {@_};
109     bless($self, $class);
110    
111 dpavlin 31 my $r = $self->{'lookup_regex'} ? 1 : 0;
112     my $l = $self->{'lookup'} ? 1 : 0;
113    
114     my $log = $self->_get_logger();
115    
116     # those two must be in pair
117     if ( ($r & $l) != ($r || $l) ) {
118     my $log = $self->_get_logger();
119     $log->logdie("lookup_regex and lookup must be in pair");
120     }
121    
122     $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
123    
124 dpavlin 219 $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
125    
126 dpavlin 252 $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
127    
128 dpavlin 295 if (! $self->{filter} || ! $self->{filter}->{regex}) {
129 dpavlin 260 $log->debug("adding built-in filter regex");
130     $self->{filter}->{regex} = sub {
131     my ($val, $regex) = @_;
132     eval "\$val =~ $regex";
133     return $val;
134     };
135     }
136    
137 dpavlin 13 $self ? return $self : return undef;
138 dpavlin 10 }
139    
140 dpavlin 13
141     =head2 data_structure
142    
143     Create in-memory data structure which represents normalized layout from
144     C<conf/normalize/*.xml>.
145    
146     This structures are used to produce output.
147    
148 dpavlin 70 my $ds = $webpac->data_structure($rec);
149 dpavlin 13
150     =cut
151    
152     sub data_structure {
153     my $self = shift;
154    
155     my $log = $self->_get_logger();
156    
157     my $rec = shift;
158     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
159    
160 dpavlin 125 $log->debug("data_structure rec = ", sub { Dumper($rec) });
161    
162 dpavlin 312 $log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));
163 dpavlin 125
164 dpavlin 219 my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
165 dpavlin 125
166 dpavlin 13 my $cache_file;
167    
168 dpavlin 18 if ($self->{'db'}) {
169 dpavlin 219 my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
170 dpavlin 70 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
171     return $ds if ($ds);
172 dpavlin 29 $log->debug("cache miss, creating");
173 dpavlin 13 }
174    
175     my @sorted_tags;
176     if ($self->{tags_by_order}) {
177     @sorted_tags = @{$self->{tags_by_order}};
178     } else {
179     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
180     $self->{tags_by_order} = \@sorted_tags;
181     }
182    
183 dpavlin 70 my $ds;
184 dpavlin 13
185     $log->debug("tags: ",sub { join(", ",@sorted_tags) });
186    
187     foreach my $field (@sorted_tags) {
188    
189     my $row;
190    
191     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
192    
193     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
194 dpavlin 38 my $format;
195 dpavlin 13
196 dpavlin 38 $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
197     $format = $tag->{'value'} || $tag->{'content'};
198    
199 dpavlin 13 my @v;
200     if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
201 dpavlin 372 @v = $self->_rec_to_arr($rec,$format,'fill_in');
202 dpavlin 13 } else {
203 dpavlin 372 @v = $self->_rec_to_arr($rec,$format,'parse');
204 dpavlin 13 }
205 dpavlin 364 if (! @v) {
206     $log->debug("$field <",$self->{tag},"> format: $format no values");
207 dpavlin 373 next;
208 dpavlin 364 } else {
209     $log->debug("$field <",$self->{tag},"> format: $format values: ", join(",", @v));
210     }
211 dpavlin 13
212     if ($tag->{'sort'}) {
213     @v = $self->sort_arr(@v);
214     }
215    
216     # use format?
217     if ($tag->{'format_name'}) {
218     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
219     }
220    
221     # delimiter will join repeatable fields
222     if ($tag->{'delimiter'}) {
223     @v = ( join($tag->{'delimiter'}, @v) );
224     }
225    
226     # default types
227 dpavlin 74 my @types = qw(display search);
228 dpavlin 13 # override by type attribute
229     @types = ( $tag->{'type'} ) if ($tag->{'type'});
230    
231     foreach my $type (@types) {
232     # append to previous line?
233 dpavlin 364 $log->debug("tag $field / $type [",sub { join(",",@v) }, "] ", $row->{'append'} || 'no append');
234 dpavlin 13 if ($tag->{'append'}) {
235    
236     # I will delimit appended part with
237     # delimiter (or ,)
238     my $d = $tag->{'delimiter'};
239     # default delimiter
240     $d ||= " ";
241    
242     my $last = pop @{$row->{$type}};
243     $d = "" if (! $last);
244     $last .= $d . join($d, @v);
245     push @{$row->{$type}}, $last;
246    
247     } else {
248     push @{$row->{$type}}, @v;
249     }
250     }
251    
252    
253     }
254    
255     if ($row) {
256     $row->{'tag'} = $field;
257    
258     # TODO: name_sigular, name_plural
259     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
260 dpavlin 70 my $row_name = $name ? $self->_x($name) : $field;
261 dpavlin 13
262     # post-sort all values in field
263     if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
264     $log->warn("sort at field tag not implemented");
265     }
266    
267 dpavlin 70 $ds->{$row_name} = $row;
268 dpavlin 13
269     $log->debug("row $field: ",sub { Dumper($row) });
270     }
271    
272     }
273    
274 dpavlin 22 $self->{'db'}->save_ds(
275 dpavlin 219 id => $id,
276 dpavlin 70 ds => $ds,
277 dpavlin 219 prefix => $self->{prefix},
278 dpavlin 18 ) if ($self->{'db'});
279 dpavlin 13
280 dpavlin 70 $log->debug("ds: ", sub { Dumper($ds) });
281 dpavlin 29
282 dpavlin 70 $log->logconfess("data structure returned is not array any more!") if wantarray;
283 dpavlin 13
284 dpavlin 70 return $ds;
285    
286 dpavlin 13 }
287    
288     =head2 parse
289    
290     Perform smart parsing of string, skipping delimiters for fields which aren't
291     defined. It can also eval code in format starting with C<eval{...}> and
292     return output or nothing depending on eval code.
293    
294     my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
295    
296 dpavlin 260 Filters are implemented here. While simple form of filters looks like this:
297    
298     filter{name_of_filter}
299    
300     but, filters can also have variable number of parametars like this:
301    
302     filter{name_of_filter(param,param,param)}
303    
304 dpavlin 13 =cut
305    
306 dpavlin 261 my $warn_once;
307    
308 dpavlin 13 sub parse {
309     my $self = shift;
310    
311 dpavlin 371 my ($rec, $format_utf8, $i, $rec_size) = @_;
312 dpavlin 13
313     return if (! $format_utf8);
314    
315     my $log = $self->_get_logger();
316    
317     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
318    
319     $i = 0 if (! $i);
320    
321     my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
322    
323     my @out;
324    
325 dpavlin 340 $log->debug("format: $format [$i]");
326 dpavlin 13
327     my $eval_code;
328     # remove eval{...} from beginning
329     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
330    
331     my $filter_name;
332     # remove filter{...} from beginning
333     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
334    
335 dpavlin 317 # did we found any (att all) field from format in row?
336 dpavlin 340 my $found_any;
337 dpavlin 317 # prefix before first field which we preserve it $found_any
338 dpavlin 13 my $prefix;
339    
340 dpavlin 317 my $f_step = 1;
341    
342 dpavlin 13 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
343    
344     my $del = $1 || '';
345 dpavlin 317 $prefix = $del if ($f_step == 1);
346 dpavlin 13
347 dpavlin 340 my $fld_type = lc($2);
348    
349 dpavlin 13 # repeatable index
350     my $r = $i;
351 dpavlin 340 if ($fld_type eq 's') {
352     if ($found_any->{'v'}) {
353     $r = 0;
354     } else {
355     return;
356     }
357     }
358 dpavlin 13
359     my $found = 0;
360 dpavlin 371 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found,$rec_size);
361 dpavlin 13
362     if ($found) {
363 dpavlin 340 $found_any->{$fld_type} += $found;
364 dpavlin 317
365     # we will skip delimiter before first occurence of field!
366 dpavlin 344 push @out, $del unless($found_any->{$fld_type} == 1);
367 dpavlin 13 push @out, $tmp;
368     }
369 dpavlin 317 $f_step++;
370 dpavlin 13 }
371    
372 dpavlin 340 # test if any fields found?
373     return if (! $found_any->{'v'} && ! $found_any->{'s'});
374 dpavlin 13
375     my $out = join('',@out);
376    
377     if ($out) {
378     # add rest of format (suffix)
379     $out .= $format;
380    
381     # add prefix if not there
382     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
383    
384     $log->debug("result: $out");
385     }
386    
387     if ($eval_code) {
388     my $eval = $self->fill_in($rec,$eval_code,$i) || return;
389     $log->debug("about to eval{$eval} format: $out");
390     return if (! $self->_eval($eval));
391     }
392    
393 dpavlin 260 if ($filter_name) {
394     my @filter_args;
395     if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
396     @filter_args = split(/,/, $2);
397     }
398     if ($self->{'filter'}->{$filter_name}) {
399     $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
400     unshift @filter_args, $out;
401     $out = $self->{'filter'}->{$filter_name}->(@filter_args);
402     return unless(defined($out));
403     $log->debug("filter result: $out");
404 dpavlin 261 } elsif (! $warn_once->{$filter_name}) {
405 dpavlin 260 $log->warn("trying to use undefined filter $filter_name");
406 dpavlin 261 $warn_once->{$filter_name}++;
407 dpavlin 260 }
408 dpavlin 13 }
409    
410     return $out;
411     }
412    
413 dpavlin 15 =head2 fill_in
414    
415     Workhourse of all: takes record from in-memory structure of database and
416     strings with placeholders and returns string or array of with substituted
417     values from record.
418    
419     my $text = $webpac->fill_in($rec,'v250^a');
420    
421     Optional argument is ordinal number for repeatable fields. By default,
422     it's assume to be first repeatable field (fields are perl array, so first
423     element is 0).
424     Following example will read second value from repeatable field.
425    
426     my $text = $webpac->fill_in($rec,'Title: v250^a',1);
427    
428     This function B<does not> perform parsing of format to inteligenty skip
429     delimiters before fields which aren't used.
430    
431     This method will automatically decode UTF-8 string to local code page
432     if needed.
433    
434 dpavlin 371 There is optional parametar C<$record_size> which can be used to get sizes of
435     all C<field^subfield> combinations in this format.
436    
437     my $text = $webpac->fill_in($rec,'got: v900^a v900^x',0,\$rec_size);
438    
439 dpavlin 15 =cut
440    
441     sub fill_in {
442     my $self = shift;
443    
444     my $log = $self->_get_logger();
445    
446 dpavlin 371 my ($rec,$format,$i,$rec_size) = @_;
447    
448     $log->logconfess("need data record") unless ($rec);
449     $log->logconfess("need format to parse") unless($format);
450    
451 dpavlin 15 # iteration (for repeatable fields)
452 dpavlin 371 $i ||= 0;
453 dpavlin 15
454     $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
455    
456     # FIXME remove for speedup?
457     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
458    
459     if (utf8::is_utf8($format)) {
460     $format = $self->_x($format);
461     }
462    
463     my $found = 0;
464 dpavlin 333 my $just_single = 1;
465 dpavlin 15
466     my $eval_code;
467     # remove eval{...} from beginning
468     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
469    
470     my $filter_name;
471     # remove filter{...} from beginning
472     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
473    
474 dpavlin 397 {
475     # fix warnings
476     no warnings 'uninitialized';
477 dpavlin 364
478 dpavlin 397 # do actual replacement of placeholders
479     # repeatable fields
480     if ($format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found,$rec_size)/ges) {
481     $just_single = 0;
482     }
483    
484     # non-repeatable fields
485     if ($format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found,$rec_size)/ges) {
486     return if ($i > 0 && $just_single);
487     }
488 dpavlin 333 }
489 dpavlin 15
490     if ($found) {
491     $log->debug("format: $format");
492     if ($eval_code) {
493     my $eval = $self->fill_in($rec,$eval_code,$i);
494     return if (! $self->_eval($eval));
495     }
496     if ($filter_name && $self->{'filter'}->{$filter_name}) {
497     $log->debug("filter '$filter_name' for $format");
498     $format = $self->{'filter'}->{$filter_name}->($format);
499     return unless(defined($format));
500     $log->debug("filter result: $format");
501     }
502     # do we have lookups?
503     if ($self->{'lookup'}) {
504 dpavlin 31 if ($self->{'lookup'}->can('lookup')) {
505 dpavlin 252 my @lookup = $self->{lookup}->lookup($format);
506 dpavlin 253 $log->debug("lookup $format", join(", ", @lookup));
507 dpavlin 252 return @lookup;
508 dpavlin 31 } else {
509     $log->warn("Have lookup object but can't invoke lookup method");
510     }
511 dpavlin 15 } else {
512     return $format;
513     }
514     } else {
515     return;
516     }
517     }
518    
519    
520 dpavlin 372 =head2 _rec_to_arr
521 dpavlin 13
522 dpavlin 372 Similar to C<parse> and C<fill_in>, but returns array of all repeatable fields. Usable
523 dpavlin 13 for fields which have lookups, so they shouldn't be parsed but rather
524 dpavlin 372 C<paste>d or C<fill_id>ed. Last argument is name of operation: C<paste> or C<fill_in>.
525 dpavlin 13
526 dpavlin 372 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]','paste');
527 dpavlin 13
528     =cut
529    
530 dpavlin 372 sub _rec_to_arr {
531 dpavlin 13 my $self = shift;
532    
533 dpavlin 372 my ($rec, $format_utf8, $code) = @_;
534 dpavlin 13
535     my $log = $self->_get_logger();
536    
537     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
538     return if (! $format_utf8);
539    
540 dpavlin 373 $log->debug("using $code on $format_utf8");
541    
542 dpavlin 13 my $i = 0;
543 dpavlin 372 my $max = 0;
544 dpavlin 13 my @arr;
545 dpavlin 372 my $rec_size = {};
546 dpavlin 13
547 dpavlin 372 while ($i <= $max) {
548 dpavlin 373 my @v = $self->$code($rec,$format_utf8,$i++,\$rec_size);
549 dpavlin 372 if ($rec_size) {
550     foreach my $f (keys %{ $rec_size }) {
551     $max = $rec_size->{$f} if ($rec_size->{$f} > $max);
552     }
553 dpavlin 373 $log->debug("max set to $max");
554 dpavlin 372 undef $rec_size;
555     }
556 dpavlin 373 if (@v) {
557     push @arr, @v;
558     } else {
559 dpavlin 375 push @arr, '' if ($max > $i);
560 dpavlin 373 }
561 dpavlin 13 }
562    
563     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
564    
565     return @arr;
566     }
567    
568 dpavlin 15
569     =head2 get_data
570    
571     Returns value from record.
572    
573 dpavlin 371 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found,\$rec_size);
574 dpavlin 15
575 dpavlin 368 Required arguments are:
576 dpavlin 15
577 dpavlin 368 =over 8
578 dpavlin 15
579 dpavlin 368 =item C<$rec>
580 dpavlin 15
581 dpavlin 368 record reference
582    
583     =item C<$f>
584    
585     field
586    
587     =item C<$sf>
588    
589     optional subfield
590    
591     =item C<$i>
592    
593 dpavlin 371 index offset for repeatable values ( 0 ... $rec_size->{'400^a'} )
594 dpavlin 368
595     =item C<$found>
596    
597     optional variable that will be incremeted if preset
598    
599 dpavlin 371 =item C<$rec_size>
600 dpavlin 368
601 dpavlin 371 hash to hold maximum occurances of C<field^subfield> combinations
602 dpavlin 368 (which can be accessed using keys in same format)
603    
604     =back
605    
606 dpavlin 371 Returns value or empty string, updates C<$found> and C<rec_size>
607 dpavlin 368 if present.
608    
609 dpavlin 15 =cut
610    
611     sub get_data {
612     my $self = shift;
613    
614 dpavlin 368 my ($rec,$f,$sf,$i,$found,$cache) = @_;
615 dpavlin 15
616 dpavlin 368 return '' unless ($$rec->{$f} && ref($$rec->{$f}) eq 'ARRAY');
617    
618     if (defined($$cache)) {
619 dpavlin 371 $$cache->{ $f . ( $sf ? '^' . $sf : '' ) } ||= scalar @{ $$rec->{$f} };
620 dpavlin 368 }
621    
622     return '' unless ($$rec->{$f}->[$i]);
623    
624     {
625 dpavlin 15 no strict 'refs';
626 dpavlin 368 if (defined($sf)) {
627     $$found++ if (defined($$found) && $$rec->{$f}->[$i]->{$sf});
628 dpavlin 15 return $$rec->{$f}->[$i]->{$sf};
629 dpavlin 368 } else {
630 dpavlin 15 $$found++ if (defined($$found));
631 dpavlin 368 # it still might have subfields, just
632     # not specified, so we'll dump some debug info
633 dpavlin 15 if ($$rec->{$f}->[$i] =~ /HASH/o) {
634     my $out;
635     foreach my $k (keys %{$$rec->{$f}->[$i]}) {
636 dpavlin 368 $out .= '$' . $k .':' . $$rec->{$f}->[$i]->{$k}." ";
637 dpavlin 15 }
638     return $out;
639     } else {
640     return $$rec->{$f}->[$i];
641     }
642     }
643     }
644     }
645    
646    
647     =head2 apply_format
648    
649     Apply format specified in tag with C<format_name="name"> and
650     C<format_delimiter=";;">.
651    
652     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
653    
654     Formats can contain C<lookup{...}> if you need them.
655    
656     =cut
657    
658     sub apply_format {
659     my $self = shift;
660    
661     my ($name,$delimiter,$data) = @_;
662    
663     my $log = $self->_get_logger();
664    
665     if (! $self->{'import_xml'}->{'format'}->{$name}) {
666     $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
667     return $data;
668     }
669    
670     $log->warn("no delimiter for format $name") if (! $delimiter);
671    
672     my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
673    
674     my @data = split(/\Q$delimiter\E/, $data);
675    
676     my $out = sprintf($format, @data);
677     $log->debug("using format $name [$format] on $data to produce: $out");
678    
679     if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
680 dpavlin 31 return $self->{'lookup'}->lookup($out);
681 dpavlin 15 } else {
682     return $out;
683     }
684    
685     }
686    
687 dpavlin 13 =head2 sort_arr
688    
689     Sort array ignoring case and html in data
690    
691     my @sorted = $webpac->sort_arr(@unsorted);
692    
693     =cut
694    
695     sub sort_arr {
696     my $self = shift;
697    
698     my $log = $self->_get_logger();
699    
700     # FIXME add Schwartzian Transformation?
701    
702     my @sorted = sort {
703     $a =~ s#<[^>]+/*>##;
704     $b =~ s#<[^>]+/*>##;
705     lc($b) cmp lc($a)
706     } @_;
707     $log->debug("sorted values: ",sub { join(", ",@sorted) });
708    
709     return @sorted;
710     }
711    
712    
713 dpavlin 15 =head1 INTERNAL METHODS
714    
715 dpavlin 13 =head2 _sort_by_order
716    
717     Sort xml tags data structure accoding to C<order=""> attribute.
718    
719     =cut
720    
721     sub _sort_by_order {
722     my $self = shift;
723    
724     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
725     $self->{'import_xml'}->{'indexer'}->{$a};
726     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
727     $self->{'import_xml'}->{'indexer'}->{$b};
728    
729     return $va <=> $vb;
730     }
731    
732     =head2 _x
733    
734 dpavlin 15 Convert strings from C<conf/normalize/*.xml> encoding into application
735     specific encoding (optinally specified using C<code_page> to C<new>
736     constructor).
737 dpavlin 13
738     my $text = $n->_x('normalize text string');
739    
740     This is a stub so that other modules doesn't have to implement it.
741    
742     =cut
743    
744     sub _x {
745     my $self = shift;
746     return shift;
747     }
748    
749    
750 dpavlin 10 =head1 AUTHOR
751    
752     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
753    
754     =head1 COPYRIGHT & LICENSE
755    
756     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
757    
758     This program is free software; you can redistribute it and/or modify it
759     under the same terms as Perl itself.
760    
761     =cut
762    
763 dpavlin 209 1; # End of WebPAC::Normalize

  ViewVC Help
Powered by ViewVC 1.1.26