/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 371 - (hide annotations)
Sun Jan 8 21:16:27 2006 UTC (18 years, 2 months ago) by dpavlin
File size: 17970 byte(s)
 r409@llin:  dpavlin | 2006-01-08 22:16:39 +0100
 collect record sizes

1 dpavlin 10 package WebPAC::Normalize;
2    
3     use warnings;
4     use strict;
5 dpavlin 368 use blib;
6     use WebPAC::Common;
7 dpavlin 29 use base 'WebPAC::Common';
8 dpavlin 13 use Data::Dumper;
9 dpavlin 10
10     =head1 NAME
11    
12 dpavlin 15 WebPAC::Normalize - data mungling for normalisation
13 dpavlin 10
14     =head1 VERSION
15    
16 dpavlin 317 Version 0.08
17 dpavlin 10
18     =cut
19    
20 dpavlin 317 our $VERSION = '0.08';
21 dpavlin 10
22     =head1 SYNOPSIS
23    
24 dpavlin 15 This package contains code that mungle data to produce normalized format.
25 dpavlin 10
26 dpavlin 15 It contains several assumptions:
27    
28     =over
29    
30     =item *
31    
32     format of fields is defined using C<v123^a> notation for repeatable fields
33     or C<s123^a> for single (or first) value, where C<123> is field number and
34     C<a> is subfield.
35    
36     =item *
37    
38     source data records (C<$rec>) have unique identifiers in field C<000>
39    
40     =item *
41    
42     optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
43     perl code that is evaluated before producing output (value of field will be
44     interpolated before that)
45    
46     =item *
47    
48     optional C<filter{filter_name}> at B<begining of format> will apply perl
49     code defined as code ref on format after field substitution to producing
50     output
51    
52 dpavlin 260 There is one built-in filter called C<regex> which can be use like this:
53    
54     filter{regex(s/foo/bar/)}
55    
56 dpavlin 15 =item *
57    
58     optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
59    
60     =item *
61    
62     at end, optional C<format>s rules are resolved. Format rules are similar to
63     C<sprintf> and can also contain C<lookup{...}> which is performed after
64     values are inserted in format.
65    
66     =back
67    
68     This also describes order in which transformations are applied (eval,
69     filter, lookup, format) which is important to undestand when deciding how to
70     solve your data mungling and normalisation process.
71    
72    
73    
74    
75 dpavlin 10 =head1 FUNCTIONS
76    
77 dpavlin 13 =head2 new
78 dpavlin 10
79 dpavlin 13 Create new normalisation object
80    
81     my $n = new WebPAC::Normalize::Something(
82 dpavlin 15 filter => {
83     'filter_name_1' => sub {
84     # filter code
85     return length($_);
86     }, ...
87     },
88 dpavlin 29 db => $db_obj,
89 dpavlin 13 lookup_regex => $lookup->regex,
90 dpavlin 31 lookup => $lookup_obj,
91 dpavlin 219 prefix => 'foobar',
92 dpavlin 13 );
93    
94 dpavlin 15 Parametar C<filter> defines user supplied snippets of perl code which can
95     be use with C<filter{...}> notation.
96    
97 dpavlin 219 C<prefix> is used to form filename for database record (to support multiple
98     source files which are joined in one database).
99    
100 dpavlin 13 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
101 dpavlin 31 in structures. If you pass this parametar, you must also pass C<lookup>
102     which is C<WebPAC::Lookup> object.
103 dpavlin 13
104 dpavlin 10 =cut
105    
106 dpavlin 13 sub new {
107     my $class = shift;
108     my $self = {@_};
109     bless($self, $class);
110    
111 dpavlin 31 my $r = $self->{'lookup_regex'} ? 1 : 0;
112     my $l = $self->{'lookup'} ? 1 : 0;
113    
114     my $log = $self->_get_logger();
115    
116     # those two must be in pair
117     if ( ($r & $l) != ($r || $l) ) {
118     my $log = $self->_get_logger();
119     $log->logdie("lookup_regex and lookup must be in pair");
120     }
121    
122     $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
123    
124 dpavlin 219 $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
125    
126 dpavlin 252 $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
127    
128 dpavlin 295 if (! $self->{filter} || ! $self->{filter}->{regex}) {
129 dpavlin 260 $log->debug("adding built-in filter regex");
130     $self->{filter}->{regex} = sub {
131     my ($val, $regex) = @_;
132     eval "\$val =~ $regex";
133     return $val;
134     };
135     }
136    
137 dpavlin 13 $self ? return $self : return undef;
138 dpavlin 10 }
139    
140 dpavlin 13
141     =head2 data_structure
142    
143     Create in-memory data structure which represents normalized layout from
144     C<conf/normalize/*.xml>.
145    
146     This structures are used to produce output.
147    
148 dpavlin 70 my $ds = $webpac->data_structure($rec);
149 dpavlin 13
150     =cut
151    
152     sub data_structure {
153     my $self = shift;
154    
155     my $log = $self->_get_logger();
156    
157     my $rec = shift;
158     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
159    
160 dpavlin 125 $log->debug("data_structure rec = ", sub { Dumper($rec) });
161    
162 dpavlin 312 $log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));
163 dpavlin 125
164 dpavlin 219 my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
165 dpavlin 125
166 dpavlin 13 my $cache_file;
167    
168 dpavlin 18 if ($self->{'db'}) {
169 dpavlin 219 my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
170 dpavlin 70 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
171     return $ds if ($ds);
172 dpavlin 29 $log->debug("cache miss, creating");
173 dpavlin 13 }
174    
175     my @sorted_tags;
176     if ($self->{tags_by_order}) {
177     @sorted_tags = @{$self->{tags_by_order}};
178     } else {
179     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
180     $self->{tags_by_order} = \@sorted_tags;
181     }
182    
183 dpavlin 70 my $ds;
184 dpavlin 13
185     $log->debug("tags: ",sub { join(", ",@sorted_tags) });
186    
187     foreach my $field (@sorted_tags) {
188    
189     my $row;
190    
191     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
192    
193     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
194 dpavlin 38 my $format;
195 dpavlin 13
196 dpavlin 38 $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
197     $format = $tag->{'value'} || $tag->{'content'};
198    
199 dpavlin 13 my @v;
200     if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
201     @v = $self->fill_in_to_arr($rec,$format);
202     } else {
203     @v = $self->parse_to_arr($rec,$format);
204     }
205 dpavlin 364 if (! @v) {
206     $log->debug("$field <",$self->{tag},"> format: $format no values");
207     # next;
208     } else {
209     $log->debug("$field <",$self->{tag},"> format: $format values: ", join(",", @v));
210     }
211 dpavlin 13
212     if ($tag->{'sort'}) {
213     @v = $self->sort_arr(@v);
214     }
215    
216     # use format?
217     if ($tag->{'format_name'}) {
218     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
219     }
220    
221     # delimiter will join repeatable fields
222     if ($tag->{'delimiter'}) {
223     @v = ( join($tag->{'delimiter'}, @v) );
224     }
225    
226     # default types
227 dpavlin 74 my @types = qw(display search);
228 dpavlin 13 # override by type attribute
229     @types = ( $tag->{'type'} ) if ($tag->{'type'});
230    
231     foreach my $type (@types) {
232     # append to previous line?
233 dpavlin 364 $log->debug("tag $field / $type [",sub { join(",",@v) }, "] ", $row->{'append'} || 'no append');
234 dpavlin 13 if ($tag->{'append'}) {
235    
236     # I will delimit appended part with
237     # delimiter (or ,)
238     my $d = $tag->{'delimiter'};
239     # default delimiter
240     $d ||= " ";
241    
242     my $last = pop @{$row->{$type}};
243     $d = "" if (! $last);
244     $last .= $d . join($d, @v);
245     push @{$row->{$type}}, $last;
246    
247     } else {
248     push @{$row->{$type}}, @v;
249     }
250     }
251    
252    
253     }
254    
255     if ($row) {
256     $row->{'tag'} = $field;
257    
258     # TODO: name_sigular, name_plural
259     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
260 dpavlin 70 my $row_name = $name ? $self->_x($name) : $field;
261 dpavlin 13
262     # post-sort all values in field
263     if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
264     $log->warn("sort at field tag not implemented");
265     }
266    
267 dpavlin 70 $ds->{$row_name} = $row;
268 dpavlin 13
269     $log->debug("row $field: ",sub { Dumper($row) });
270     }
271    
272     }
273    
274 dpavlin 22 $self->{'db'}->save_ds(
275 dpavlin 219 id => $id,
276 dpavlin 70 ds => $ds,
277 dpavlin 219 prefix => $self->{prefix},
278 dpavlin 18 ) if ($self->{'db'});
279 dpavlin 13
280 dpavlin 70 $log->debug("ds: ", sub { Dumper($ds) });
281 dpavlin 29
282 dpavlin 70 $log->logconfess("data structure returned is not array any more!") if wantarray;
283 dpavlin 13
284 dpavlin 70 return $ds;
285    
286 dpavlin 13 }
287    
288     =head2 parse
289    
290     Perform smart parsing of string, skipping delimiters for fields which aren't
291     defined. It can also eval code in format starting with C<eval{...}> and
292     return output or nothing depending on eval code.
293    
294     my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
295    
296 dpavlin 260 Filters are implemented here. While simple form of filters looks like this:
297    
298     filter{name_of_filter}
299    
300     but, filters can also have variable number of parametars like this:
301    
302     filter{name_of_filter(param,param,param)}
303    
304 dpavlin 13 =cut
305    
306 dpavlin 261 my $warn_once;
307    
308 dpavlin 13 sub parse {
309     my $self = shift;
310    
311 dpavlin 371 my ($rec, $format_utf8, $i, $rec_size) = @_;
312 dpavlin 13
313     return if (! $format_utf8);
314    
315     my $log = $self->_get_logger();
316    
317     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
318    
319     $i = 0 if (! $i);
320    
321     my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
322    
323     my @out;
324    
325 dpavlin 340 $log->debug("format: $format [$i]");
326 dpavlin 13
327     my $eval_code;
328     # remove eval{...} from beginning
329     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
330    
331     my $filter_name;
332     # remove filter{...} from beginning
333     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
334    
335 dpavlin 317 # did we found any (att all) field from format in row?
336 dpavlin 340 my $found_any;
337 dpavlin 317 # prefix before first field which we preserve it $found_any
338 dpavlin 13 my $prefix;
339    
340 dpavlin 317 my $f_step = 1;
341    
342 dpavlin 13 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
343    
344     my $del = $1 || '';
345 dpavlin 317 $prefix = $del if ($f_step == 1);
346 dpavlin 13
347 dpavlin 340 my $fld_type = lc($2);
348    
349 dpavlin 13 # repeatable index
350     my $r = $i;
351 dpavlin 340 if ($fld_type eq 's') {
352     if ($found_any->{'v'}) {
353     $r = 0;
354     } else {
355     return;
356     }
357     }
358 dpavlin 13
359     my $found = 0;
360 dpavlin 371 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found,$rec_size);
361 dpavlin 13
362     if ($found) {
363 dpavlin 340 $found_any->{$fld_type} += $found;
364 dpavlin 317
365     # we will skip delimiter before first occurence of field!
366 dpavlin 344 push @out, $del unless($found_any->{$fld_type} == 1);
367 dpavlin 13 push @out, $tmp;
368     }
369 dpavlin 317 $f_step++;
370 dpavlin 13 }
371    
372 dpavlin 340 # test if any fields found?
373     return if (! $found_any->{'v'} && ! $found_any->{'s'});
374 dpavlin 13
375     my $out = join('',@out);
376    
377     if ($out) {
378     # add rest of format (suffix)
379     $out .= $format;
380    
381     # add prefix if not there
382     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
383    
384     $log->debug("result: $out");
385     }
386    
387     if ($eval_code) {
388     my $eval = $self->fill_in($rec,$eval_code,$i) || return;
389     $log->debug("about to eval{$eval} format: $out");
390     return if (! $self->_eval($eval));
391     }
392    
393 dpavlin 260 if ($filter_name) {
394     my @filter_args;
395     if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
396     @filter_args = split(/,/, $2);
397     }
398     if ($self->{'filter'}->{$filter_name}) {
399     $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
400     unshift @filter_args, $out;
401     $out = $self->{'filter'}->{$filter_name}->(@filter_args);
402     return unless(defined($out));
403     $log->debug("filter result: $out");
404 dpavlin 261 } elsif (! $warn_once->{$filter_name}) {
405 dpavlin 260 $log->warn("trying to use undefined filter $filter_name");
406 dpavlin 261 $warn_once->{$filter_name}++;
407 dpavlin 260 }
408 dpavlin 13 }
409    
410     return $out;
411     }
412    
413     =head2 parse_to_arr
414    
415     Similar to C<parse>, but returns array of all repeatable fields
416    
417     my @arr = $webpac->parse_to_arr($rec,'v250^a');
418    
419     =cut
420    
421     sub parse_to_arr {
422     my $self = shift;
423    
424     my ($rec, $format_utf8) = @_;
425    
426     my $log = $self->_get_logger();
427    
428     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
429     return if (! $format_utf8);
430    
431     my $i = 0;
432     my @arr;
433    
434 dpavlin 371 my $rec_size = { '_' => '_' };
435    
436     while (my $v = $self->parse($rec,$format_utf8,$i++,\$rec_size)) {
437 dpavlin 13 push @arr, $v;
438 dpavlin 371 warn "parse rec_size = ", Dumper($rec_size);
439 dpavlin 13 }
440    
441     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
442    
443     return @arr;
444     }
445    
446 dpavlin 15
447     =head2 fill_in
448    
449     Workhourse of all: takes record from in-memory structure of database and
450     strings with placeholders and returns string or array of with substituted
451     values from record.
452    
453     my $text = $webpac->fill_in($rec,'v250^a');
454    
455     Optional argument is ordinal number for repeatable fields. By default,
456     it's assume to be first repeatable field (fields are perl array, so first
457     element is 0).
458     Following example will read second value from repeatable field.
459    
460     my $text = $webpac->fill_in($rec,'Title: v250^a',1);
461    
462     This function B<does not> perform parsing of format to inteligenty skip
463     delimiters before fields which aren't used.
464    
465     This method will automatically decode UTF-8 string to local code page
466     if needed.
467    
468 dpavlin 371 There is optional parametar C<$record_size> which can be used to get sizes of
469     all C<field^subfield> combinations in this format.
470    
471     my $text = $webpac->fill_in($rec,'got: v900^a v900^x',0,\$rec_size);
472    
473 dpavlin 15 =cut
474    
475     sub fill_in {
476     my $self = shift;
477    
478     my $log = $self->_get_logger();
479    
480 dpavlin 371 my ($rec,$format,$i,$rec_size) = @_;
481    
482     $log->logconfess("need data record") unless ($rec);
483     $log->logconfess("need format to parse") unless($format);
484    
485 dpavlin 15 # iteration (for repeatable fields)
486 dpavlin 371 $i ||= 0;
487 dpavlin 15
488     $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
489    
490     # FIXME remove for speedup?
491     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
492    
493     if (utf8::is_utf8($format)) {
494     $format = $self->_x($format);
495     }
496    
497     my $found = 0;
498 dpavlin 333 my $just_single = 1;
499 dpavlin 15
500     my $eval_code;
501     # remove eval{...} from beginning
502     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
503    
504     my $filter_name;
505     # remove filter{...} from beginning
506     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
507    
508     # do actual replacement of placeholders
509     # repeatable fields
510 dpavlin 371 if ($format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found,$rec_size)/ges) {
511 dpavlin 333 $just_single = 0;
512     }
513 dpavlin 364
514 dpavlin 15 # non-repeatable fields
515 dpavlin 371 if ($format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found,$rec_size)/ges) {
516 dpavlin 333 return if ($i > 0 && $just_single);
517     }
518 dpavlin 15
519     if ($found) {
520     $log->debug("format: $format");
521     if ($eval_code) {
522     my $eval = $self->fill_in($rec,$eval_code,$i);
523     return if (! $self->_eval($eval));
524     }
525     if ($filter_name && $self->{'filter'}->{$filter_name}) {
526     $log->debug("filter '$filter_name' for $format");
527     $format = $self->{'filter'}->{$filter_name}->($format);
528     return unless(defined($format));
529     $log->debug("filter result: $format");
530     }
531     # do we have lookups?
532     if ($self->{'lookup'}) {
533 dpavlin 31 if ($self->{'lookup'}->can('lookup')) {
534 dpavlin 252 my @lookup = $self->{lookup}->lookup($format);
535 dpavlin 253 $log->debug("lookup $format", join(", ", @lookup));
536 dpavlin 252 return @lookup;
537 dpavlin 31 } else {
538     $log->warn("Have lookup object but can't invoke lookup method");
539     }
540 dpavlin 15 } else {
541     return $format;
542     }
543     } else {
544     return;
545     }
546     }
547    
548    
549 dpavlin 13 =head2 fill_in_to_arr
550    
551     Similar to C<fill_in>, but returns array of all repeatable fields. Usable
552     for fields which have lookups, so they shouldn't be parsed but rather
553     C<fill_id>ed.
554    
555     my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
556    
557     =cut
558    
559     sub fill_in_to_arr {
560     my $self = shift;
561    
562     my ($rec, $format_utf8) = @_;
563    
564     my $log = $self->_get_logger();
565    
566     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
567     return if (! $format_utf8);
568    
569     my $i = 0;
570     my @arr;
571    
572 dpavlin 371 my $rec_size;
573    
574     while (my $v = $self->fill_in($rec,$format_utf8,$i,\$rec_size)) {
575 dpavlin 364 push @arr, $v;
576 dpavlin 371 warn "rec_size = ", Dumper($rec_size);
577 dpavlin 13 }
578    
579     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
580    
581     return @arr;
582     }
583    
584 dpavlin 15
585     =head2 get_data
586    
587     Returns value from record.
588    
589 dpavlin 371 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found,\$rec_size);
590 dpavlin 15
591 dpavlin 368 Required arguments are:
592 dpavlin 15
593 dpavlin 368 =over 8
594 dpavlin 15
595 dpavlin 368 =item C<$rec>
596 dpavlin 15
597 dpavlin 368 record reference
598    
599     =item C<$f>
600    
601     field
602    
603     =item C<$sf>
604    
605     optional subfield
606    
607     =item C<$i>
608    
609 dpavlin 371 index offset for repeatable values ( 0 ... $rec_size->{'400^a'} )
610 dpavlin 368
611     =item C<$found>
612    
613     optional variable that will be incremeted if preset
614    
615 dpavlin 371 =item C<$rec_size>
616 dpavlin 368
617 dpavlin 371 hash to hold maximum occurances of C<field^subfield> combinations
618 dpavlin 368 (which can be accessed using keys in same format)
619    
620     =back
621    
622 dpavlin 371 Returns value or empty string, updates C<$found> and C<rec_size>
623 dpavlin 368 if present.
624    
625 dpavlin 15 =cut
626    
627     sub get_data {
628     my $self = shift;
629    
630 dpavlin 368 my ($rec,$f,$sf,$i,$found,$cache) = @_;
631 dpavlin 15
632 dpavlin 368 return '' unless ($$rec->{$f} && ref($$rec->{$f}) eq 'ARRAY');
633    
634     if (defined($$cache)) {
635 dpavlin 371 $$cache->{ $f . ( $sf ? '^' . $sf : '' ) } ||= scalar @{ $$rec->{$f} };
636 dpavlin 368 }
637    
638     return '' unless ($$rec->{$f}->[$i]);
639    
640     {
641 dpavlin 15 no strict 'refs';
642 dpavlin 368 if (defined($sf)) {
643     $$found++ if (defined($$found) && $$rec->{$f}->[$i]->{$sf});
644 dpavlin 15 return $$rec->{$f}->[$i]->{$sf};
645 dpavlin 368 } else {
646 dpavlin 15 $$found++ if (defined($$found));
647 dpavlin 368 # it still might have subfields, just
648     # not specified, so we'll dump some debug info
649 dpavlin 15 if ($$rec->{$f}->[$i] =~ /HASH/o) {
650     my $out;
651     foreach my $k (keys %{$$rec->{$f}->[$i]}) {
652 dpavlin 368 $out .= '$' . $k .':' . $$rec->{$f}->[$i]->{$k}." ";
653 dpavlin 15 }
654     return $out;
655     } else {
656     return $$rec->{$f}->[$i];
657     }
658     }
659     }
660     }
661    
662    
663     =head2 apply_format
664    
665     Apply format specified in tag with C<format_name="name"> and
666     C<format_delimiter=";;">.
667    
668     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
669    
670     Formats can contain C<lookup{...}> if you need them.
671    
672     =cut
673    
674     sub apply_format {
675     my $self = shift;
676    
677     my ($name,$delimiter,$data) = @_;
678    
679     my $log = $self->_get_logger();
680    
681     if (! $self->{'import_xml'}->{'format'}->{$name}) {
682     $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
683     return $data;
684     }
685    
686     $log->warn("no delimiter for format $name") if (! $delimiter);
687    
688     my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
689    
690     my @data = split(/\Q$delimiter\E/, $data);
691    
692     my $out = sprintf($format, @data);
693     $log->debug("using format $name [$format] on $data to produce: $out");
694    
695     if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
696 dpavlin 31 return $self->{'lookup'}->lookup($out);
697 dpavlin 15 } else {
698     return $out;
699     }
700    
701     }
702    
703 dpavlin 13 =head2 sort_arr
704    
705     Sort array ignoring case and html in data
706    
707     my @sorted = $webpac->sort_arr(@unsorted);
708    
709     =cut
710    
711     sub sort_arr {
712     my $self = shift;
713    
714     my $log = $self->_get_logger();
715    
716     # FIXME add Schwartzian Transformation?
717    
718     my @sorted = sort {
719     $a =~ s#<[^>]+/*>##;
720     $b =~ s#<[^>]+/*>##;
721     lc($b) cmp lc($a)
722     } @_;
723     $log->debug("sorted values: ",sub { join(", ",@sorted) });
724    
725     return @sorted;
726     }
727    
728    
729 dpavlin 15 =head1 INTERNAL METHODS
730    
731 dpavlin 13 =head2 _sort_by_order
732    
733     Sort xml tags data structure accoding to C<order=""> attribute.
734    
735     =cut
736    
737     sub _sort_by_order {
738     my $self = shift;
739    
740     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
741     $self->{'import_xml'}->{'indexer'}->{$a};
742     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
743     $self->{'import_xml'}->{'indexer'}->{$b};
744    
745     return $va <=> $vb;
746     }
747    
748     =head2 _x
749    
750 dpavlin 15 Convert strings from C<conf/normalize/*.xml> encoding into application
751     specific encoding (optinally specified using C<code_page> to C<new>
752     constructor).
753 dpavlin 13
754     my $text = $n->_x('normalize text string');
755    
756     This is a stub so that other modules doesn't have to implement it.
757    
758     =cut
759    
760     sub _x {
761     my $self = shift;
762     return shift;
763     }
764    
765    
766 dpavlin 10 =head1 AUTHOR
767    
768     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
769    
770     =head1 COPYRIGHT & LICENSE
771    
772     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
773    
774     This program is free software; you can redistribute it and/or modify it
775     under the same terms as Perl itself.
776    
777     =cut
778    
779 dpavlin 209 1; # End of WebPAC::Normalize

  ViewVC Help
Powered by ViewVC 1.1.26