/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 368 - (hide annotations)
Sun Jan 8 20:32:06 2006 UTC (17 years, 1 month ago) by dpavlin
File size: 17529 byte(s)
 r403@llin:  dpavlin | 2006-01-08 21:31:43 +0100
 refactor and better document get_data

1 dpavlin 10 package WebPAC::Normalize;
2    
3     use warnings;
4     use strict;
5 dpavlin 368 use blib;
6     use WebPAC::Common;
7 dpavlin 29 use base 'WebPAC::Common';
8 dpavlin 13 use Data::Dumper;
9 dpavlin 10
10     =head1 NAME
11    
12 dpavlin 15 WebPAC::Normalize - data mungling for normalisation
13 dpavlin 10
14     =head1 VERSION
15    
16 dpavlin 317 Version 0.08
17 dpavlin 10
18     =cut
19    
20 dpavlin 317 our $VERSION = '0.08';
21 dpavlin 10
22     =head1 SYNOPSIS
23    
24 dpavlin 15 This package contains code that mungle data to produce normalized format.
25 dpavlin 10
26 dpavlin 15 It contains several assumptions:
27    
28     =over
29    
30     =item *
31    
32     format of fields is defined using C<v123^a> notation for repeatable fields
33     or C<s123^a> for single (or first) value, where C<123> is field number and
34     C<a> is subfield.
35    
36     =item *
37    
38     source data records (C<$rec>) have unique identifiers in field C<000>
39    
40     =item *
41    
42     optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
43     perl code that is evaluated before producing output (value of field will be
44     interpolated before that)
45    
46     =item *
47    
48     optional C<filter{filter_name}> at B<begining of format> will apply perl
49     code defined as code ref on format after field substitution to producing
50     output
51    
52 dpavlin 260 There is one built-in filter called C<regex> which can be use like this:
53    
54     filter{regex(s/foo/bar/)}
55    
56 dpavlin 15 =item *
57    
58     optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
59    
60     =item *
61    
62     at end, optional C<format>s rules are resolved. Format rules are similar to
63     C<sprintf> and can also contain C<lookup{...}> which is performed after
64     values are inserted in format.
65    
66     =back
67    
68     This also describes order in which transformations are applied (eval,
69     filter, lookup, format) which is important to undestand when deciding how to
70     solve your data mungling and normalisation process.
71    
72    
73    
74    
75 dpavlin 10 =head1 FUNCTIONS
76    
77 dpavlin 13 =head2 new
78 dpavlin 10
79 dpavlin 13 Create new normalisation object
80    
81     my $n = new WebPAC::Normalize::Something(
82 dpavlin 15 filter => {
83     'filter_name_1' => sub {
84     # filter code
85     return length($_);
86     }, ...
87     },
88 dpavlin 29 db => $db_obj,
89 dpavlin 13 lookup_regex => $lookup->regex,
90 dpavlin 31 lookup => $lookup_obj,
91 dpavlin 219 prefix => 'foobar',
92 dpavlin 13 );
93    
94 dpavlin 15 Parametar C<filter> defines user supplied snippets of perl code which can
95     be use with C<filter{...}> notation.
96    
97 dpavlin 219 C<prefix> is used to form filename for database record (to support multiple
98     source files which are joined in one database).
99    
100 dpavlin 13 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
101 dpavlin 31 in structures. If you pass this parametar, you must also pass C<lookup>
102     which is C<WebPAC::Lookup> object.
103 dpavlin 13
104 dpavlin 10 =cut
105    
106 dpavlin 13 sub new {
107     my $class = shift;
108     my $self = {@_};
109     bless($self, $class);
110    
111 dpavlin 31 my $r = $self->{'lookup_regex'} ? 1 : 0;
112     my $l = $self->{'lookup'} ? 1 : 0;
113    
114     my $log = $self->_get_logger();
115    
116     # those two must be in pair
117     if ( ($r & $l) != ($r || $l) ) {
118     my $log = $self->_get_logger();
119     $log->logdie("lookup_regex and lookup must be in pair");
120     }
121    
122     $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
123    
124 dpavlin 219 $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
125    
126 dpavlin 252 $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
127    
128 dpavlin 295 if (! $self->{filter} || ! $self->{filter}->{regex}) {
129 dpavlin 260 $log->debug("adding built-in filter regex");
130     $self->{filter}->{regex} = sub {
131     my ($val, $regex) = @_;
132     eval "\$val =~ $regex";
133     return $val;
134     };
135     }
136    
137 dpavlin 13 $self ? return $self : return undef;
138 dpavlin 10 }
139    
140 dpavlin 13
141     =head2 data_structure
142    
143     Create in-memory data structure which represents normalized layout from
144     C<conf/normalize/*.xml>.
145    
146     This structures are used to produce output.
147    
148 dpavlin 70 my $ds = $webpac->data_structure($rec);
149 dpavlin 13
150     =cut
151    
152     sub data_structure {
153     my $self = shift;
154    
155     my $log = $self->_get_logger();
156    
157     my $rec = shift;
158     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
159    
160 dpavlin 125 $log->debug("data_structure rec = ", sub { Dumper($rec) });
161    
162 dpavlin 312 $log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));
163 dpavlin 125
164 dpavlin 219 my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
165 dpavlin 125
166 dpavlin 13 my $cache_file;
167    
168 dpavlin 18 if ($self->{'db'}) {
169 dpavlin 219 my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
170 dpavlin 70 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
171     return $ds if ($ds);
172 dpavlin 29 $log->debug("cache miss, creating");
173 dpavlin 13 }
174    
175     my @sorted_tags;
176     if ($self->{tags_by_order}) {
177     @sorted_tags = @{$self->{tags_by_order}};
178     } else {
179     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
180     $self->{tags_by_order} = \@sorted_tags;
181     }
182    
183 dpavlin 70 my $ds;
184 dpavlin 13
185     $log->debug("tags: ",sub { join(", ",@sorted_tags) });
186    
187     foreach my $field (@sorted_tags) {
188    
189     my $row;
190    
191     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
192    
193     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
194 dpavlin 38 my $format;
195 dpavlin 13
196 dpavlin 38 $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
197     $format = $tag->{'value'} || $tag->{'content'};
198    
199 dpavlin 13 my @v;
200     if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
201     @v = $self->fill_in_to_arr($rec,$format);
202     } else {
203     @v = $self->parse_to_arr($rec,$format);
204     }
205 dpavlin 364 if (! @v) {
206     $log->debug("$field <",$self->{tag},"> format: $format no values");
207     # next;
208     } else {
209     $log->debug("$field <",$self->{tag},"> format: $format values: ", join(",", @v));
210     }
211 dpavlin 13
212     if ($tag->{'sort'}) {
213     @v = $self->sort_arr(@v);
214     }
215    
216     # use format?
217     if ($tag->{'format_name'}) {
218     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
219     }
220    
221     # delimiter will join repeatable fields
222     if ($tag->{'delimiter'}) {
223     @v = ( join($tag->{'delimiter'}, @v) );
224     }
225    
226     # default types
227 dpavlin 74 my @types = qw(display search);
228 dpavlin 13 # override by type attribute
229     @types = ( $tag->{'type'} ) if ($tag->{'type'});
230    
231     foreach my $type (@types) {
232     # append to previous line?
233 dpavlin 364 $log->debug("tag $field / $type [",sub { join(",",@v) }, "] ", $row->{'append'} || 'no append');
234 dpavlin 13 if ($tag->{'append'}) {
235    
236     # I will delimit appended part with
237     # delimiter (or ,)
238     my $d = $tag->{'delimiter'};
239     # default delimiter
240     $d ||= " ";
241    
242     my $last = pop @{$row->{$type}};
243     $d = "" if (! $last);
244     $last .= $d . join($d, @v);
245     push @{$row->{$type}}, $last;
246    
247     } else {
248     push @{$row->{$type}}, @v;
249     }
250     }
251    
252    
253     }
254    
255     if ($row) {
256     $row->{'tag'} = $field;
257    
258     # TODO: name_sigular, name_plural
259     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
260 dpavlin 70 my $row_name = $name ? $self->_x($name) : $field;
261 dpavlin 13
262     # post-sort all values in field
263     if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
264     $log->warn("sort at field tag not implemented");
265     }
266    
267 dpavlin 70 $ds->{$row_name} = $row;
268 dpavlin 13
269     $log->debug("row $field: ",sub { Dumper($row) });
270     }
271    
272     }
273    
274 dpavlin 22 $self->{'db'}->save_ds(
275 dpavlin 219 id => $id,
276 dpavlin 70 ds => $ds,
277 dpavlin 219 prefix => $self->{prefix},
278 dpavlin 18 ) if ($self->{'db'});
279 dpavlin 13
280 dpavlin 70 $log->debug("ds: ", sub { Dumper($ds) });
281 dpavlin 29
282 dpavlin 70 $log->logconfess("data structure returned is not array any more!") if wantarray;
283 dpavlin 13
284 dpavlin 70 return $ds;
285    
286 dpavlin 13 }
287    
288     =head2 parse
289    
290     Perform smart parsing of string, skipping delimiters for fields which aren't
291     defined. It can also eval code in format starting with C<eval{...}> and
292     return output or nothing depending on eval code.
293    
294     my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
295    
296 dpavlin 260 Filters are implemented here. While simple form of filters looks like this:
297    
298     filter{name_of_filter}
299    
300     but, filters can also have variable number of parametars like this:
301    
302     filter{name_of_filter(param,param,param)}
303    
304 dpavlin 13 =cut
305    
306 dpavlin 261 my $warn_once;
307    
308 dpavlin 13 sub parse {
309     my $self = shift;
310    
311     my ($rec, $format_utf8, $i) = @_;
312    
313     return if (! $format_utf8);
314    
315     my $log = $self->_get_logger();
316    
317     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
318    
319     $i = 0 if (! $i);
320    
321     my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
322    
323     my @out;
324    
325 dpavlin 340 $log->debug("format: $format [$i]");
326 dpavlin 13
327     my $eval_code;
328     # remove eval{...} from beginning
329     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
330    
331     my $filter_name;
332     # remove filter{...} from beginning
333     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
334    
335 dpavlin 317 # did we found any (att all) field from format in row?
336 dpavlin 340 my $found_any;
337 dpavlin 317 # prefix before first field which we preserve it $found_any
338 dpavlin 13 my $prefix;
339    
340 dpavlin 317 my $f_step = 1;
341    
342 dpavlin 13 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
343    
344     my $del = $1 || '';
345 dpavlin 317 $prefix = $del if ($f_step == 1);
346 dpavlin 13
347 dpavlin 340 my $fld_type = lc($2);
348    
349 dpavlin 13 # repeatable index
350     my $r = $i;
351 dpavlin 340 if ($fld_type eq 's') {
352     if ($found_any->{'v'}) {
353     $r = 0;
354     } else {
355     return;
356     }
357     }
358 dpavlin 13
359     my $found = 0;
360     my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
361    
362     if ($found) {
363 dpavlin 340 $found_any->{$fld_type} += $found;
364 dpavlin 317
365     # we will skip delimiter before first occurence of field!
366 dpavlin 344 push @out, $del unless($found_any->{$fld_type} == 1);
367 dpavlin 13 push @out, $tmp;
368     }
369 dpavlin 317 $f_step++;
370 dpavlin 13 }
371    
372 dpavlin 340 # test if any fields found?
373     return if (! $found_any->{'v'} && ! $found_any->{'s'});
374 dpavlin 13
375     my $out = join('',@out);
376    
377     if ($out) {
378     # add rest of format (suffix)
379     $out .= $format;
380    
381     # add prefix if not there
382     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
383    
384     $log->debug("result: $out");
385     }
386    
387     if ($eval_code) {
388     my $eval = $self->fill_in($rec,$eval_code,$i) || return;
389     $log->debug("about to eval{$eval} format: $out");
390     return if (! $self->_eval($eval));
391     }
392    
393 dpavlin 260 if ($filter_name) {
394     my @filter_args;
395     if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
396     @filter_args = split(/,/, $2);
397     }
398     if ($self->{'filter'}->{$filter_name}) {
399     $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
400     unshift @filter_args, $out;
401     $out = $self->{'filter'}->{$filter_name}->(@filter_args);
402     return unless(defined($out));
403     $log->debug("filter result: $out");
404 dpavlin 261 } elsif (! $warn_once->{$filter_name}) {
405 dpavlin 260 $log->warn("trying to use undefined filter $filter_name");
406 dpavlin 261 $warn_once->{$filter_name}++;
407 dpavlin 260 }
408 dpavlin 13 }
409    
410     return $out;
411     }
412    
413     =head2 parse_to_arr
414    
415     Similar to C<parse>, but returns array of all repeatable fields
416    
417     my @arr = $webpac->parse_to_arr($rec,'v250^a');
418    
419     =cut
420    
421     sub parse_to_arr {
422     my $self = shift;
423    
424     my ($rec, $format_utf8) = @_;
425    
426     my $log = $self->_get_logger();
427    
428     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
429     return if (! $format_utf8);
430    
431     my $i = 0;
432     my @arr;
433    
434     while (my $v = $self->parse($rec,$format_utf8,$i++)) {
435     push @arr, $v;
436     }
437    
438     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
439    
440     return @arr;
441     }
442    
443 dpavlin 15
444     =head2 fill_in
445    
446     Workhourse of all: takes record from in-memory structure of database and
447     strings with placeholders and returns string or array of with substituted
448     values from record.
449    
450     my $text = $webpac->fill_in($rec,'v250^a');
451    
452     Optional argument is ordinal number for repeatable fields. By default,
453     it's assume to be first repeatable field (fields are perl array, so first
454     element is 0).
455     Following example will read second value from repeatable field.
456    
457     my $text = $webpac->fill_in($rec,'Title: v250^a',1);
458    
459     This function B<does not> perform parsing of format to inteligenty skip
460     delimiters before fields which aren't used.
461    
462     This method will automatically decode UTF-8 string to local code page
463     if needed.
464    
465     =cut
466    
467     sub fill_in {
468     my $self = shift;
469    
470     my $log = $self->_get_logger();
471    
472     my $rec = shift || $log->logconfess("need data record");
473     my $format = shift || $log->logconfess("need format to parse");
474     # iteration (for repeatable fields)
475     my $i = shift || 0;
476    
477     $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
478    
479     # FIXME remove for speedup?
480     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
481    
482     if (utf8::is_utf8($format)) {
483     $format = $self->_x($format);
484     }
485    
486     my $found = 0;
487 dpavlin 333 my $just_single = 1;
488 dpavlin 15
489     my $eval_code;
490     # remove eval{...} from beginning
491     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
492    
493     my $filter_name;
494     # remove filter{...} from beginning
495     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
496    
497     # do actual replacement of placeholders
498     # repeatable fields
499 dpavlin 333 if ($format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges) {
500     $just_single = 0;
501     }
502 dpavlin 364
503 dpavlin 15 # non-repeatable fields
504 dpavlin 333 if ($format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges) {
505     return if ($i > 0 && $just_single);
506     }
507 dpavlin 15
508     if ($found) {
509     $log->debug("format: $format");
510     if ($eval_code) {
511     my $eval = $self->fill_in($rec,$eval_code,$i);
512     return if (! $self->_eval($eval));
513     }
514     if ($filter_name && $self->{'filter'}->{$filter_name}) {
515     $log->debug("filter '$filter_name' for $format");
516     $format = $self->{'filter'}->{$filter_name}->($format);
517     return unless(defined($format));
518     $log->debug("filter result: $format");
519     }
520     # do we have lookups?
521     if ($self->{'lookup'}) {
522 dpavlin 31 if ($self->{'lookup'}->can('lookup')) {
523 dpavlin 252 my @lookup = $self->{lookup}->lookup($format);
524 dpavlin 253 $log->debug("lookup $format", join(", ", @lookup));
525 dpavlin 252 return @lookup;
526 dpavlin 31 } else {
527     $log->warn("Have lookup object but can't invoke lookup method");
528     }
529 dpavlin 15 } else {
530     return $format;
531     }
532     } else {
533     return;
534     }
535     }
536    
537    
538 dpavlin 13 =head2 fill_in_to_arr
539    
540     Similar to C<fill_in>, but returns array of all repeatable fields. Usable
541     for fields which have lookups, so they shouldn't be parsed but rather
542     C<fill_id>ed.
543    
544     my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
545    
546     =cut
547    
548     sub fill_in_to_arr {
549     my $self = shift;
550    
551     my ($rec, $format_utf8) = @_;
552    
553     my $log = $self->_get_logger();
554    
555     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
556     return if (! $format_utf8);
557    
558     my $i = 0;
559     my @arr;
560    
561 dpavlin 364 while (my $v = $self->fill_in($rec,$format_utf8,$i++)) {
562     push @arr, $v;
563 dpavlin 13 }
564    
565     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
566    
567     return @arr;
568     }
569    
570 dpavlin 15
571     =head2 get_data
572    
573     Returns value from record.
574    
575 dpavlin 368 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found,\$fld_occurances);
576 dpavlin 15
577 dpavlin 368 Required arguments are:
578 dpavlin 15
579 dpavlin 368 =over 8
580 dpavlin 15
581 dpavlin 368 =item C<$rec>
582 dpavlin 15
583 dpavlin 368 record reference
584    
585     =item C<$f>
586    
587     field
588    
589     =item C<$sf>
590    
591     optional subfield
592    
593     =item C<$i>
594    
595     index offset for repeatable values ( 0 ... $#occurances )
596    
597     =item C<$found>
598    
599     optional variable that will be incremeted if preset
600    
601     =item C<$fld_occurances>
602    
603     hash to hold maximum occurances of C<field\tsubfield> combinations
604     (which can be accessed using keys in same format)
605    
606     =back
607    
608     Returns value or empty string, updates C<$found> and C<fld_occurences>
609     if present.
610    
611 dpavlin 15 =cut
612    
613     sub get_data {
614     my $self = shift;
615    
616 dpavlin 368 my ($rec,$f,$sf,$i,$found,$cache) = @_;
617 dpavlin 15
618 dpavlin 368 return '' unless ($$rec->{$f} && ref($$rec->{$f}) eq 'ARRAY');
619    
620     if (defined($$cache)) {
621     $$cache->{"$f\t$sf"} ||= $$#rec->{$f};
622     }
623    
624     return '' unless ($$rec->{$f}->[$i]);
625    
626     {
627 dpavlin 15 no strict 'refs';
628 dpavlin 368 if (defined($sf)) {
629     $$found++ if (defined($$found) && $$rec->{$f}->[$i]->{$sf});
630 dpavlin 15 return $$rec->{$f}->[$i]->{$sf};
631 dpavlin 368 } else {
632 dpavlin 15 $$found++ if (defined($$found));
633 dpavlin 368 # it still might have subfields, just
634     # not specified, so we'll dump some debug info
635 dpavlin 15 if ($$rec->{$f}->[$i] =~ /HASH/o) {
636     my $out;
637     foreach my $k (keys %{$$rec->{$f}->[$i]}) {
638 dpavlin 368 $out .= '$' . $k .':' . $$rec->{$f}->[$i]->{$k}." ";
639 dpavlin 15 }
640     return $out;
641     } else {
642     return $$rec->{$f}->[$i];
643     }
644     }
645     }
646     }
647    
648    
649     =head2 apply_format
650    
651     Apply format specified in tag with C<format_name="name"> and
652     C<format_delimiter=";;">.
653    
654     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
655    
656     Formats can contain C<lookup{...}> if you need them.
657    
658     =cut
659    
660     sub apply_format {
661     my $self = shift;
662    
663     my ($name,$delimiter,$data) = @_;
664    
665     my $log = $self->_get_logger();
666    
667     if (! $self->{'import_xml'}->{'format'}->{$name}) {
668     $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
669     return $data;
670     }
671    
672     $log->warn("no delimiter for format $name") if (! $delimiter);
673    
674     my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
675    
676     my @data = split(/\Q$delimiter\E/, $data);
677    
678     my $out = sprintf($format, @data);
679     $log->debug("using format $name [$format] on $data to produce: $out");
680    
681     if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
682 dpavlin 31 return $self->{'lookup'}->lookup($out);
683 dpavlin 15 } else {
684     return $out;
685     }
686    
687     }
688    
689 dpavlin 13 =head2 sort_arr
690    
691     Sort array ignoring case and html in data
692    
693     my @sorted = $webpac->sort_arr(@unsorted);
694    
695     =cut
696    
697     sub sort_arr {
698     my $self = shift;
699    
700     my $log = $self->_get_logger();
701    
702     # FIXME add Schwartzian Transformation?
703    
704     my @sorted = sort {
705     $a =~ s#<[^>]+/*>##;
706     $b =~ s#<[^>]+/*>##;
707     lc($b) cmp lc($a)
708     } @_;
709     $log->debug("sorted values: ",sub { join(", ",@sorted) });
710    
711     return @sorted;
712     }
713    
714    
715 dpavlin 15 =head1 INTERNAL METHODS
716    
717 dpavlin 13 =head2 _sort_by_order
718    
719     Sort xml tags data structure accoding to C<order=""> attribute.
720    
721     =cut
722    
723     sub _sort_by_order {
724     my $self = shift;
725    
726     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
727     $self->{'import_xml'}->{'indexer'}->{$a};
728     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
729     $self->{'import_xml'}->{'indexer'}->{$b};
730    
731     return $va <=> $vb;
732     }
733    
734     =head2 _x
735    
736 dpavlin 15 Convert strings from C<conf/normalize/*.xml> encoding into application
737     specific encoding (optinally specified using C<code_page> to C<new>
738     constructor).
739 dpavlin 13
740     my $text = $n->_x('normalize text string');
741    
742     This is a stub so that other modules doesn't have to implement it.
743    
744     =cut
745    
746     sub _x {
747     my $self = shift;
748     return shift;
749     }
750    
751    
752 dpavlin 10 =head1 AUTHOR
753    
754     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
755    
756     =head1 COPYRIGHT & LICENSE
757    
758     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
759    
760     This program is free software; you can redistribute it and/or modify it
761     under the same terms as Perl itself.
762    
763     =cut
764    
765 dpavlin 209 1; # End of WebPAC::Normalize

  ViewVC Help
Powered by ViewVC 1.1.26