/[webpac2]/trunk/lib/WebPAC/Parser.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Parser.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 976 - (hide annotations)
Sat Nov 3 12:30:43 2007 UTC (16 years, 5 months ago) by dpavlin
File size: 11729 byte(s)
if --only filter is used, we will parse just normalization files for this database.

1 dpavlin 686 package WebPAC::Parser;
2    
3     use warnings;
4     use strict;
5    
6    
7     use PPI;
8     use PPI::Dumper;
9     use Data::Dump qw/dump/;
10     use File::Slurp;
11    
12 dpavlin 706 use base qw/WebPAC::Common/;
13 dpavlin 686
14     =head1 NAME
15    
16 dpavlin 737 WebPAC::Parser - parse perl normalization configuration files (rules) and mungle it
17 dpavlin 686
18     =head1 VERSION
19    
20 dpavlin 755 Version 0.08
21 dpavlin 686
22     =cut
23    
24 dpavlin 755 our $VERSION = '0.08';
25 dpavlin 686
26     =head1 SYNOPSIS
27    
28 dpavlin 698 This module will parse L<WebPAC::Normalize/lookup> directives and generate source
29 dpavlin 737 to produce lookups and normalization. It will also parse other parts of
30     source to produce some of DWIM (I<Do What I Mean>) magic
31     (like producing MARC oputput using L<WebPAC::Output::MARC> if there are C<marc_*>
32     rules in normalisation).
33 dpavlin 698
34     It's written using L<PPI>, pure-perl parser for perl and heavily influenced by
35     reading about LISP. It might be a bit over-the board, but at least it removed
36     separate configuration files for lookups.
37    
38     This is experimental code, but it replaces all older formats which where,
39     at one point in time, available in WebPAC.
40    
41 dpavlin 686 FIXME
42    
43     =head1 FUNCTIONS
44    
45     =head2 new
46    
47     Create new parser object.
48    
49     my $parser = new WebPAC::Parser(
50     config => new WebPAC::Config(),
51     base_path => '/optional/path/to/conf',
52 dpavlin 976 only_database => $only
53 dpavlin 686 );
54    
55     =cut
56    
57     sub new {
58     my $class = shift;
59     my $self = {@_};
60     bless($self, $class);
61    
62     my $log = $self->_get_logger();
63    
64     $log->logdie("config isn't WebPAC::Config") unless ($self->{config} && $self->{config}->isa('WebPAC::Config'));
65    
66     $log->logdie("can't iterate_inputs over this config object") unless ($self->{config}->can('iterate_inputs'));
67    
68 dpavlin 703 $self->_read_sources;
69 dpavlin 686
70 dpavlin 691 $self ? return $self : return undef;
71     }
72    
73 dpavlin 703 =head2 valid_database
74    
75     my $ok = $parse->valid_database('key');
76    
77     =cut
78    
79     sub valid_database {
80     my $self = shift;
81    
82     my $database = shift || return;
83    
84     return defined($self->{valid_inputs}->{ _q($database) });
85     }
86    
87     =head2 valid_database_input
88    
89     my $ok = $parse->valid_database('database_key','input_name');
90    
91     =cut
92    
93     sub valid_database_input {
94     my $self = shift;
95     my ($database,$input) = @_;
96 dpavlin 705 $input = _input_name($input);
97 dpavlin 703 return defined($self->{valid_inputs}->{ _q($database) }->{ _q($input) });
98     }
99    
100     =head2 depends
101    
102     Return all databases and inputs on which specified one depends
103    
104     $depends_on = $parser->depends('database','input');
105    
106     =cut
107    
108     sub depends {
109     my $self = shift;
110     my ($database,$input) = @_;
111 dpavlin 705 $input = _input_name($input);
112 dpavlin 703 $self->_get_logger->debug("depends($database,$input)");
113     return unless (
114     defined( $self->{depends}->{ _q($database) } ) &&
115     defined( $self->{depends}->{ _q($database) }->{ _q($input) } )
116     );
117     return $self->{depends}->{ _q($database) }->{ _q($input) };
118     }
119    
120 dpavlin 706 =head2 have_lookup_create
121    
122     my @keys = $parser->have_lookup_create($database, $input);
123    
124     =cut
125    
126     sub have_lookup_create {
127     my $self = shift;
128     my ($database,$input) = @_;
129     $input = _input_name($input);
130     return unless (
131     defined( $self->{_lookup_create_key}->{ _q($database) } ) &&
132     defined( $self->{_lookup_create_key}->{ _q($database) }->{ _q($input) } )
133     );
134     return keys %{ $self->{_lookup_create_key}->{ _q($database) }->{ _q($input) } };
135     }
136    
137    
138 dpavlin 705 =head2 lookup_create_rules
139    
140     my $source = $parser->lookup_create_rules($database, $input);
141    
142     =cut
143    
144     sub lookup_create_rules {
145     my $self = shift;
146     my ($database,$input) = @_;
147     $input = _input_name($input);
148     return unless (
149     defined( $self->{_lookup_create}->{ _q($database) } ) &&
150     defined( $self->{_lookup_create}->{ _q($database) }->{ _q($input) } )
151     );
152     return $self->{_lookup_create}->{ _q($database) }->{ _q($input) };
153     }
154    
155 dpavlin 717 =head2 normalize_rules
156    
157     my $source = $parser->normalize_rules($database, $input);
158    
159     =cut
160    
161     sub normalize_rules {
162     my $self = shift;
163     my ($database,$input) = @_;
164     $input = _input_name($input);
165     return unless (
166     defined( $self->{_normalize_source}->{ _q($database) } ) &&
167     defined( $self->{_normalize_source}->{ _q($database) }->{ _q($input) } )
168     );
169     return $self->{_normalize_source}->{ _q($database) }->{ _q($input) };
170     }
171    
172 dpavlin 737
173 dpavlin 755 =head2 have_rules
174 dpavlin 737
175 dpavlin 755 my $do_marc = $parser->have_rules('marc', $database, $input);
176     my $do_index = $parser->have_rules('search', $database);
177 dpavlin 737
178 dpavlin 755 This function will return hash containing count of all found C<marc_*> or
179     C<search> directives. Input name is optional.
180 dpavlin 737
181     =cut
182    
183 dpavlin 755 sub have_rules {
184 dpavlin 737 my $self = shift;
185 dpavlin 755
186     my $log = $self->_get_logger();
187     my $type = shift @_ || $log->logconfess("need at least type");
188     my $database = shift @_ || $log->logconfess("database is required");
189     my $input = shift @_;
190    
191 dpavlin 737 $input = _input_name($input);
192 dpavlin 755
193    
194     return unless defined( $self->{_have_rules}->{ _q($database) } );
195    
196     my $database_rules = $self->{_have_rules}->{ _q($database ) };
197    
198     if (defined($input)) {
199    
200     return unless (
201     defined( $database_rules->{ _q($input) } ) &&
202     defined( $database_rules->{ _q($input) }->{ $type } )
203     );
204    
205     return $database_rules->{ _q($input) }->{ $type };
206     }
207    
208     my $usage;
209    
210     foreach my $i (keys %{ $database_rules }) {
211     next unless defined( $database_rules->{$i}->{$type} );
212    
213     foreach my $t (keys %{ $database_rules->{ $i }->{$type} }) {
214     $usage->{ $t } += $database_rules->{ $i }->{ $t };
215     }
216     }
217    
218     return $usage;
219    
220 dpavlin 737 }
221    
222    
223 dpavlin 703 =head1 PRIVATE
224    
225     =head2 _read_sources
226    
227     my $source_files = $parser->_read_sources;
228    
229 dpavlin 691 Called by L</new>.
230    
231     =cut
232    
233 dpavlin 703 sub _read_sources {
234 dpavlin 691 my $self = shift;
235    
236     my $log = $self->_get_logger();
237    
238     my $nr = 0;
239    
240 dpavlin 737 my @sources;
241 dpavlin 701
242 dpavlin 800 my $lookup_src_cache;
243    
244 dpavlin 976 my $only_database = $self->{only_database};
245     my $only_input = $self->{only_input};
246    
247 dpavlin 691 $self->{config}->iterate_inputs( sub {
248     my ($input, $database) = @_;
249    
250 dpavlin 976 return if ( $only_database && $database !~ m/$only_database/i );
251     return if ( $only_input && $input->{name} !~ m/$only_input/i );
252    
253 dpavlin 699 $log->debug("database: $database input = ", dump($input));
254    
255 dpavlin 701 foreach my $normalize (@{ $input->{normalize} }) {
256 dpavlin 691
257 dpavlin 698 my $path = $normalize->{path};
258     return unless($path);
259     my $full = $self->{base_path} ? $self->{base_path} . '/' . $path : $path;
260 dpavlin 691
261 dpavlin 698 $log->logdie("normalization input $full doesn't exist") unless (-e $full);
262 dpavlin 691
263 dpavlin 698 my $s = read_file( $full ) || $log->logdie("can't read $full: $!");
264 dpavlin 691
265 dpavlin 698 my $input_name = _input_name($input) || $log->logdie("can't deduce name of input: ", dump( $input ));
266 dpavlin 691
267 dpavlin 698 $log->debug("$database/$input_name: adding $path");
268    
269 dpavlin 701 $self->{valid_inputs}->{$database}->{$input_name}++;
270 dpavlin 698
271 dpavlin 737 push @sources, sub {
272 dpavlin 826 #warn "### $database $input_name, $full ###\n";
273 dpavlin 737 $self->_parse_source( $database, $input_name, $full, $s );
274 dpavlin 701 };
275 dpavlin 698
276     $nr++;
277     }
278 dpavlin 686 } );
279    
280 dpavlin 691 $log->debug("found $nr source files");
281 dpavlin 686
282 dpavlin 737 # parse all sources
283     $_->() foreach (@sources);
284 dpavlin 701
285 dpavlin 691 return $nr;
286 dpavlin 686 }
287    
288 dpavlin 737 =head2 _parse_source
289 dpavlin 686
290 dpavlin 737 $parser->_parse_source($database,$input,$path,$source);
291 dpavlin 691
292 dpavlin 708 Called for each normalize source (rules) in each input by L</_read_sources>
293 dpavlin 698
294     It will report invalid databases and inputs in error log after parsing.
295    
296 dpavlin 686 =cut
297    
298 dpavlin 737 sub _parse_source {
299 dpavlin 686 my $self = shift;
300 dpavlin 701 my ($database, $input, $path, $source) = @_;
301 dpavlin 686
302 dpavlin 698 $input = _input_name($input);
303    
304 dpavlin 686 my $log = $self->_get_logger();
305    
306 dpavlin 691 $log->logdie("invalid database $database" ) unless $self->valid_database( $database );
307     $log->logdie("invalid input $input of database $database", ) unless $self->valid_database_input( $database, $input );
308 dpavlin 686
309 dpavlin 691 $log->logdie("no source found for database $database input $input path $path") unless ($source);
310 dpavlin 686
311 dpavlin 701 $log->info("parsing lookups for $database/$input from $path (",length($source)," bytes)");
312 dpavlin 691
313     my $Document = PPI::Document->new( \$source ) || $log->logdie("can't parse source:\n", $self->{source});
314    
315 dpavlin 976 #$Document->prune('PPI::Token::Whitespace');
316 dpavlin 724 $Document->prune('PPI::Token::Comment');
317 dpavlin 686 #$Document->prune('PPI::Token::Operator');
318    
319     # Find all the named subroutines
320    
321 dpavlin 691 $self->{_lookup_errors} = ();
322 dpavlin 686
323 dpavlin 691 sub _lookup_error {
324     my $self = shift;
325     my $msg = shift;
326     $self->_get_logger->logconfess("error without message?") unless ($msg);
327     push @{ $self->{_lookup_errors} }, $msg;
328 dpavlin 690 return '';
329     }
330    
331 dpavlin 686 $Document->find( sub {
332     my ($Document,$Element) = @_;
333    
334     $Element->isa('PPI::Token::Word') or return '';
335     $Element->content eq 'lookup' or return '';
336    
337 dpavlin 687 $log->debug("expansion: ", $Element->snext_sibling);
338 dpavlin 686
339     my $args = $Element->snext_sibling;
340    
341     my @e = $args->child(0)->elements;
342 dpavlin 687 $log->logdie("hum, expect at least 8 elements, got ", scalar @e, " in $args") if ($#e < 8);
343 dpavlin 686
344 dpavlin 687 if ($log->is_debug) {
345     my $report = "found " . scalar @e . " elements:\n";
346 dpavlin 686
347 dpavlin 687 foreach my $i ( 0 .. $#e ) {
348     $report .= sprintf("# %-2d: %-30s %s\n", $i, ( $e[$i] || 'undef' ), $e[$i]->class );
349     }
350    
351     $log->debug($report);
352 dpavlin 686 }
353    
354     my $key_element = $e[8]->clone;
355    
356 dpavlin 687 $log->logdie("key element must be PPI::Structure::Block") unless $key_element->isa('PPI::Structure::Block');
357 dpavlin 686
358 dpavlin 687 $log->debug("key part: ", $key_element);
359 dpavlin 686
360     my @key;
361    
362     $key_element->find( sub {
363     my $e = $_[1] || die "no element?";
364     $e->isa('PPI::Token::Word') or return '';
365     $e->content eq 'rec' or return '';
366    
367     my $kf = $e->snext_sibling;
368    
369 dpavlin 687 $log->debug("key fragment = $kf");
370 dpavlin 686
371     push @key, eval $kf;
372 dpavlin 687 $log->logdie("can't eval { $kf }: $@") if ($@);
373 dpavlin 686
374     return 1;
375     });
376    
377 dpavlin 687 my $key = join('-', @key ) || $log->logdie("no key found!");
378 dpavlin 686
379 dpavlin 687 $log->debug("key = $key");
380 dpavlin 686
381 dpavlin 712 return $self->_lookup_error("invalid database $e[3] in $path" ) unless $self->valid_database( $e[3] );
382     return $self->_lookup_error("invalid input $e[5] of database $e[3] in $path", ) unless $self->valid_database_input( $e[3], $e[5] );
383 dpavlin 686
384 dpavlin 712 my $create = qq{
385     save_into_lookup($e[3],$e[5],'$key', $e[7] $e[8] );
386     };
387    
388 dpavlin 687 $log->debug("create: $create");
389 dpavlin 686
390 dpavlin 698 # save code to create this lookup
391 dpavlin 706 $self->{_lookup_create}->{ _q($e[3]) }->{ _q($e[5]) } .= $create;
392     $self->{_lookup_create_key}->{ _q($e[3]) }->{ _q($e[5]) }->{ _q($key) }++;
393 dpavlin 686
394 dpavlin 698
395 dpavlin 702 if (defined( $self->{depends}->{ $database }->{ $input }->{ _q($e[3]) }->{ _q($e[5]) }->{ $key } )) {
396     $log->warn("dependency of $database/$input on $e[3]/$e[5] with $key allready recorded");
397 dpavlin 698 }
398    
399     # save this dependency
400 dpavlin 702 $self->{depends}->{ $database }->{ $input }->{ _q($e[3]) }->{ _q($e[5]) }->{$key}++;
401 dpavlin 698
402 dpavlin 686 if ($#e < 10) {
403     $e[8]->insert_after( $e[8]->clone );
404     $e[8]->insert_after( $e[7]->clone );
405     $e[8]->insert_after( $e[6]->clone );
406     }
407    
408     $e[7]->remove;
409     $e[8]->insert_before( new PPI::Token::Quote::Single( "'$key'" ) );
410     $e[8]->remove;
411    
412    
413 dpavlin 687 $log->debug(">>> ", $Element->snext_sibling);
414 dpavlin 686 });
415    
416 dpavlin 693 my $normalize_source = $Document->serialize;
417 dpavlin 692 $log->debug("create: ", dump($self->{_lookup_create}) );
418 dpavlin 693 $log->debug("normalize: $normalize_source");
419 dpavlin 686
420 dpavlin 720 $self->{_normalize_source}->{$database}->{$input} .= $normalize_source;
421 dpavlin 692
422 dpavlin 687 if ($self->{debug}) {
423     my $Dumper = PPI::Dumper->new( $Document );
424     $Dumper->print;
425     }
426 dpavlin 686
427 dpavlin 692 $log->error("Parser errors:\n", join("\n",@{ $self->{_lookup_errors} }) ) if ($self->{_lookup_errors});
428 dpavlin 690
429 dpavlin 737 $Document->find( sub {
430     my ($Document,$Element) = @_;
431    
432     $Element->isa('PPI::Token::Word') or return '';
433 dpavlin 755 if ($Element->content =~ m/^(marc|search)/) {
434     my $what = $1;
435     $log->debug("found $what rules in $database/$input");
436     $self->{_have_rules}->{ $database }->{ $input }->{ $what }->{ $Element->content }++;
437     } else {
438     return '';
439     }
440 dpavlin 737 });
441    
442 dpavlin 687 return 1;
443 dpavlin 686 }
444    
445 dpavlin 692
446 dpavlin 698 =head2 _q
447    
448     Strip single or double quotes around value
449    
450     _q(qq/'foo'/) -> foo
451    
452     =cut
453    
454     sub _q {
455     my $v = shift || return;
456     $v =~ s/^['"]*//g;
457     $v =~ s/['"]*$//g;
458     return $v;
459     }
460    
461     =head2 _input_name
462    
463     Return C<name> value if HASH or arg if scalar
464    
465     _input_name($input)
466    
467     =cut
468    
469     sub _input_name {
470     my $input = shift || return;
471     if (ref($input) eq 'HASH') {
472     die "can't find 'name' value in ", dump($input) unless defined($input->{name});
473     return $input->{name};
474     } else {
475     return $input;
476     }
477     }
478    
479    
480 dpavlin 686 =head1 AUTHOR
481    
482     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
483    
484     =head1 COPYRIGHT & LICENSE
485    
486     Copyright 2006 Dobrica Pavlinusic, All Rights Reserved.
487    
488     This program is free software; you can redistribute it and/or modify it
489     under the same terms as Perl itself.
490    
491     =cut
492    
493     1; # End of WebPAC::Parser

  ViewVC Help
Powered by ViewVC 1.1.26