--- trunk/lib/WebPAC/Validate.pm 2006/05/16 13:38:09 514 +++ trunk/lib/WebPAC/Validate.pm 2007/05/24 14:42:40 840 @@ -7,7 +7,10 @@ use base 'WebPAC::Common'; use File::Slurp; -use Data::Dumper; +use List::Util qw/first/; +use Data::Dump qw/dump/; +use WebPAC::Normalize qw/_pack_subfields_hash/; +use Storable qw/dclone/; =head1 NAME @@ -15,11 +18,11 @@ =head1 VERSION -Version 0.01 +Version 0.11 =cut -our $VERSION = '0.01'; +our $VERSION = '0.11'; =head1 SYNOPSIS @@ -31,11 +34,15 @@ # same with 101 101 # field 200 have valid subfields a-g - 200 a b c d e f g + # and field e is repeatable + 200 a b c d e* f g # field 205 can have only subfield a - 205 a + # and must exists + 205! a # while 210 can have a c or d 210 a c d + # field which is ignored in validation + 999- =head1 FUNCTIONS @@ -44,9 +51,15 @@ Create new validation object my $validate = new WebPAC::Validate( - path => '/path/to/input/validate_file', + path => 'conf/validate/file', + delimiters => [ ' : ', ' / ', ' ; ', ' , ' ], + delimiters_path => 'conf/validate/delimiters/file', ); +Optional parametar C will turn on validating of delimiters. Be +careful here, those delimiters are just stuck into regex, so they can +contain L regexpes. + =cut sub new { @@ -56,42 +69,449 @@ my $log = $self->_get_logger(); - foreach my $p (qw/path/) { - $log->logconfess("need $p") unless ($self->{$p}); + $log->logdie("need path or delimiters_path") unless ( $self->{path} || $self->{delimiters_path} ); + + if ( $self->{path} ) { + + my $v_file = read_file( $self->{path} ) || + $log->logdie("can't open validate path $self->{path}: $!"); + + my $v; + my $curr_line = 1; + + foreach my $l (split(/[\n\r]+/, $v_file)) { + $curr_line++; + + # skip comments and whitespaces + next if ($l =~ /^#/ || $l =~ /^\s*$/); + + $l =~ s/^\s+//; + $l =~ s/\s+$//; + + my @d = split(/\s+/, $l); + + my $fld = shift @d; + + if ($fld =~ s/!$//) { + $self->{must_exist}->{$fld}++; + } elsif ($fld =~ s/-$//) { + $self->{dont_validate}->{$fld}++; + } + + $log->logdie("need field name in line $curr_line: $l") unless (defined($fld)); + + if (@d) { + $v->{$fld} = [ map { + my $sf = $_; + if ( $sf =~ s/!(\*)?$/$1/ ) { + $self->{must_exist_sf}->{ $fld }->{ $sf }++; + }; + $sf; + } @d ]; + } else { + $v->{$fld} = 1; + } + + } + + $log->debug("current validation rules: ", dump($v)); + + $self->{rules} = $v; + + $log->info("validation uses rules from $self->{path}"); + } + + if ( $self->{delimiters} ) { + $self->{delimiters_regex} = '(\^[a-z0-9]|' . join('|', @{ $self->{delimiters} }) . ')'; + $log->info("validation check delimiters with regex $self->{delimiters_regex}"); + } + + if ( my $path = $self->{delimiters_path} ) { + if ( -e $path ) { + $log->info("using delimiter validation rules from $path"); + open(my $d, $path) || $log->fatal("can't open $path: $!"); + while(<$d>) { + chomp($d); + if (/^\s*(#*)\s*(\d+)\t+(\d+)\t+(.*)$/) { + my ($comment,$field,$count,$template) = ($1,$2,$3,$4); + $self->{_validate_delimiters_templates}->{$field}->{$template} = $count unless ($comment); + } else { + warn "## ignored $d\n"; + } + } + close($d); + warn "_validate_delimiters_templates = ",dump( $self->{_validate_delimiters_templates} ); + } else { + $log->warn("delimiters path $path doesn't exist, it will be created after this run"); + } + } + + $self ? return $self : return undef; +} + +=head2 validate_rec + +Validate record and return errors + + my @errors = $validate->validate_rec( $rec, $rec_dump ); + +=cut + +sub validate_rec { + my $self = shift; + + my $log = $self->_get_logger(); + + my $rec = shift || $log->logdie("validate_rec need record"); + my $rec_dump = shift; + + $log->logdie("rec isn't HASH") unless (ref($rec) eq 'HASH'); +# $log->logdie("can't find validation rules") unless (my $r = $self->{rules}); + my $r = $self->{rules}; + + my $errors; + + $log->debug("rec = ", sub { dump($rec) }, "keys = ", keys %{ $rec }); + + my $fields; + + foreach my $f (keys %{ $rec }) { + + next if (!defined($f) || $f eq '' || $f eq '000'); + + # first check delimiters + if ( my $regex = $self->{delimiters_regex} ) { + + foreach my $v (@{ $rec->{$f} }) { + my $l = _pack_subfields_hash( $v, 1 ); + my $subfield_dump = $l; + my $template = ''; + $l =~ s/$regex/$template.=$1/eg; + #warn "## template: $template\n"; + + if ( $template ) { + $self->{_delimiters_templates}->{$f}->{$template}++; + + if ( my $v = $self->{_validate_delimiters_templates} ) { + if ( ! defined( $v->{$f}->{$template} ) ) { + $errors->{$f}->{invalid_delimiters_combination} = $template; + $errors->{$f}->{dump} = $subfield_dump; + #} else { + # warn "## $f $template ok\n"; + } + } + } + } + } + + next unless ( $r ); # skip validation of no rules are specified + + next if (defined( $self->{dont_validate}->{$f} )); + + # track field usage + $fields->{$f}++; + + if ( ! defined($r->{$f}) ) { + $errors->{ $f }->{unexpected} = "this field is not expected"; + next; + } + + + if (ref($rec->{$f}) ne 'ARRAY') { + $errors->{ $f }->{not_repeatable} = "probably bug in parsing input data"; + next; + } + + foreach my $v (@{ $rec->{$f} }) { + # can we have subfields? + if (ref($r->{$f}) eq 'ARRAY') { + # are values hashes? (has subfields) + if (! defined($v)) { +# $errors->{$f}->{empty} = undef; +# $errors->{dump} = $rec_dump if ($rec_dump); + } elsif (ref($v) ne 'HASH') { + $errors->{$f}->{missing_subfield} = join(",", @{ $r->{$f} }) . " required"; + next; + } else { + + my $h = dclone( $v ); + + my $sf_repeatable; + + delete($v->{subfields}) if (defined($v->{subfields})); + + my $subfields; + + foreach my $sf (keys %{ $v }) { + + $subfields->{ $sf }++; + + # is non-repeatable but with multiple values? + if ( ! first { $_ eq $sf.'*' } @{$r->{$f}} ) { + if ( ref($v->{$sf}) eq 'ARRAY' ) { + $sf_repeatable->{$sf}++; + }; + if (! first { $_ eq $sf } @{ $r->{$f} }) { + $errors->{ $f }->{subfield}->{extra}->{$sf}++; + } + } + + } + if (my @r_sf = sort keys( %$sf_repeatable )) { + + foreach my $sf (@r_sf) { + $errors->{$f}->{subfield}->{extra_repeatable}->{$sf}++; + $errors->{$f}->{dump} = _pack_subfields_hash( $h, 1 ); + } + + } + + if ( defined( $self->{must_exist_sf}->{$f} ) ) { + foreach my $sf (sort keys %{ $self->{must_exist_sf}->{$f} }) { +#warn "====> $f $sf must exist\n"; + $errors->{$f}->{subfield}->{missing}->{$sf}++ + unless defined( $subfields->{$sf} ); + } + } + + } + } elsif (ref($v) eq 'HASH') { + $errors->{$f}->{unexpected_subfields}++; + $errors->{$f}->{dump} = _pack_subfields_hash( $v, 1 ); + } + } } - my $v_file = read_file( $self->{path} ) || - $log->logdie("can't open validate path $self->{path}: $!"); + $log->debug("_delimiters_templates = ", dump( $self->{_delimiters_templates} ) ); - my $v; - my $curr_line = 1; + foreach my $must (sort keys %{ $self->{must_exist} }) { + next if ($fields->{$must}); + $errors->{$must}->{missing}++; + $errors->{dump} = $rec_dump if ($rec_dump); + } - foreach my $l (split(/[\n\r]+/, $v_file)) { - $curr_line++; - # skip comments - next if ($l =~ m/^#/); + if ($errors) { + $log->debug("errors: ", $self->report_error( $errors ) ); - $l =~ s/^\s+//; - $l =~ s/\s+$//; + my $mfn = $rec->{'000'}->[0] || $log->logconfess("record ", dump( $rec ), " doesn't have MFN"); + $self->{errors}->{$mfn} = $errors; + } + + #$log->logcluck("return from this function is ARRAY") unless wantarray; + + return $errors; +} - my @d = split(/\s+/, $l); +=head2 reset_errors - my $fld = shift @d || $log->logdie("need field name in line $curr_line: $l"); +Clean all accumulated errors for this input + + $validate->reset_errors; + +=cut + +sub reset_errors { + my $self = shift; + delete ($self->{errors}); +} + +=head2 all_errors + +Return hash with all errors + + print dump( $validate->all_errors ); + +=cut + +sub all_errors { + my $self = shift; + return $self->{errors}; +} + +=head2 report_error + +Produce nice humanly readable report of single error + + print $validate->report_error( $error_hash ); + +=cut - if (@d) { - $v->{$fld}->{ref} = 'ARRAY'; - $v->{$fld}->{sf} = \@d; +sub report_error { + my $self = shift; + + my $h = shift || die "no hash?"; + + sub _unroll { + my ($self, $tree, $accumulated) = @_; + + my $log = $self->_get_logger(); + + $log->debug("# ", + ( $tree ? "tree: $tree " : '' ), + ( $accumulated ? "accumulated: $accumulated " : '' ), + ); + + my $results; + + if (ref($tree) ne 'HASH') { + return ("$accumulated\t($tree)", undef); + } + + my $dump; + + foreach my $k (sort keys %{ $tree }) { + + if ($k eq 'dump') { + $dump = $tree->{dump}; + #warn "## dump ",dump($dump),"\n"; + next; + } + + $log->debug("current: $k"); + + my ($new_results, $new_dump) = $self->_unroll($tree->{$k}, + $accumulated ? "$accumulated\t$k" : $k + ); + + $log->debug( + ( $new_results ? "new_results: " . dump($new_results) ." " : '' ), + ); + + push @$results, $new_results if ($new_results); + $dump = $new_dump if ($new_dump); + + } + + $log->debug( + ( $results ? "results: " . dump($results) ." " : '' ), + ); + + if ($#$results == 0) { + return ($results->[0], $dump); } else { - $v->{$fld}->{ref} = ''; + return ($results, $dump); } + } + + sub _reformat { + my $l = shift; + $l =~ s/\t/ /g; + $l =~ s/_/ /g; + return $l; } - $log->debug("current validation rules: ", Dumper($v)); + my $out = ''; - $self->{rules} = $v; + for my $f (sort keys %{ $h }) { + $out .= "$f: "; + + my ($r, $d) = $self->_unroll( $h->{$f} ); + my $e; + if (ref($r) eq 'ARRAY') { + $e .= join(", ", map { _reformat( $_ ) } @$r); + } else { + $e .= _reformat( $r ); + } + $e .= "\n\t$d" if ($d); - $self ? return $self : return undef; + $out .= $e . "\n"; + } + return $out; +} + + +=head2 report + +Produce nice humanly readable report of errors + + print $validate->report; + +=cut + +sub report { + my $self = shift; + my $e = $self->{errors} || return; + + my $out; + foreach my $mfn (sort { $a <=> $b } keys %$e) { + $out .= "MFN $mfn\n" . $self->report_error( $e->{$mfn} ) . "\n"; + } + + return $out; + +} + +=head2 delimiters_templates + +Generate report of delimiter tamplates + + my $report = $validate->delimiter_teplates( + report => 1, + ); + +Options: + +=over 4 + +=item report + +Generate humanly readable report with single fields + +=back + +=cut + +sub delimiters_templates { + my $self = shift; + + my $args = {@_}; + + my $t = $self->{_delimiters_templates}; + + my $log = $self->_get_logger; + + unless ($t) { + $log->error("called without delimiters"); + return; + } + + my $out; + + foreach my $f (sort { $a <=> $b } keys %$t) { + $out .= "$f\n" if ( $args->{report} ); + foreach my $template (sort { $a cmp $b } keys %{ $t->{$f} }) { + my $count = $t->{$f}->{$template}; + $out .= + ( $count ? "" : "# " ) . + ( $args->{report} ? "" : "$f" ) . + "\t$count\t$template\n"; + } + } + + return $out; +} + +=head2 save_delimiters_templates + +=cut + +sub save_delimiters_templates { + my $self = shift; + + my $path = $self->{delimiters_path}; + + return unless ( $path ); + + my $log = $self->_get_logger; + + $path .= '.new' if ( -e $path ); + + open(my $d, '>', $path) || $log->fatal("can't open $path: $!"); + print $d $self->delimiters_templates; + close($d); + + $log->info("new delimiters templates saved to $path"); } =head1 AUTHOR