--- trunk/run.pl 2006/05/15 17:49:01 511 +++ trunk/run.pl 2006/05/22 19:34:45 529 @@ -14,6 +14,7 @@ use WebPAC::Normalize::XML; use WebPAC::Normalize::Set; use WebPAC::Output::TT; +use WebPAC::Validate; use YAML qw/LoadFile/; use Getopt::Long; use File::Path; @@ -63,6 +64,10 @@ disable indexing and dump statistics about field and subfield usage for each input +=item --validate path/to/validation_file + +turn on extra validation of input records, see L<WebPAC::Validate> + =back =cut @@ -76,6 +81,7 @@ my $only_filter; my $force_set = 0; my $stats = 0; +my $validate_path; GetOptions( "limit=i" => \$limit, @@ -87,6 +93,7 @@ "debug" => \$debug, "force-set" => \$force_set, "stats" => \$stats, + "validate=s" => \$validate_path, ); $config = LoadFile($config); @@ -98,6 +105,11 @@ my $log = _new WebPAC::Common()->_get_logger(); $log->info( "-" x 79 ); +my $validate; +$validate = new WebPAC::Validate( + path => $validate_path, +) if ($validate_path); + my $use_indexer = $config->{use_indexer} || 'hyperestraier'; if ($stats) { $log->debug("option --stats disables update of indexing engine..."); @@ -109,13 +121,14 @@ my $total_rows = 0; my $start_t = time(); +my @links; +my $indexer; + while (my ($database, $db_config) = each %{ $config->{databases} }) { - my ($only_database,$only_input) = split(m#/#, $only_filter); + my ($only_database,$only_input) = split(m#/#, $only_filter) if ($only_filter); next if ($only_database && $database !~ m/$only_database/i); - my $indexer; - if ($use_indexer) { my $indexer_config = $config->{$use_indexer} || $log->logdie("can't find '$use_indexer' part in confguration"); $indexer_config->{database} = $database; @@ -182,7 +195,7 @@ foreach my $input (@inputs) { - next if ($only_input && $input->{name} =~ m#$only_input#i || $input->{type} =~ m#$only_input#i); + next if ($only_input && ($input->{name} !~ m#$only_input#i && $input->{type} !~ m#$only_input#i)); my $type = lc($input->{type}); @@ -190,11 +203,13 @@ my $lookup = new WebPAC::Lookup( lookup_file => 
$input->{lookup}, - ); + ) if ($input->{lookup}); my $input_module = $config->{webpac}->{inputs}->{$type}; - $log->info("working on input '$input->{name}' in $input->{path} [type: $input->{type}] using $input_module lookup '$input->{lookup}'"); + $log->info("working on input '$input->{name}' in $input->{path} [type: $input->{type}] using $input_module", + $input->{lookup} ? "lookup '$input->{lookup}'" : "" + ); my $input_db = new WebPAC::Input( module => $input_module, @@ -210,12 +225,13 @@ my $maxmfn = $input_db->open( path => $input->{path}, code_page => $input->{encoding}, # database encoding + %{ $input }, ); my $n = new WebPAC::Normalize::XML( # filter => { 'foo' => sub { shift } }, db => $db, - lookup_regex => $lookup->regex, + lookup_regex => $lookup ? $lookup->regex : undef, lookup => $lookup, prefix => $input->{name}, ); @@ -262,6 +278,12 @@ push @{ $row->{'000'} }, $pos; } + + if ($validate) { + my @errors = $validate->validate_errors( $row ); + $log->error( "MFN $mfn validation errors:\n", join("\n", @errors) ) if (@errors); + } + my $ds; if ($n) { @@ -270,7 +292,7 @@ $ds = WebPAC::Normalize::Set::data_structure( row => $row, rules => $rules, - lookup => $lookup->lookup_hash, + lookup => $lookup ? $lookup->lookup_hash : undef, ); $db->save_ds( @@ -308,12 +330,12 @@ if (ref($db_config->{links}) eq 'ARRAY') { foreach my $link (@{ $db_config->{links} }) { if ($use_indexer eq 'hyperestraier') { - $log->info("adding link $database -> $link->{to} [$link->{credit}]"); - $indexer->add_link( + $log->info("saving link $database -> $link->{to} [$link->{credit}]"); + push @links, { from => $database, to => $link->{to}, credit => $link->{credit}, - ); + }; } else { $log->warn("NOT IMPLEMENTED WITH $use_indexer: adding link $database -> $link->{to} [$link->{credit}]"); } @@ -322,3 +344,7 @@ } +foreach my $link (@links) { + $log->info("adding link $link->{from} -> $link->{to} [$link->{credit}]"); + $indexer->add_link( %{ $link } ); +}