/[webpac2]/trunk/run.pl
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/run.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 698 - (hide annotations)
Mon Sep 25 11:14:53 2006 UTC (17 years, 7 months ago) by dpavlin
File MIME type: text/plain
File size: 13073 byte(s)
 r990@llin:  dpavlin | 2006-09-25 13:12:42 +0200
 new depends method to track dependencies, input in most places can be input name or
 hash with key 'name' which will be used as input (for example, from configuration file),
 database and input names will have correctly stripped quotes,
 begin removal of old lookup support

1 dpavlin 74 #!/usr/bin/perl -w
2    
3     use strict;
4    
5     use Cwd qw/abs_path/;
6     use File::Temp qw/tempdir/;
7     use lib './lib';
8    
9 dpavlin 255 use WebPAC::Common 0.02;
10 dpavlin 698 use WebPAC::Parser 0.04;
11 dpavlin 588 use WebPAC::Lookup 0.03;
12 dpavlin 619 use WebPAC::Input 0.11;
13 dpavlin 209 use WebPAC::Store 0.03;
14 dpavlin 581 use WebPAC::Normalize 0.11;
15 dpavlin 74 use WebPAC::Output::TT;
16 dpavlin 652 use WebPAC::Validate 0.06;
17 dpavlin 578 use WebPAC::Output::MARC;
18 dpavlin 684 use WebPAC::Config;
19 dpavlin 301 use Getopt::Long;
20     use File::Path;
21 dpavlin 389 use Time::HiRes qw/time/;
22 dpavlin 492 use File::Slurp;
23 dpavlin 556 use Data::Dump qw/dump/;
24 dpavlin 595 use Storable qw/dclone/;
25 dpavlin 74
26 dpavlin 606 use Proc::Queue size => 1;
27     use POSIX ":sys_wait_h"; # imports WNOHANG
28    
29 dpavlin 301 =head1 NAME
30 dpavlin 76
31 dpavlin 301 run.pl - start WebPAC indexing
32 dpavlin 141
33 dpavlin 301 B<this command will probably go away. Don't get used to it!>
34 dpavlin 141
35 dpavlin 301 Options:
36    
37     =over 4
38    
39     =item --offset 42
40    
41     start loading (all) databases at offset 42
42    
43     =item --limit 100
44    
45     limit loading to 100 records
46    
47     =item --clean
48    
49     remove database and Hyper Estraier index before indexing
50    
51 dpavlin 510 =item --only=database_name/input_filter
52 dpavlin 335
53 dpavlin 423 reindex just a single database (legacy name is --one)
54 dpavlin 335
55 dpavlin 510 C</input_filter> is optional part which can be C<name>
56     or C<type> from input
57    
58 dpavlin 301 =item --config conf/config.yml
59    
60     path to YAML configuration file
61    
62 dpavlin 507 =item --stats
63    
64 dpavlin 638 disable indexing, modify_* in configuration and dump statistics about field
65     and subfield usage for each input
66 dpavlin 507
67 dpavlin 516 =item --validate path/to/validation_file
68    
69     turn on extra validation of input records, see L<WebPAC::Validate>
70    
71 dpavlin 552 =item --marc-normalize conf/normalize/mapping.pl
72    
73     This option specifies normalisation file for MARC creation
74    
75     =item --marc-output out/marc/test.marc
76    
77     Optional path to output file
78    
79 dpavlin 556 =item --marc-lint
80    
81     By default turned on if C<--marc-normalize> is used. You can disable lint
82 dpavlin 558 messages with C<--no-marc-lint>.
83 dpavlin 556
84 dpavlin 559 =item --marc-dump
85    
86     Force dump of input and MARC record for debugging.
87    
88 dpavlin 606 =item --parallel 4
89    
90     Run databases in parallel (use approximately as many processes as there
91     are processors in the machine if you want to use full load)
92    
93 dpavlin 607 =item --only-links
94    
95     Create just links
96    
97 dpavlin 608 =item --merge
98    
99     Create merged index of databases which have links
100    
101 dpavlin 301 =back
102    
103     =cut
104    
#
# Command-line option storage. The values assigned here are the defaults
# used when the corresponding switch is not given on the command line.
#
my $offset;             # --offset: record offset at which loading starts
my $limit;              # --limit: maximum number of records to load

my $clean = 0;          # --clean: remove database and index before indexing
my $config_path;        # --config: path to YAML configuration file
my $debug = 0;          # --debug: may be repeated, each use increments it
my $only_filter;        # --only / --one: database_name[/input_filter]
my $stats = 0;          # --stats: dump field usage statistics, no indexing
my $validate_path;      # --validate: path to validation file
my ($marc_normalize, $marc_output);     # --marc-normalize, --marc-output
my $marc_lint = 1;      # --marc-lint (on by default, --no-marc-lint disables)
my $marc_dump = 0;      # --marc-dump
my $parallel = 0;       # --parallel: number of processes to use
my $only_links = 0;     # --only-links: create just links
my $merge = 0;          # --merge: create merged index of linked databases

# logger shared by the rest of the script
my $log = WebPAC::Common->_new->_get_logger();

GetOptions(
    "limit=i" => \$limit,
    "offset=i" => \$offset,
    "clean" => \$clean,
    # --one is the legacy spelling of --only, both fill the same variable
    "one=s" => \$only_filter,
    "only=s" => \$only_filter,
    # "=s" is required: without it --config is a plain flag and
    # $config_path would receive 1 instead of the path that follows it
    "config=s" => \$config_path,
    "debug+" => \$debug,
    "stats" => \$stats,
    "validate=s" => \$validate_path,
    "marc-normalize=s" => \$marc_normalize,
    "marc-output=s" => \$marc_output,
    "marc-lint!" => \$marc_lint,
    "marc-dump!" => \$marc_dump,
    "parallel=i" => \$parallel,
    "only-links!" => \$only_links,
    "merge" => \$merge,
);
141    
# load configuration from the path given with --config
my $config = WebPAC::Config->new( path => $config_path );

#print "config = ",dump($config) if ($debug);

if (! $config->databases) {
    die "no databases in config file!\n";
}

$log->info( "-" x 79 );


# with --merge, estcmd invocations are collected into a shell batch file
# which is executed after all databases have been processed
my $estcmd_fh;
my $estcmd_path = './estcmd-merge.sh';
if ($merge) {
    open($estcmd_fh, '>', $estcmd_path)
        || $log->logdie("can't open $estcmd_path: $!");
    print {$estcmd_fh}
        'cd /data/estraier/_node/ || exit 1', $/,
        'sudo /etc/init.d/hyperestraier stop', $/;
    $log->info("created merge batch file $estcmd_path");
}


# validator is created only when --validate was given
my $validate = $validate_path
    ? WebPAC::Validate->new( path => $validate_path )
    : undef;


# indexing engine comes from configuration; --stats turns it off
my $use_indexer = $config->use_indexer;
if (! $stats) {
    $log->info("using $use_indexer indexing engine...");
}
else {
    $log->debug("option --stats disables update of indexing engine...");
    undef $use_indexer;
}

# disable indexing when creating marc
undef $use_indexer if ($marc_normalize);

# parse normalize files and create source files for lookup and normalization

my $parser = WebPAC::Parser->new( config => $config );

# counters used for the per-database timing summary
my $total_rows = 0;
my $start_t = time();

# callbacks which add links between databases, run after indexing
my @links;

if ($parallel) {
    $log->info("Using $parallel processes for speedup");
    Proc::Queue::size($parallel);
}
191    
#
# Main loop: for every database in the configuration, optionally fork a
# worker, open the indexer, record links to other databases, open the
# WebPAC::Store and then run every input through every normalize file,
# saving / indexing / exporting MARC for each record.
#
# NOTE(review): with --parallel the "next" statements below (e.g. for
# --only-links) are executed inside the forked child, which then continues
# with the following database instead of exiting -- confirm this is intended.
#
while (my ($database, $db_config) = each %{ $config->databases }) {

    # --only=database_name/input_filter
    # Declared unconditionally: "my ... if COND" has undefined behaviour
    # in Perl when the condition is false.
    my ($only_database, $only_input);
    ($only_database, $only_input) = split(m#/#, $only_filter) if ($only_filter);
    next if ($only_database && $database !~ m/$only_database/i);

    if ($parallel) {
        my $f = fork;
        if (! defined($f)) {
            # fork failed: database is skipped, but no longer silently
            $log->error("can't fork process for database '$database': $!");
            next;
        }
        # parent moves on to the next database, child does the work
        next if ($f != 0);
        $log->info("Created processes $$ for speedup");
    }

    my $indexer;
    if ($use_indexer) {

        # configuration key is the indexer name without "-suffix"
        my $cfg_name = $use_indexer;
        $cfg_name =~ s/\-.*$//;

        my $indexer_config = $config->get( $cfg_name ) || $log->logdie("can't find '$cfg_name' part in confguration");
        $indexer_config->{database} = $database;
        $indexer_config->{clean} = $clean;
        $indexer_config->{label} = $db_config->{name};

        # force clean if database has links
        $indexer_config->{clean} = 1 if ($db_config->{links});

        # NOTE: "use" is a compile-time statement, so all three output
        # modules are loaded regardless of which branch is taken
        if ($use_indexer eq 'hyperestraier') {

            # open Hyper Estraier database
            use WebPAC::Output::Estraier '0.10';
            $indexer = WebPAC::Output::Estraier->new( %{ $indexer_config } );

        } elsif ($use_indexer eq 'hyperestraier-native') {

            # open Hyper Estraier database
            use WebPAC::Output::EstraierNative;
            $indexer = WebPAC::Output::EstraierNative->new( %{ $indexer_config } );

        } elsif ($use_indexer eq 'kinosearch') {

            # open KinoSearch
            use WebPAC::Output::KinoSearch;
            $indexer_config->{clean} = 1 unless (-e $indexer_config->{index_path});
            $indexer = WebPAC::Output::KinoSearch->new( %{ $indexer_config } );

        } else {
            $log->logdie("unknown use_indexer: $use_indexer");
        }

        $log->logdie("can't continue without valid indexer") unless ($indexer);
    }


    #
    # store Hyper Estraier links to other databases
    #
    if (ref($db_config->{links}) eq 'ARRAY' && $use_indexer) {
        foreach my $link (@{ $db_config->{links} }) {
            if ($use_indexer eq 'hyperestraier') {
                if ($merge) {
                    # --merge: append merge command to the batch file
                    print $estcmd_fh 'sudo -u www-data estcmd merge ' . $database . ' ' . $link->{to},$/;
                } else {
                    $log->info("saving link $database -> $link->{to} [$link->{credit}]");
                    # defer adding the link until the main loop is done
                    push @links, sub {
                        $log->info("adding link $database -> $link->{to} [$link->{credit}]");
                        $indexer->add_link(
                            from => $database,
                            to => $link->{to},
                            credit => $link->{credit},
                        );
                    };
                }
            } else {
                $log->warn("NOT IMPLEMENTED WITH $use_indexer: adding link $database -> $link->{to} [$link->{credit}]");
            }
        }
    }
    next if ($only_links);


    #
    # now WebPAC::Store
    #
    my $db_path = $config->get('webpac')->{db_path} . '/' . $database;

    if ($clean) {
        $log->info("creating new database '$database' in $db_path");
        rmtree( $db_path ) || $log->warn("can't remove $db_path: $!");
    } else {
        $log->info("working on database '$database' in $db_path");
    }

    my $db = WebPAC::Store->new(
        path => $db_path,
        database => $database,
        debug => $debug,
    );


    #
    # now, iterate through input formats
    #

    # input may be a single definition or an array of them
    my @inputs;
    if (ref($db_config->{input}) eq 'ARRAY') {
        @inputs = @{ $db_config->{input} };
    } elsif ($db_config->{input}) {
        push @inputs, $db_config->{input};
    } else {
        $log->info("database $database doesn't have inputs defined");
    }

    foreach my $input (@inputs) {

        # second part of --only filter matches input name or type
        next if ($only_input && ($input->{name} !~ m#$only_input#i && $input->{type} !~ m#$only_input#i));

        my $type = lc($input->{type});

        # NOTE(review): relies on what $config->webpac('inputs') returns in
        # list context -- not visible from this file, verify in WebPAC::Config
        die "I know only how to handle input types ", join(",", $config->webpac('inputs') ), " not '$type'!\n" unless (grep(/$type/, $config->webpac('inputs')));

        my $input_module = $config->webpac('inputs')->{$type};

        $log->info("working on input '$input->{name}' in $input->{path} [type: $input->{type}] using $input_module",
            $input->{lookup} ? "lookup '$input->{lookup}'" : ""
        );

        if ($stats) {
            # disable modification of records if --stats is in use
            delete($input->{modify_records});
            delete($input->{modify_file});
        }

        warn "depends on: ", dump( $parser->depends($database, $input->{name}), $parser->{depends}, $parser->lookup_create_rules($database, $input->{name}), $parser->{_lookup_create} );

        # never assigned in this loop (old lookup support is being
        # removed), so every use below has to cope with undef
        my $lookup;

        my $input_db = WebPAC::Input->new(
            module => $input_module,
            encoding => $config->webpac('webpac_encoding'),
            limit => $limit || $input->{limit},
            offset => $offset,
            lookup_coderef => sub {
                my $rec = shift || return;
                # guard: calling a method on undef $lookup would die
                return unless ($lookup);
                $lookup->add( $rec );
            },
            recode => $input->{recode},
            stats => $stats,
            modify_records => $input->{modify_records},
            modify_file => $input->{modify_file},
        );
        # check the object just created ($input is the config hash and is
        # always true at this point)
        $log->logdie("can't create input using $input_module") unless ($input_db);

        my $maxmfn = $input_db->open(
            path => $input->{path},
            code_page => $input->{encoding},    # database encoding
            %{ $input },
        );

        # report file is written for --stats and/or --validate
        my ($report_fh, $report_path);
        if ($stats || $validate) {
            $report_path = "out/report/" . $database . '-' . $input->{name} . '.txt';
            open($report_fh, '>', $report_path) || $log->logdie("can't open $report_path: $!");

            print $report_fh "Report for database '$database' input '$input->{name}' records ",
                $offset || 1, "-", $limit || $input->{limit} || $maxmfn, "\n\n";
            $log->info("Generating report file $report_path");
        }

        # normalize may be a single definition or an array of them
        my @norm_array = ref($input->{normalize}) eq 'ARRAY' ?
            @{ $input->{normalize} } : ( $input->{normalize} );

        # --marc-normalize overrides normalize files from configuration
        if ($marc_normalize) {
            @norm_array = ( {
                path => $marc_normalize,
                output => $marc_output || 'out/marc/' . $database . '-' . $input->{name} . '.marc',
            } );
        }

        foreach my $normalize (@norm_array) {

            my $normalize_path = $normalize->{path} || $log->logdie("can't find normalize path in config");

            $log->logdie("Found '$normalize_path' as normalization file which isn't supported any more!") unless ( $normalize_path =~ m!\.pl$!i );

            my $rules = read_file( $normalize_path ) or die "can't open $normalize_path: $!";

            $log->info("Using $normalize_path for normalization...");

            # MARC output only when normalize definition has output path.
            # Declared separately from the conditional assignment so that
            # a pass without output never sees a stale object.
            my $marc;
            $marc = WebPAC::Output::MARC->new(
                path => $normalize->{output},
                lint => $marc_lint,
                dump => $marc_dump,
            ) if ($normalize->{output});

            # reset position in database
            $input_db->seek(1);

            # generate name of config key for indexer (strip everything after -)
            my $indexer_config = $use_indexer;
            $indexer_config =~ s/^(\w+)-?.*$/$1/g if ($indexer_config);

            foreach my $pos ( 0 .. $input_db->size ) {

                my $row = $input_db->fetch || next;

                my $mfn = $row->{'000'}->[0];

                # fall back to position when field 000 isn't a number
                if (! $mfn || $mfn !~ m#^\d+$#) {
                    $log->warn("record $pos doesn't have valid MFN but '$mfn', using $pos");
                    $mfn = $pos;
                    push @{ $row->{'000'} }, $pos;
                }


                if ($validate) {
                    if ( my $errors = $validate->validate_errors( $row, $input_db->dump ) ) {
                        $log->error( "MFN $mfn validation error:\n",
                            $validate->report_error( $errors )
                        );
                    }
                }

                # per-record copy, so additions below don't leak into
                # the shared database configuration
                my $ds_config = dclone($db_config);

                # default values -> database key
                $ds_config->{_} = $database;

                # current mfn
                $ds_config->{_mfn} = $mfn;

                # attach current input
                $ds_config->{input} = $input;

                my $ds = WebPAC::Normalize::data_structure(
                    row => $row,
                    rules => $rules,
                    lookup => $lookup ? $lookup->lookup_hash : undef,
                    config => $ds_config,
                    marc_encoding => 'utf-8',
                );

                $db->save_ds(
                    id => $mfn,
                    ds => $ds,
                    prefix => $input->{name},
                ) if ($ds && !$stats);

                $indexer->add(
                    id => $input->{name} . "/" . $mfn,
                    ds => $ds,
                    type => $config->get($indexer_config)->{type},
                ) if ($indexer && $ds);

                if ($marc) {
                    my $i = 0;

                    # one input record can produce several MARC records
                    while (my $fields = WebPAC::Normalize::_get_marc_fields( fetch_next => 1 ) ) {
                        $marc->add(
                            id => $mfn . ( $i ? "/$i" : '' ),
                            fields => $fields,
                            leader => WebPAC::Normalize::marc_leader(),
                            row => $row,
                        );
                        $i++;
                    }

                    $log->info("Created $i instances of MFN $mfn\n") if ($i > 1);
                }

                $total_rows++;
            }

            if ($validate) {
                my $errors = $validate->report;
                if ($errors) {
                    $log->info("validation errors:\n$errors\n" );
                    print $report_fh "$errors\n" if ($report_fh);
                }
            }

            if ($stats) {
                my $s = $input_db->stats;
                $log->info("statistics of fields usage:\n$s");
                print $report_fh "Statistics of fields usage:\n$s" if ($report_fh);
            }

            # close MARC file
            $marc->finish if ($marc);
        }

        # close report only after all normalize passes for this input,
        # otherwise a second pass would print to a closed handle
        if ($report_fh) {
            close($report_fh) || $log->warn("can't close $report_path: $!");
        }

    }

    eval { $indexer->finish } if ($indexer && $indexer->can('finish'));

    # $total_rows and $start_t are never reset, so these are running totals
    my $dt = time() - $start_t;
    $log->info("$total_rows records ", $indexer ? "indexed " : "",
        sprintf("in %.2f sec [%.2f rec/sec]",
            # avoid division by zero if no measurable time has passed
            $dt, ($dt ? $total_rows / $dt : 0)
        )
    );


    # end forked process
    if ($parallel) {
        $log->info("parallel process $$ finished");
        exit(0);
    }

}
509    
if ($parallel) {
    # wait all children to finish (wait returns -1 when none are left)
    sleep(1) while wait != -1;
    $log->info("all parallel processes finished");
}

#
# handle links or merge after indexing
#

if ($merge) {
    # finish the batch file created at startup, then run it
    print $estcmd_fh 'sudo /etc/init.d/hyperestraier start',$/;
    # buffered write errors show up on close
    close($estcmd_fh) or $log->warn("can't close $estcmd_path: $!");
    # low-precedence "or" so the warning is tied to chmod's result and
    # not to the (always true) path string
    chmod(0700, $estcmd_path) or $log->warn("can't chmod 0700 $estcmd_path: $!");
    system($estcmd_path) == 0 or $log->warn("$estcmd_path failed, status $?");
} else {
    # run the link callbacks collected while processing the databases
    foreach my $link (@links) {
        # dump() comes from Data::Dump, imported at the top of the file
        $log->logdie("coderef in link ", dump($link), " is ", ref($link), " and not CODE") unless (ref($link) eq 'CODE');
        $link->();
    }
}

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26