/[webpac2]/trunk/run.pl
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/run.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 301 - (hide annotations)
Mon Dec 19 21:26:04 2005 UTC (18 years, 4 months ago) by dpavlin
File MIME type: text/plain
File size: 4739 byte(s)
 r322@athlon:  dpavlin | 2005-12-19 22:27:06 +0100
 make run.pl moderately chatty (along with other modules), added command line options
 (try perldoc run.pl) new target index (to reindex all) and run (to index
 first 100 records of each database)

1 dpavlin 74 #!/usr/bin/perl -w
2    
3     use strict;
4    
5     use Cwd qw/abs_path/;
6     use File::Temp qw/tempdir/;
7     use Data::Dumper;
8     use lib './lib';
9    
10 dpavlin 255 use WebPAC::Common 0.02;
11 dpavlin 74 use WebPAC::Lookup;
12 dpavlin 287 use WebPAC::Input 0.03;
13 dpavlin 209 use WebPAC::Store 0.03;
14 dpavlin 74 use WebPAC::Normalize::XML;
15     use WebPAC::Output::TT;
16 dpavlin 255 use WebPAC::Output::Estraier 0.05;
17 dpavlin 141 use YAML qw/LoadFile/;
18 dpavlin 301 use Getopt::Long;
19     use File::Path;
20 dpavlin 74
21 dpavlin 301 =head1 NAME
22 dpavlin 76
23 dpavlin 301 run.pl - start WebPAC indexing
24 dpavlin 141
25 dpavlin 301 B<this command will probably go away. Don't get used to it!>
26 dpavlin 141
27 dpavlin 301 Options:
28    
29     =over 4
30    
31     =item --offset 42
32    
33     start loading (all) databases at offset 42
34    
35     =item --limit 100
36    
37     limit loading to 100 records
38    
39     =item --clean
40    
41     remove database and Hyper Estraier index before indexing
42    
43     =item --config conf/config.yml
44    
45     path to YAML configuration file (default: conf/config.yml)
46    
47     =back
48    
49     =cut
50    
51     my $offset;	# --offset: passed unchanged to WebPAC::Input as 'offset'
52     my $limit;	# --limit: when set, takes precedence over an input's own 'limit'
53    
54     my $clean = 0;
55     my $config = 'conf/config.yml';	# path for now; replaced by the parsed YAML structure below
56     my $debug = 0;
57     # parse command line options (see POD above for their meaning)
58     GetOptions(
59     "limit=i" => \$limit,
60     "offset=i" => \$offset,
61     "clean" => \$clean,
62     "config=s" => \$config,	# "=s" is required: a bare "config" is a boolean flag and would store 1 instead of the path
63     "debug" => \$debug,
64     );
65    
66     $config = LoadFile($config);
67    
68     print "config = ",Dumper($config) if ($debug);
69    
70 dpavlin 210 die "no databases in config file!\n" unless ($config->{databases});
71 dpavlin 141
72 dpavlin 213 my $total_rows = 0;
73    
74 dpavlin 210 while (my ($database, $db_config) = each %{ $config->{databases} }) {
75 dpavlin 141
76 dpavlin 255 my $log = WebPAC::Common->_new()->_get_logger();
77    
78     # every entry of $config->{databases} is processed in turn:
79     # open Hyper Estraier database
80     # (the shared 'hyperestraier' config hash gets its 'database' key overwritten on each iteration)
81    
82     my $est_config = $config->{hyperestraier} || $log->logdie("can't find 'hyperestraier' part in confguration");
83     $est_config->{database} = $database;
84    
85     my $est = WebPAC::Output::Estraier->new(
86     %{ $est_config },
87     );
88    
89 dpavlin 301 if ($clean) {
90     $log->warn("creating new empty index $database");
91     $est->master( action => 'nodedel', name => $database );
92     $est->master( action => 'nodeadd', name => $database, label => $database );
93     }
94    
95 dpavlin 255 #
96     # now WebPAC::Store (kept under $config->{webpac}->{db_path}/$database)
97     #
98 dpavlin 210 my $abs_path = abs_path($0);	# NOTE(review): $abs_path is not used anywhere else in this loop
99     $abs_path =~ s#/[^/]*$#/#;
100 dpavlin 141
101 dpavlin 210 my $db_path = $config->{webpac}->{db_path} . '/' . $database;
102 dpavlin 74
103 dpavlin 301 if ($clean) {
104     $log->info("creating new database $database in $db_path");
105     rmtree( $db_path ) || $log->warn("can't remove $db_path: $!");
106     } else {
107     $log->info("working on $database in $db_path");
108     }
109 dpavlin 255
110 dpavlin 210 my $db = WebPAC::Store->new(
111     path => $db_path,
112 dpavlin 217 database => $database,
113 dpavlin 301 debug => $debug,
114 dpavlin 210 );
115 dpavlin 74
116 dpavlin 233
117 dpavlin 213 #
118     # now, iterate through input formats ('input' may be a single entry or an array of entries)
119     #
120 dpavlin 74
121 dpavlin 213 my @inputs;
122     if (ref($db_config->{input}) eq 'ARRAY') {
123     @inputs = @{ $db_config->{input} };
124 dpavlin 255 } elsif ($db_config->{input}) {
125     push @inputs, $db_config->{input};
126 dpavlin 213 } else {
127 dpavlin 255 $log->info("database $database doesn't have inputs defined");
128 dpavlin 213 }
129 dpavlin 74
130 dpavlin 286 my @supported_inputs = keys %{ $config->{webpac}->{inputs} };
131    
132 dpavlin 213 foreach my $input (@inputs) {
133 dpavlin 233
134     my $type = lc($input->{type});
135     # exact match only: a regex match would also accept substrings (or an empty type) that have no configured module
136 dpavlin 286 die "I know only how to handle input types ", join(",", @supported_inputs), " not '$type'!\n" unless (grep { $_ eq $type } @supported_inputs);
137 dpavlin 233
138     my $lookup = WebPAC::Lookup->new(
139     lookup_file => $input->{lookup},
140     );
141    
142 dpavlin 286 my $input_module = $config->{webpac}->{inputs}->{$type};
143 dpavlin 74
144 dpavlin 286 $log->info("working on input $input->{path} [$input->{type}] using $input_module");
145    
146 dpavlin 287 my $input_db = WebPAC::Input->new(
147     module => $input_module,
148 dpavlin 213 code_page => $config->{webpac}->{webpac_encoding},
149 dpavlin 301 limit => $limit || $input->{limit},	# command line --limit wins over the per-input limit
150     offset => $offset,
151 dpavlin 251 lookup => $lookup,
152 dpavlin 287 );
153 dpavlin 286 $log->logdie("can't create input using $input_module") unless ($input_db);	# check the new object, not the (always true) $input config entry
154 dpavlin 113
155 dpavlin 287 my $maxmfn = $input_db->open(	# NOTE(review): $maxmfn is not used below
156 dpavlin 285 path => $input->{path},
157 dpavlin 213 code_page => $input->{encoding}, # database encoding
158     );
159 dpavlin 113
160 dpavlin 213 my $n = WebPAC::Normalize::XML->new(
161     # filter => { 'foo' => sub { shift } },
162     db => $db,
163     lookup_regex => $lookup->regex,
164     lookup => $lookup,
165 dpavlin 221 prefix => $input->{name},
166 dpavlin 213 );
167 dpavlin 113
168 dpavlin 269 my $normalize_path = $input->{normalize}->{path};
169 dpavlin 210 # pick the normalization loader by file extension; any other extension loads nothing
170 dpavlin 269 if ($normalize_path =~ m/\.xml$/i) {
171     $n->open(
172     tag => $input->{normalize}->{tag},
173     xml_file => $input->{normalize}->{path},
174     );
175     } elsif ($normalize_path =~ m/\.(?:yml|yaml)$/i) {
176     $n->open_yaml(
177     path => $normalize_path,
178     tag => $input->{normalize}->{tag},
179     );
180     }
181    
182 dpavlin 290 foreach my $pos ( 0 ... $input_db->size ) {
183 dpavlin 210 # positions for which fetch returns nothing are skipped
184 dpavlin 287 my $row = $input_db->fetch || next;
185 dpavlin 213
186 dpavlin 291 my $mfn = $row->{'000'}->[0];
187 dpavlin 213 # fall back to the loop position when field 000 holds no numeric MFN
188 dpavlin 291 if (! $mfn || $mfn !~ m#^\d+$#) {
189 dpavlin 290 $log->warn("record $pos doesn't have valid MFN but '" . (defined $mfn ? $mfn : '') . "', using $pos");	# same text, no "uninitialized" warning when MFN is missing
190     $mfn = $pos;
191 dpavlin 291 push @{ $row->{'000'} }, $pos;
192 dpavlin 290 }
193    
194 dpavlin 213 my $ds = $n->data_structure($row);
195    
196     $est->add(
197 dpavlin 291 id => $input->{name} . "/" . $mfn,
198 dpavlin 213 ds => $ds,
199     type => $config->{hyperestraier}->{type},
200     );
201    
202     $total_rows++;
203     }
204    
205 dpavlin 210 };
206    
207     $log->info("$total_rows records indexed");	# running total: $total_rows is declared outside this loop
208 dpavlin 255
209     #
210     # add Hyper Estraier links to other databases
211     #
212     if (ref($db_config->{links}) eq 'ARRAY') {
213     foreach my $link (@{ $db_config->{links} }) {
214     $log->info("adding link $database -> $link->{to} [$link->{credit}]");
215     $est->add_link(
216     from => $database,
217     to => $link->{to},
218     credit => $link->{credit},
219     );
220     }
221     }
222    
223 dpavlin 210 }
224    

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26