/[webpac2]/trunk/run.pl
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/run.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 301 - (show annotations)
Mon Dec 19 21:26:04 2005 UTC (18 years, 4 months ago) by dpavlin
File MIME type: text/plain
File size: 4739 byte(s)
 r322@athlon:  dpavlin | 2005-12-19 22:27:06 +0100
 make run.pl moderately chatty (along with other modules), added command line options
 (try perldoc run.pl) new target index (to reindex all) and run (to index
 first 100 records of each database)

1 #!/usr/bin/perl -w
2
3 use strict;
4
5 use Cwd qw/abs_path/;
6 use File::Temp qw/tempdir/;
7 use Data::Dumper;
8 use lib './lib';
9
10 use WebPAC::Common 0.02;
11 use WebPAC::Lookup;
12 use WebPAC::Input 0.03;
13 use WebPAC::Store 0.03;
14 use WebPAC::Normalize::XML;
15 use WebPAC::Output::TT;
16 use WebPAC::Output::Estraier 0.05;
17 use YAML qw/LoadFile/;
18 use Getopt::Long;
19 use File::Path;
20
21 =head1 NAME
22
23 run.pl - start WebPAC indexing
24
25 B<this command will probably go away. Don't get used to it!>
26
27 Options:
28
29 =over 4
30
31 =item --offset 42
32
33 start loading (all) databases at offset 42
34
35 =item --limit 100
36
37 limit loading to 100 records
38
39 =item --clean
40
41 remove database and Hyper Estraier index before indexing
42
43 =item --config conf/config.yml
44
45 path to YAML configuration file
46
47 =back
48
49 =cut
50
#
# Command line handling and configuration loading.
#
# --offset and --limit stay undefined unless given on the command line;
# --clean and --debug are simple boolean flags; --config takes the path
# of the YAML configuration file.
#
my $offset;
my $limit;

my $clean = 0;
my $config_path = 'conf/config.yml';
my $debug = 0;

GetOptions(
    "limit=i"  => \$limit,
    "offset=i" => \$offset,
    "clean"    => \$clean,
    # "=s" makes --config consume its argument. Without it Getopt::Long
    # treats the option as a flag, stores 1 and leaves the path in @ARGV.
    "config=s" => \$config_path,
    "debug"    => \$debug,
);

# $config holds the parsed YAML structure from here on
my $config = LoadFile($config_path);

print "config = ",Dumper($config) if ($debug);

# also catches YAML documents whose top level is not a mapping, which
# would otherwise die with an unhelpful "Not a HASH reference"
die "no databases in config file!\n"
    unless (ref($config) eq 'HASH' && $config->{databases});

# running count of indexed records, shared by all databases
my $total_rows = 0;
73
#
# Main loop: for every database in the configuration open its
# Hyper Estraier node and WebPAC::Store, then read, normalize and index
# the records of each configured input, and finally add the configured
# links to other databases.
#
# Uses from the enclosing script: $config (parsed YAML), $clean, $debug,
# $limit, $offset and the running counter $total_rows.
#
while (my ($database, $db_config) = each %{ $config->{databases} }) {

    my $log = WebPAC::Common->_new()->_get_logger();

    #
    # open Hyper Estraier database
    #

    my $est_config = $config->{hyperestraier} || $log->logdie("can't find 'hyperestraier' part in confguration");

    # work on a copy so the shared configuration hash is not modified
    # while we iterate over the databases
    my %est_args = (
        %{ $est_config },
        database => $database,
    );

    my $est = WebPAC::Output::Estraier->new( %est_args );

    if ($clean) {
        $log->warn("creating new empty index $database");
        $est->master( action => 'nodedel', name => $database );
        $est->master( action => 'nodeadd', name => $database, label => $database );
    }

    #
    # now WebPAC::Store
    #

    # refuse to continue without db_path: with --clean an undefined value
    # would turn the path below into "/$database" and hand that to rmtree
    my $db_root = $config->{webpac}->{db_path};
    $log->logdie("can't find 'db_path' in 'webpac' part of configuration")
        unless (defined $db_root && length $db_root);

    my $db_path = $db_root . '/' . $database;

    if ($clean) {
        $log->info("creating new database $database in $db_path");
        # rmtree returns the number of removed files, so only call it (and
        # only trust $!) when there is something to remove
        if (-e $db_path) {
            rmtree( $db_path ) || $log->warn("can't remove $db_path: $!");
        }
    } else {
        $log->info("working on $database in $db_path");
    }

    my $db = WebPAC::Store->new(
        path => $db_path,
        database => $database,
        debug => $debug,
    );

    #
    # now, iterate through input formats
    #

    # 'input' may be a single definition or a list of them
    my @inputs;
    if (ref($db_config->{input}) eq 'ARRAY') {
        @inputs = @{ $db_config->{input} };
    } elsif ($db_config->{input}) {
        push @inputs, $db_config->{input};
    } else {
        $log->info("database $database doesn't have inputs defined");
    }

    my @supported_inputs = keys %{ $config->{webpac}->{inputs} };

    foreach my $input (@inputs) {

        my $type = defined $input->{type} ? lc($input->{type}) : '';

        # exact comparison: the type is looked up as a hash key below, so a
        # substring or regex match against the key list is not good enough
        die "I know only how to handle input types ", join(",", @supported_inputs), " not '$type'!\n"
            unless (grep { $_ eq $type } @supported_inputs);

        my $lookup = WebPAC::Lookup->new(
            lookup_file => $input->{lookup},
        );

        my $input_module = $config->{webpac}->{inputs}->{$type};

        $log->info("working on input $input->{path} [$input->{type}] using $input_module");

        my $input_db = WebPAC::Input->new(
            module => $input_module,
            code_page => $config->{webpac}->{webpac_encoding},
            limit => $limit || $input->{limit},    # command line wins over config
            offset => $offset,
            lookup => $lookup,
        );
        # check the object we just tried to create, not the config entry
        $log->logdie("can't create input using $input_module") unless ($input_db);

        # return value is currently unused; kept so open() is still called
        # in scalar context
        my $maxmfn = $input_db->open(
            path => $input->{path},
            code_page => $input->{encoding},    # database encoding
        );

        my $n = WebPAC::Normalize::XML->new(
            # filter => { 'foo' => sub { shift } },
            db => $db,
            lookup_regex => $lookup->regex,
            lookup => $lookup,
            prefix => $input->{name},
        );

        # normalization rules are loaded according to the file extension
        my $normalize_path = $input->{normalize}->{path};
        $normalize_path = '' unless (defined $normalize_path);

        if ($normalize_path =~ m/\.xml$/i) {
            $n->open(
                tag => $input->{normalize}->{tag},
                xml_file => $normalize_path,
            );
        } elsif ($normalize_path =~ m/\.(?:yml|yaml)$/i) {
            $n->open_yaml(
                path => $normalize_path,
                tag => $input->{normalize}->{tag},
            );
        } else {
            # nothing was loaded; say so instead of continuing silently
            $log->warn("don't know how to load normalize rules from '$normalize_path', expected .xml, .yml or .yaml");
        }

        # the range is one longer than size; fetch returning false
        # just skips that position
        foreach my $pos ( 0 .. $input_db->size ) {

            my $row = $input_db->fetch || next;

            my $mfn = $row->{'000'}->[0];

            if (! $mfn || $mfn !~ m#^\d+$#) {
                # avoid an "uninitialized" warning when field 000 is missing
                my $bad_mfn = defined $mfn ? $mfn : '';
                $log->warn("record $pos doesn't have valid MFN but '$bad_mfn', using $pos");
                $mfn = $pos;
                push @{ $row->{'000'} }, $pos;
            }

            my $ds = $n->data_structure($row);

            $est->add(
                id => $input->{name} . "/" . $mfn,
                ds => $ds,
                type => $config->{hyperestraier}->{type},
            );

            $total_rows++;
        }

    }

    # NOTE: $total_rows is cumulative over all databases processed so far
    $log->info("$total_rows records indexed");

    #
    # add Hyper Estraier links to other databases
    #
    if (ref($db_config->{links}) eq 'ARRAY') {
        foreach my $link (@{ $db_config->{links} }) {
            $log->info("adding link $database -> $link->{to} [$link->{credit}]");
            $est->add_link(
                from => $database,
                to => $link->{to},
                credit => $link->{credit},
            );
        }
    }

}
224

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26