/[wait]/trunk/script/cpanwait
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/script/cpanwait

Parent Directory Parent Directory | Revision Log Revision Log


Revision 88 - (show annotations)
Mon May 24 13:44:01 2004 UTC (20 years ago) by dpavlin
File size: 19470 byte(s)
move cvs-head to trunk

1 #!/usr/bin/perl -w
2 ######################### -*- Mode: Perl -*- #########################
3 ##
4 ## $Basename: cpanwait $
5 ## $Revision: 1.7 $
6 ##
7 ## Author : Ulrich Pfeifer
8 ## Created On : Sat Jan 4 18:09:28 1997
9 ##
10 ## Last Modified By : Ulrich Pfeifer
11 ## Last Modified On : Sun Nov 22 18:44:36 1998
12 ##
13 ## Copyright (c) 1996-1997, Ulrich Pfeifer
14 ##
15 ##
16 ######################################################################
17
18 use strict;
19
20 use File::Path;
21 use DB_File;
22 use Getopt::Long;
23 use File::Find;
24 use File::Basename;
25 use IO::File;
26
27 use lib '/data/wait/lib';
28
29 require WAIT::Config;
30 require WAIT::Database;
31 require WAIT::Parse::Pod;
32 require WAIT::Document::Tar;
33
34 sub fname($);
35
36 # maximum number of archives to index (set to -1 for unlimited)
37 my $max = -1;
38
39 my %OPT = (database => 'DB',
40 dir => $WAIT::Config->{WAIT_home} || '/tmp',
41 table => 'cpan',
42 clean => 0,
43 remove => [],
44 force => 0,
45 # cpan => '/usr/src/perl/CPAN/sources',
46 cpan => '/rest/cpan/CPAN/',
47 trust_mtime => 1,
48 match => 'authors/id/',
49 test => 0,
50 # cpan => 'ftp://ftp.rz.ruhr-uni-bochum.de/pub/CPAN',
51 # cpan => 'ftp://ftp.uni-hamburg.de:/pub/soft/lang/perl/CPAN',
52 keep => '/tmp/CPAN/',
53 );
54
55 GetOptions(\%OPT,
56 'database=s',
57 'dir=s',
58 'cpan=s',
59 'table=s',
60 'keep=s',
61 'match=s',
62 'clean!',
63 'test=i', # test level 0: normal
64 # 1: don't change db
65 # 2: don't look at archives even
66
67 'remove=s@',
68 'force!', # force indexing even if seen
69 'trust_mtime!', # use mtime instead of version number
70 'max=i',
71 'reorg!',
72 ) || die "Usage: ...\n";
73
74 $max ||= $OPT{max};
75
76 clean_database(
77 database => $OPT{database},
78 dir => $OPT{dir},
79 table => $OPT{table},
80 ) if $OPT{clean};
81
82 my $db = WAIT::Database->open(
83 name => $OPT{database},
84 'directory' => $OPT{dir},
85 )
86 || WAIT::Database->create(
87 name => $OPT{database},
88 'directory' => $OPT{dir},
89 )
90 or die "Could not open/create database '$OPT{dir}/$OPT{database}': $@";
91
92 my $layout= new WAIT::Parse::Pod;
93
94 my $tb = $db->table(name => $OPT{table})
95 || create_table(db => $db, table => $OPT{table}, layout => $layout);
96
97 # Map e.g. '.../latest' to 'perl'. Used in wanted(). Effects version
98 # considerations. Value *must* match common prefix. Aliasing should be
99 # used if CPAN contains serveral distributions with different name but
100 # same root directory.
101 # We still have a problem if there are different root directories!
102
103 my %ALIAS = (# tar name real (root) name
104 'latest' => 'perl',
105 'perl5db-kit' => 'DB',
106 'SGI-FM' => 'FM',
107 'net-ext' => 'Net',
108 'VelocisSQL' => 'Velocis',
109 'Net-ext' => 'Net',
110 'Curses-DevKit' => 'Cdk',
111 'PostgresPerl' => 'Postgres',
112 'perlpdf' => 'PERLPDF',
113 'Des-perl' => 'Des',
114 'SGI-GL' => 'GL',
115 'DBD-DB2' => 'DB2',
116 );
117 my %NEW_ALIAS; # found in this pass
118
119 # Map module names to pathes. Generated by wanted() doing alisaing.
120 my %ARCHIVE;
121
122 # Map module names to latest version. Generated by wanted()
123 my %VERSION;
124
125
126 # Mapping for modules with common root not matching modules name that
127 # are not aliased. This is just for prefix stripping and not strictly
128 # necessary. Takes effect after version considerations.
129 my %TR = (# tar name root to strip
130 'Net_SSLeay.pm' => 'SSLeay/',
131 'EventDrivenServer' => 'Server/',
132 'bio_lib.pl.' => '',
133 'AlarmCall' => 'Sys/',
134 'Cdk-ext' => 'Cdk/',
135 'Sx' => '\d.\d/',
136 'DumpStack' => 'Devel/',
137 'StatisticsDescriptive'=> 'Statistics/',
138 'Term-Gnuplot' => 'Gnuplot/',
139 'iodbc_ext' => 'iodbc-ext-\d.\d/',
140 'UNIVERSAL' => '',
141 'Term-Query' => 'Query/',
142 'SelfStubber' => 'Devel/',
143 'CallerItem' => 'Devel/',
144 );
145
146 my $DIR = $tb->dir;
147 my $DATA = $tb->dir . "/data";
148 my $LWP;
149
150
151 if (@{$OPT{remove}}) {
152 my $pod;
153 for $pod (@{$OPT{remove}}) {
154 unless (-e $pod) {
155 $pod = "$DIR/$pod";
156 }
157 index_pod(file => $pod, remove => 1) if -f $pod;
158 unlink $pod or warn "Could not unlink '$pod': $!\n";
159 #$tb->sync;
160 }
161 $tb->close;
162 $db->close;
163 exit;
164 }
165
166 # Now get the beef
167 if ($OPT{cpan} =~ /^(http|ftp):/) {
168 $LWP = 1;
169 require LWP::Simple;
170 LWP::Simple->import();
171
172 mkpath($DATA,1,0755) or
173 die "Could not generate '$DATA/': $!"
174 unless -d $DATA;
175
176 if (! -f "$DATA/find-ls.gz" or -M "$DATA/find-ls.gz" > 0.5) {
177 my $status = mirror("$OPT{cpan}/indices/find-ls.gz", "$DATA/find-ls.gz");
178 if ($status != &RC_OK and $status != &RC_NOT_MODIFIED) {
179 # we could use Net:FTP here ...
180 die "Was unable to mirror '$OPT{cpan}/indices/find-ls.gz'\n";
181 }
182 }
183 my $fh = new IO::File "gzip -cd $DATA/find-ls.gz |";
184 die "Could not open 'gzip -cd $DATA/find-ls.gz': !$\n" unless $fh;
185
186 my $line;
187 while (defined ($line = <$fh>)) {
188 chomp($line);
189 my ($mon, $mday, $time, $file, $is_link) = (split ' ', $line)[7..11];
190
191 next if defined $is_link;
192 my $mtime = mtime($mon, $mday, $time);
193
194 $file =~ s:^\./::;
195 ($_) = fileparse($file);
196 $File::Find::name = $file;
197 wanted($mtime);
198 }
199 } else {
200 find(sub {&wanted((stat($_))[9])}, $OPT{cpan});
201 }
202
203 ARCHIVE:
204 for my $tar (sort keys %ARCHIVE) {
205 next if $OPT{match} and $ARCHIVE{$tar} !~ /$OPT{match}/o;
206 my $base = (split /\//, $ARCHIVE{$tar})[-1];
207 my $parent;
208
209 # logging
210 if ($OPT{trust_mtime}) {
211 printf "%-20s %10s %s\t", $tar,
212 substr(scalar(localtime($VERSION{$tar})),0,10), $base;
213 } else {
214 printf "%-20s %10.5f %s\t", $tar, $VERSION{$tar}, $base;
215 }
216
217 # Remember the archive
218 # We should have an extra table for the tar file data ...
219 if (!$OPT{force} and $tb->have(docid => $base)) {
220 print "skipping\n";
221 next ARCHIVE;
222 } else {
223 $parent = $tb->insert(docid => $base,
224 headline => $ARCHIVE{$tar}) unless $OPT{test};
225 print "indexing\n";
226 }
227
228 next ARCHIVE if $OPT{test} > 1;
229
230 my $TAR = myget($tar);
231
232 next ARCHIVE unless $TAR; # not able to fetch it
233
234 my %tar;
235 tie (%tar,
236 'WAIT::Document::Tar',
237 sub { $_[0] =~ /\.(pm|pod|PL)$/i or $_[0] =~ /readme/i},
238 #sub { $_[0] !~ m:/$: },
239 $TAR)
240 or warn "Could not tie '$TAR'\n";
241
242 my $sloppy;
243 my ($key, $val);
244
245 FILE:
246 while (($key, $val) = each %tar) {
247 my $file = fname($key);
248
249 # don't index directories
250 next if $file =~ /\/$/;
251
252 # is it a POD file?
253 next FILE unless $file =~ /readme/i or $val =~ /\n=head/;
254
255 # remove directory prefix
256 unless ($sloppy # no common root
257 or $file =~ s:^\Q$tar\E[^/]*/:: # common root, maybe alias
258 or ($TR{$tar} # common root, not aliased
259 and $file =~ s:^\Q$TR{$tar}\E::)
260 ) {
261 # try to determine an alias
262 warn "Bad directory prefix: '$file'\n";
263 my ($prefix) = split /\//, $file;
264
265 while ($key = (tied %tar)->NEXTKEY) {
266 my $file = fname($key);
267
268 next if $file =~ /\/$/;
269 unless ($file =~ m:^$prefix/: or $file eq $prefix) {
270 warn "Archive contains different prefixes: $prefix,$file\n";
271 $prefix = '';
272 last;
273 }
274 }
275 if ($prefix) {
276 print "Please alias '$tar' to '$prefix' next time!\n";
277 print "See alias table later.\n";
278 $NEW_ALIAS{$tar} = $prefix;
279 $tb->delete_by_key($parent);
280 next ARCHIVE;
281 } else {
282 print "Assuming that tar file name $tar is a valid prefix\n";
283 $sloppy = 1;
284
285 # We may reset too much here! But that this is not exact
286 # science anyway. Maybe we should ignore using 'next ARCHIVE'.
287
288 $key = (tied %tar)->FIRSTKEY;
289 redo FILE;
290 }
291 }
292
293 # remove /lib prefix
294 $file =~ s:^lib/::;
295
296 # generate new path
297 my $path = "$DATA/$tar/$file";
298
299 my ($sbase, $sdir) = fileparse($path);
300 my $fh;
301
302 unless ($OPT{test}) {
303 if (-f $path) {
304 index_pod(file => $path, remove => 1);
305 unlink $path or warn "Could not unlink '$path' $!\n";
306 } elsif (!-d $sdir) {
307 mkpath($sdir,1,0755) or die "Could not mkpath($sdir): $!\n";
308 }
309 $fh = new IO::File "> $path";
310 die "Could not write '$path': $!\n" unless $fh;
311 }
312
313 if ($file =~ /readme|install/i) { # make READMEs verbatim pods
314 $val =~ s/\n/\n /g;
315 $val = "=head1 NAME\n\n$tar $file\n\n=head1 DESCRIPTION\n\n $val"
316 unless $val =~ /^=head/m;
317 } else { # remove non-pod stuff
318 my $nval = $val; $val = '';
319 my $cutting = 1;
320
321 for (split /\n/, $nval) {
322 if (/^=cut|!NO!SUBS!/) {
323 $cutting = 1;
324 } elsif ($cutting and /^=head/) {
325 $cutting = 0;
326 }
327 unless ($cutting) {
328 $val .= $_ . "\n";
329 }
330 }
331 }
332 unless ($OPT{test}) {
333 $fh->print($val);
334 index_pod(file => $path, parent => $parent,
335 text => $val, source => $ARCHIVE{$tar});
336 }
337 }
338
339 if ($LWP and !$OPT{keep}) {
340 unlink $TAR or warn
341 "Could not unlink '$TAR': $!\n";
342 }
343 }
344
345 if (%NEW_ALIAS) {
346 print "\%ALIAS = (\n";
347 for (keys %NEW_ALIAS) {
348 print "\t'$_'\t=> '$NEW_ALIAS{$_}',\n";
349 }
350 print "\t);\n";
351 }
352
353 if ($OPT{reorg}) {
354 my $now = time;
355 warn "Starting reorg\n";
356 $tb->set(top=>1);
357 warn sprintf "Finished reorg %d seconds\n", time - $now;
358 }
359
360 # we are done
361 $db->close();
362 exit;
363
364 sub fname ($) {
365 my $key = shift;
366 my ($ntar, $file) = split $;, $key;
367
368 # remove leading './' - shudder
369 $file =~ s/^\.\///;
370
371 return($file);
372 }
373
374 sub myget {
375 my $tar = shift;
376 my $TAR;
377
378 if ($LWP) { # fetch the archive
379 if ($OPT{keep}) {
380 $TAR = "$OPT{keep}/$ARCHIVE{$tar}";
381 print "Keeping in '$TAR'\n" unless -e $TAR;
382 my ($base, $path) = fileparse($TAR);
383 unless (-d $path) {
384 mkpath($path,1,0755) or
385 die "Could not mkpath($path)\n";
386 }
387 } else {
388 $TAR = "/tmp/$tar.tar.gz";
389 }
390 unless (-e $TAR) { # lwp mirror seems to fetch ftp: in any case?
391 print "Fetching $OPT{cpan}/$ARCHIVE{$tar}\n";
392 my $status = mirror("$OPT{cpan}/$ARCHIVE{$tar}", $TAR);
393 if ($status != &RC_OK and $status != &RC_NOT_MODIFIED) {
394 warn "Was unable to mirror '$ARCHIVE{$tar}, skipping'\n";
395 return;
396 }
397 }
398 } else {
399 $TAR = $ARCHIVE{$tar};
400 }
401 $TAR;
402 }
403
404 sub index_pod {
405 my %parm = @_;
406 my $did = $parm{file};
407 my $rel_did = $did;
408 my $abs_did = $did;
409
410 if ($rel_did =~ s:$DIR/::) {
411 $abs_did = "$DIR/$rel_did";
412 }
413
414 undef $did;
415
416 # check for both variants
417 if ($tb->have('docid' => $rel_did)) {
418 $did = $rel_did;
419 } elsif ($tb->have('docid' => $abs_did)) {
420 $did = $abs_did;
421 }
422 if ($did) { # have it version
423 if (!$parm{remove}) {
424 warn "duplicate: $did\n";
425 return;
426 }
427 } else { # not seen yet
428 $did = $rel_did;
429 if ($parm{remove}) {
430 print "missing: $did\n";
431 return;
432 }
433 }
434
435 $parm{'text'} ||= WAIT::Document::Find->FETCH($abs_did);
436
437 unless (defined $parm{'text'}) {
438 print "unavailable: $did\n";
439 return;
440 }
441
442 my $record = $layout->split($parm{'text'});
443
444 if (! $record) {
445 print "empty pod: $did\n";
446 return;
447 }
448
449 $record->{size} = length($parm{'text'});
450 my $headline = $record->{name} || $did;
451
452 $headline =~ s/^$DATA//o; # $did
453 $headline =~ s/\s+/ /g; $headline =~ s/^\s+//;
454
455 printf "%s %s\n", ($parm{remove})?'-':'+', substr($headline,0,70);
456 if ($parm{remove}) {
457 $tb->delete('docid' => $did,
458 headline => $headline,
459 %{$record});
460 } else {
461 $tb->insert('docid' => $did,
462 headline => $headline,
463 source => $parm{source},
464 parent => $parm{parent},
465 %{$record});
466 }
467 }
468
469 # This *must* remove the version in *any* case. It should compute a
470 # resonable version number - but usually mtimes should be used.
471 sub version {
472 local ($_) = @_;
473
474 # remove alpha/beta postfix
475 s/([-_\d])(a|b|alpha|beta|src)$/$1/;
476
477 # jperl1.3@4.019.tar.gz
478 s/@\d.\d+//;
479
480 # oraperl-v2.4-gk.tar.gz
481 s/-v(\d)/$1/;
482
483 # lettered versions - shudder
484 s/([-_\d\.])([a-z])([\d\._])/sprintf "$1%02d$3", ord(lc $2) - ord('a') /ei;
485 s/([-_\d\.])([a-z])$/sprintf "$1%02d", ord(lc $2) - ord('a') /ei;
486
487 # thanks libwww-5b12 ;-)
488 s/(\d+)b/($1-1).'.'/e;
489 s/(\d+)a/($1-2).'.'/e;
490
491 # replace '-pre' by '0.'
492 s/-pre([\.\d])/-0.$1/;
493 s/\.\././g;
494 s/(\d)_(\d)/$1$2/g;
495
496 # chop '[-.]' and thelike
497 s/\W$//;
498
499 # ram's versions Storable-0.4@p
500 s/\@/./;
501
502 if (s/[-_]?(\d+)\.(0\d+)\.(\d+)$//) {
503 return($_, $1 + "0.$2" + $3 / 1000000);
504 } elsif (s/[-_]?(\d+)\.(\d+)\.(\d+)$//) {
505 return($_, $1 + $2/1000 + $3 / 1000000);
506 } elsif (s/[-_]?(\d+\.[\d_]+)$//) {
507 return($_, $1);
508 } elsif (s/[-_]?([\d_]+)$//) {
509 return($_, $1);
510 } elsif (s/-(\d+.\d+)-/-/) { # perl-4.019-ref-guide
511 return($_, $1);
512 } else {
513 if ($_ =~ /\d/) { # smells like an unknown scheme
514 warn "Odd version Numbering: '$File::Find::name'\n";
515 return($_, undef);
516 } else { # assume version 0
517 warn "No version Numbering: '$File::Find::name'\n";
518 return($_, 0);
519 }
520
521 }
522 }
523
524 sub wanted {
525 my $mtime = shift; # called by parse_file_ls();
526
527 return unless /^(.*)\.tar(\.gz|\.Z)$/;
528
529 return if (! $max);
530 $max--;
531
532 my ($archive, $version) = version($1);
533
534 unless (defined $version) {
535 warn "Skipping $1\n";
536 return;
537 }
538
539 # Check for file alias
540 $archive = $ALIAS{$archive} if $ALIAS{$archive};
541
542 # Check for path alias.
543 if ($File::Find::name =~ m(/CPAN/(?:source/)?(.*\Q$archive\E))) {
544 if ($ALIAS{$1}) {
545 $archive = $ALIAS{$1};
546 }
547 }
548
549 if ($OPT{trust_mtime}) {
550 $version = $mtime;
551 } else {
552 $version =~ s/(\d)_/$1/;
553 $version ||= $mtime; # mtime
554 }
555
556 if (!exists $ARCHIVE{$archive}
557 or $VERSION{$archive} < $version) {
558 $ARCHIVE{$archive} = $File::Find::name;
559 $VERSION{$archive} = $version;
560 }
561 }
562
563 sub clean_database {
564 my %parm = @_;
565
566 my $db = WAIT::Database->open(
567 name => $parm{database},
568 'directory' => $parm{dir},
569 )
570 or die "Could not open database '$parm{dir}/$parm{database}': $@";
571 my $tbl = $db->table(name => $parm{table});
572 if ($tbl) {
573 $tbl->drop or
574 die "Could not open table '$parm{tabel}': $@";
575 }
576
577 $db->close;
578 }
579
580 sub create_table {
581 my %parm = @_;
582
583 my $access = bless {}, 'WAIT::Document::Find';
584
585 my $stem = [{
586 'prefix' => ['isotr', 'isolc'],
587 'intervall' => ['isotr', 'isolc'],
588 }, 'isotr', 'isolc', 'split2', 'stop', 'Stem'];
589 my $text = [{
590 'prefix' => ['isotr', 'isolc'],
591 'intervall' => ['isotr', 'isolc'],
592 },
593 'isotr', 'isolc', 'split2', 'stop'];
594 my $sound = ['isotr', 'isolc', 'split2', 'Soundex'],;
595
596 my $tb =
597 $parm{db}->create_table
598 (name => $parm{table},
599 attr => ['docid', 'headline', 'source', 'size', 'parent'],
600 keyset => [['docid']],
601 layout => $parm{layout},
602 access => $access,
603 invindex =>
604 [
605 'name' => $stem,
606 'synopsis' => $stem,
607 'bugs' => $stem,
608 'description' => $stem,
609 'text' => $stem,
610 'environment' => $text,
611 'example' => $text, 'example' => $stem,
612 'author' => $sound, 'author' => $stem,
613 ]
614 );
615 die "Could not create table '$parm{table}'" unless $tb;
616 $tb;
617 }
618
619 my %MON;
620 my $YEAR;
621
622 BEGIN {
623 my $i = 1;
624 for (qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)) {
625 $MON{$_} = $i++;
626 }
627 $YEAR = (localtime(time))[5];
628 }
629
630 # We could/should use Date::GetDate here
631 use Time::Local;
632 sub mtime {
633 my ($mon, $mday, $time) = @_;
634 my ($hour, $min, $year, $monn) = (0,0);
635
636 if ($time =~ /(\d+):(\d+)/) {
637 ($hour, $min) = ($1, $2);
638 $year = $YEAR;
639 } else {
640 $year = $time;
641 }
642 $monn = $MON{$mon} || $MON{ucfirst lc $mon} || warn "Unknown month: '$mon'";
643 my $guess = timelocal(0,$min,$hour,$mday,$monn-1,$year);
644 if ($guess > time) {
645 $guess = timelocal(0,$min,$hour,$mday,$monn-1,$year-1);
646 }
647 $guess;
648 }
649
650
651 __END__
652 ## ###################################################################
653 ## pod
654 ## ###################################################################
655
656 =head1 NAME
657
658 cpanwait - generate an WAIT index for CPAN
659
660 =head1 SYNOPSIS
661
662 B<cpanwait>
663 [B<-clean>] [B<-noclean>]
664 [B<-cpan> I<url or directory>]
665 [B<-database> I<dbname>]
666 [B<-dir> I<directory>]
667 [B<-force>] [B<-noforce>]
668 [B<-keep> I<directory>]
669 [B<-match> I<regexp>]
670 [B<-table> I<table name>]
671 [B<-test> I<level>]
672 [B<-trust_mtime>] [B<-notrust_mtime>]
673
674 =head1 DESCRIPTION
675
676 TBS
677
678 =head1 OPTIONS
679
680 =over 5
681
682 =item B<-clean> / B<-noclean>
683
684 Clean the table befor indexing. Default is B<off>.
685
686 =item B<-cpan> I<url or directory>
687
688 Default directory or URL for indexing. If an URL is given, there
689 currently must be a file F<indices/find-ls.gz> relative to it which
690 contains the output of C<find . -ls | gzip>.
691 Default is F<ftp://ftp.rz.ruhr-uni-bochum.de/pub/CPAN>.
692
693
694 =item B<-database> I<dbname>
695
696 Specify database name. Default is F<DB>.
697
698 =item B<-dir> I<directory>
699
700 Alternate directory were databases are located. Default is the
701 directory specified during configuration of WAIT.
702
703 =item B<-force>
704
705 Force reindexing, even if B<cpan> thinks files are up to date.
706 Default is B<off>
707
708 =item B<-keep> I<directory>
709
710 If fetching from a remote server, keep files in I<directory>. Default is
711 F</app/unido-i06/src/share/lang/perl/96a/CPAN/sources>.
712
713 =item B<-match> I<regexp>
714
715 Limit to patches matching I<regexp>. Default is F<authors/id/>.
716
717 =item B<-table> I<table name>
718
719 Specify an alternate table name. Default is C<cpan>.
720
721 =item B<-test> I<level>
722
723 Set test level, were B<0> means normal operation, B<1> means, don't
724 really index and B<2> means, don't even get archives and examine them.
725
726 =item B<-trust_mtime> / B<-notrust_mtime>
727
728 If B<on>, the files mtimes are used to decide, which version of an
729 archive is the newest. If b<off>, the version extracted is used
730 (beware, there are far more version numbering schemes than B<cpan> can
731 parse).
732
733 =head1 AUTHOR
734
735 Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortumund.de>E<gt>

Properties

Name Value
cvs2svn:cvs-rev 1.1
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26