#!/usr/bin/perl -w

use strict;

# Path of the saved HTML page to convert; it is the only command-line
# argument. The check is for a defined, non-empty value rather than plain
# truth, so that a file literally named "0" is not mistaken for a missing
# argument.
my $infile = shift @ARGV;
die "$0 [filename]" unless defined $infile && length $infile;

# Name of the cleaned-up file to write; computed from $infile by
# xmlid2file() once that sub has been defined.
my $outfile;
# Translate a link (or file name) carrying an "xmlid=" query parameter into
# the name of the corresponding local HTML file.
#
# Arguments:
#   $href - URL or file name to translate
#   $pre  - optional text prepended to the result (defaults to '')
#   $post - optional text appended to the result (defaults to '')
#
# Returns "$pre<mode>_<xmlid>[_<view>].html[#anchor]$post", where <mode>
# falls back to "section" when $href has no mode parameter. If $href has no
# xmlid parameter, a "skipping" note is printed to STDERR and
# "$pre$href$post" is returned, i.e. the link is left untouched.
sub xmlid2file {
    my ($href, $pre, $post) = @_;
    $pre  = '' unless defined $pre;
    $post = '' unless defined $post;

    # A parameter value ends at the next '&', at a '#' anchor, or at the
    # end of the string, so parameters are recognised in any position.
    my ($isbn) = $href =~ m/xmlid=([^&#]+)/i;
    unless (defined $isbn) {
        print STDERR "skipping $href\n";
        return $pre.$href.$post;
    }

    # Slashes, literal or URL-encoded, cannot be part of a file name.
    $isbn =~ s!%2f!_!gi;
    $isbn =~ s!/!_!g;

    my $mode = "section";
    if ($href =~ m/mode=([^&#]+)/) {
        $mode = $1;
    }

    if ($href =~ m/view=([^&#]+)/) {
        $isbn .= "_".$1;
    }

    $isbn .= ".html";

    # Carry the anchor over so the link still points into the page.
    if ($href =~ m/(#.+)$/) {
        $isbn .= $1;
    }

    return $pre.$mode."_".$isbn.$post;
}
# Main program: read $infile, strip the site chrome around the page content,
# rewrite its links to local file names, write the result to $outfile and
# give $outfile the timestamps of $infile.

$outfile = xmlid2file($infile);

# Read the whole page into one string, making sure every line (including
# the last one) ends in a newline.
open(my $in, '<', $infile) or die "$infile: $!";
my $html = '';
while (my $line = <$in>) {
    chomp $line;
    $html .= $line."\n";
}
close($in);

# Remember the source timestamps before anything is written, so they are
# still the original ones even if $outfile turns out to be $infile itself.
# stat() must be called in list context: with "||" it would be evaluated in
# scalar context and yield a single true value instead of the field list.
my @s = stat($infile) or die "stat $infile: $!";

# Drop the site name prefix from the page title.
$html =~ s!(<title>)O'Reilly Network Safari Bookshelf\s+-\s+!$1!gsi or die "$infile: title";

# Give the page a small margin instead of none.
$html =~ s!<body leftmargin="0" topmargin="0" marginwidth="0" marginheight="0">!<body leftmargin="10" topmargin="10" marginwidth="10" marginheight="10">!s or die "$infile: margins";

# Remove everything from the "toppage" anchor up to and including the
# Copyright comment.
$html =~ s;<a name="toppage">.*<!--Copyright.*?-->;;s or die "$infile: surround layout";

# Remove the left-hand button cells, keeping the right-aligned cell that
# follows them. The first occurrence is mandatory, the second only warns.
$html =~ s!<td valign="top" class="v2">.*?(<td valign="top" class="v2" align="right">)!$1!s or die "$infile: top buttons";
$html =~ s!<td valign="top" class="v2"><a target="_new".*?(<td valign="top" class="v2" align="right">)!$1!s or warn "bottom buttons";

# Cut the page off at the "URL" paragraph and close the document there.
$html =~ s!<p><b>URL</b>.*$!</body></html>!s or die "$infile: footer";

# Strip all remaining HTML comments.
$html =~ s;<!--.+?-->;;gs;

# Rewrite every link target to its local file name; links without an xmlid
# parameter are passed through unchanged by xmlid2file().
$html =~ s!(<a\s+[^>]*href=")([^"]+)("[^>]*>)!xmlid2file("$2","$1","$3")!iegs or die "$infile: links";

# Replace external links that open in a new window by their link text.
$html =~ s!<a target="_new"[^>]*href="http://[^>]+>(.+?)</a>!$1!gs;

open(my $out, '>', $outfile) or die "$outfile: $!";
print "$outfile\n";
print {$out} $html;
# Buffered write errors only surface when the handle is closed.
close($out) or die "$outfile: $!";

# fix timestamp
# stat fields: atime = 8, mtime = 9 (utime takes access time, then
# modification time)
utime($s[8], $s[9], $outfile) or die "touch $outfile: $!";