1 |
dpavlin |
1.1 |
#!/usr/bin/perl -w |
2 |
|
|
|
3 |
|
|
use strict; |
4 |
|
|
# Input file to clean up; it is required as the first command-line argument.
# Test for "defined and non-empty" instead of plain truth so that a file
# literally named "0" is accepted rather than treated as a missing argument.
my $infile = shift @ARGV;
die "$0 [filename]" unless defined $infile && length $infile;

# Name of the file the cleaned-up HTML is written to; it is assigned further
# down from xmlid2file($infile).
my $outfile;
7 |
|
|
# xmlid2file($href [, $pre [, $post]])
#
# Translate a URL that carries an "xmlid=" query parameter into a flat
# local file name of the form
#
#     <mode>_<xmlid>[_<view>].html[#anchor]
#
# where "/" and "%2f" inside the xmlid become "_", <mode> defaults to
# "section" when the URL has no "mode=" parameter, and a trailing
# "#fragment" of the URL is carried over to the result.
#
# $pre and $post are optional strings glued in front of and behind the
# result (the link-rewriting substitution passes the surrounding pieces of
# the <a ...> tag here).
#
# A URL without "xmlid=" is reported on STDERR and returned unchanged
# (still wrapped in $pre/$post).
sub xmlid2file {
    my ($href, $pre, $post) = @_;

    # Default the optional wrappers to the empty string.  This used to be
    # the bitwise "|=", which only worked by accident for string arguments.
    $pre  = '' unless defined $pre;
    $post = '' unless defined $post;

    # A parameter value ends at the next "&", at the "#" that starts the
    # fragment, or at the end of the URL.  Using the same rule for xmlid,
    # mode and view means a parameter is recognised wherever it appears,
    # and the fragment can never leak into the file name.
    my ($isbn) = $href =~ m/xmlid=([^&#]+)/;
    if (!defined $isbn) {
        print STDERR "skipping $href\n";
        return $pre.$href.$post;
    }

    # Flatten path separators (literal or URL-encoded) so that the result
    # is a single file name.
    $isbn =~ s!%2f!_!gi;
    $isbn =~ s!/!_!g;

    my $mode = $href =~ m/mode=([^&#]+)/ ? $1 : "section";

    if ($href =~ m/view=([^&#]+)/) {
        $isbn .= "_".$1;
    }

    $isbn .= ".html";

    # Re-attach the anchor, if any, after the file name.
    if ($href =~ m/(#.+)$/) {
        $isbn .= $1;
    }

    return $pre.$mode."_".$isbn.$post;
}
49 |
|
|
|
50 |
|
|
# The output name comes from the same mapping that is applied to the links
# further down.
$outfile = xmlid2file($infile);

# Read the whole page into memory, terminating every line (including a
# final unterminated one) with "\n".  Starting from '' keeps an empty input
# from triggering "uninitialized value" warnings before the checks below.
my $html = '';

# Three-argument open with lexical handles: the file name is taken
# literally, whatever characters it contains.
open(my $in, '<', $infile) or die "$infile: $!";
while (my $line = <$in>) {
    chomp $line;
    $html .= $line."\n";
}
close($in);

# Drop the site-name prefix from the page title.
$html =~ s,(<title>)O'Reilly Network Safari Bookshelf\s+-\s+,$1,gsi || die "$infile: title";

# Give the body a 10-unit margin instead of none.
$html =~ s,<body leftmargin="0" topmargin="0" marginwidth="0" marginheight="0">,<body leftmargin="10" topmargin="10" marginwidth="10" marginheight="10">,s || die "$infile: margins";
# Remove everything from the "toppage" anchor through a Copyright comment
# (the first ".*" is greedy on purpose-or-not; it is kept as it was).
$html =~ s,<a name="toppage">.*<!--Copyright.*?-->,,s || die "$infile: surround layout";

# Remove the left-hand button cell in front of the right-aligned cell: the
# first occurrence is mandatory, the second one only warns when missing.
$html =~ s,<td valign="top" class="v2">.*?(<td valign="top" class="v2" align="right">),$1,s || die "$infile: top buttons";
$html =~ s,<td valign="top" class="v2"><a target="_new".*?(<td valign="top" class="v2" align="right">),$1,s || warn "bottom buttons";

# Cut the page off at the "URL" paragraph and close the document there.
$html =~ s,<p><b>URL</b>.*$,</body></html>,s || die "$infile: footer";

# Strip all remaining HTML comments.
$html =~ s,<!--.+?-->,,gs;

# Rewrite every <a href="..."> so that it points at the local file name
# produced by xmlid2file(); the rest of the tag is passed through as
# prefix/suffix.
$html =~ s!(<a\s+[^>]*href=")([^"]+)("[^>]*>)!xmlid2file("$2","$1","$3")!iegs || die "$infile: links";

# Replace new-window links to http:// URLs by their plain link text.
$html =~ s!<a target="_new"[^>]*href="http://[^>]+>(.+?)</a>!$1!gs;

open(my $out, '>', $outfile) or die "$outfile: $!";
print "$outfile\n";
print {$out} $html;
# Buffered write errors (e.g. a full disk) only show up at close time.
close($out) or die "$outfile: $!";