1 |
#!/usr/bin/perl |
#!/usr/bin/perl |
2 |
# -*- Mode: Perl -*- |
# -*- Mode: Perl -*- |
3 |
# $Basename: HTML.pm $ |
# $Basename: HTML.pm $ |
4 |
# $Revision: 1.9 $ |
# $Revision: 1.10 $ |
5 |
# Author : Ulrich Pfeifer with Andreas König |
# Author : Ulrich Pfeifer with Andreas König |
6 |
# Created On : Sat Nov 1 1997 |
# Created On : Sat Nov 1 1997 |
7 |
# Last Modified By: Ulrich Pfeifer |
# Last Modified By: Ulrich Pfeifer |
43 |
|
|
44 |
my $debug = 0; |
my $debug = 0; |
45 |
|
|
46 |
my $p = HTML::Parser->new( |
my $globalp = HTML::Parser->new( |
47 |
api_version => 3, |
api_version => 3, |
48 |
start_h => [\&handle_start, "tagname, attr"], |
start_h => [\&handle_start, "tagname, attr"], |
49 |
end_h => [\&handle_end, "tagname"], |
end_h => [\&handle_end, "tagname"], |
50 |
text_h => [\&handle_text, "dtext"], |
text_h => [\&handle_text, "dtext"], |
51 |
marked_sections => 1, |
marked_sections => 1, |
52 |
); |
); |
53 |
my %result; |
my %result; |
54 |
my $text; |
my $text; |
55 |
my $open; |
my $open; |
56 |
my $div; |
my $div; |
57 |
|
|
58 |
|
sub initialize_text { |
59 |
|
if (oreilly_de_catalog::config::BRUTE_FORCE_UPGRADE() ) { |
60 |
|
$text = "\x{100}"; |
61 |
|
} else { |
62 |
|
$text = ""; |
63 |
|
} |
64 |
|
} |
65 |
|
|
66 |
|
sub finished_text { |
67 |
|
if (oreilly_de_catalog::config::BRUTE_FORCE_UPGRADE() ) { |
68 |
|
$text =~ s/^\x{100}//; |
69 |
|
} |
70 |
|
$text =~ s/^\s+//; |
71 |
|
$text =~ s/\s+$//; |
72 |
|
$text =~ s/\s+/ /g; |
73 |
|
$text; |
74 |
|
} |
75 |
|
|
76 |
sub handle_start { |
sub handle_start { |
77 |
my $tag = shift; |
my $tag = shift; |
78 |
my $attr = shift; |
my $attr = shift; |
79 |
|
|
80 |
return unless $tag eq "div"; |
return unless $tag eq "div"; |
81 |
$div = $attr->{id}; |
$div = $attr->{id}; |
82 |
|
utf8::upgrade($div) if oreilly_de_catalog::config::UPGRADE_DIV(); |
83 |
$open++; |
$open++; |
84 |
print ">" x $open, $tag, "\n" if $debug; |
print ">" x $open, $tag, "\n" if $debug; |
85 |
} |
} |
91 |
print "<" x $open, $tag, "\n" if $debug; |
print "<" x $open, $tag, "\n" if $debug; |
92 |
$open--; |
$open--; |
93 |
return unless $div; |
return unless $div; |
|
$text =~ s/^\s+//; |
|
|
$text =~ s/\s+$//; |
|
|
$text =~ s/\s+/ /g; |
|
94 |
if (defined $result{$div}){ |
if (defined $result{$div}){ |
95 |
$result{$div} .= " $text"; |
$result{$div} .= " " . finished_text(); |
96 |
} else { |
} else { |
97 |
$result{$div} = $text; |
$result{$div} = finished_text(); |
98 |
} |
} |
99 |
$text = ''; |
initialize_text(); |
100 |
} |
} |
101 |
|
|
102 |
|
|
106 |
} |
} |
107 |
|
|
108 |
|
|
109 |
|
# WAIT::Parse::Ora::my_parse |
110 |
sub my_parse ($) { |
sub my_parse ($) { |
111 |
my($s) = @_; |
my($s) = @_; |
112 |
my $ls = Encode::encode("ISO-8859-1", $s, 1); # HTML::Parser returns |
my $ls; |
113 |
# LATIN for entities |
if (oreilly_de_catalog::config::ALLOW_LATIN_INTERMEDIATE()) { |
114 |
# and we would get |
warn "Warning: this HTML::Parser has Unicode support on" |
115 |
# mixed content in |
if HTML::Entities::UNICODE_SUPPORT(); |
116 |
# result |
$ls = Encode::encode("ISO-8859-1", $s, 1); |
117 |
|
# HTML::Parser 3.25, 3.26 returns **mostly** LATIN for entities. |
118 |
|
# We reduce the amount of mixed-encoding content to just a few |
119 |
|
# punctuation characters when we work with Latin here. |
120 |
|
|
121 |
|
} elsif (oreilly_de_catalog::config::PROTECT_UTF8_FOR_HTML_PARSER() ) { |
122 |
|
$ls = Encode::encode_utf8($s); |
123 |
|
} elsif (oreilly_de_catalog::config::PROTECT_UTF8_WITH_AMP() ) { |
124 |
|
$ls = $s; |
125 |
|
$ls =~ s/([^\000-\177])/ "&#" . ord($1) . ";" /ge; |
126 |
|
utf8::downgrade($ls); # didn't improve the coredumpness |
127 |
|
die "ls[$ls] not 7bit clean" unless $ls =~ /^[\000-\177]*$/; |
128 |
|
# warn "ls[$ls]"; |
129 |
|
} else { |
130 |
|
$ls = $s; |
131 |
|
} |
132 |
%result = (); |
%result = (); |
133 |
$text = ''; |
initialize_text(); |
134 |
$open = 0; |
$open = 0; |
135 |
$div = undef; |
$div = undef; |
136 |
$p->parse($ls); |
if (0) { # XXX probieren ueber probieren wg Entities und UTF-8 |
137 |
$p->eof; |
# code that directly deals with $s because it doesn't want the |
138 |
} |
# conversion to $ls (latin1) |
139 |
|
|
140 |
|
$s =~ s/\s+/ /g; # die CR nerven noch mehr als die LF |
141 |
|
my $pre_s = $s; |
142 |
|
# $s =~ s/™//g; |
143 |
|
# $s =~ s/\302\255//g; # 2.8 pounds in |
144 |
|
if ( 0 && $HTML::Parser::VERSION == 3.26 ) { |
145 |
|
# Should be handled by HTML::Entities, numeric entities and |
146 |
|
# HTML::Entities and -DUNICODE_ENTITIES => core dump |
147 |
|
|
148 |
|
my $saidinfo = 0; |
149 |
|
local $| = 1; |
150 |
|
while ( $s =~ s/\&\#(\d+)\;/chr($1)/e ) { |
151 |
|
print "Info:" unless $saidinfo++; |
152 |
|
print " &#$1;"; |
153 |
|
} |
154 |
|
print "\n" if $saidinfo; |
155 |
|
|
156 |
|
} |
157 |
|
} |
158 |
|
if (oreilly_de_catalog::config::DISPOSE_PARSER_EACH_TIME() ) { |
159 |
|
my $p = HTML::Parser->new( |
160 |
|
api_version => 3, |
161 |
|
start_h => [\&handle_start, "tagname, attr"], |
162 |
|
end_h => [\&handle_end, "tagname"], |
163 |
|
text_h => [\&handle_text, "dtext"], |
164 |
|
marked_sections => 1, |
165 |
|
); |
166 |
|
$p->parse($ls); |
167 |
|
$p->eof; |
168 |
|
} else { |
169 |
|
$globalp->parse($ls); |
170 |
|
$globalp->eof; |
171 |
|
} |
172 |
|
if (0) { # XXX |
173 |
|
# code that tries to postprocess the nonsense resulting from the above |
174 |
|
while (my($k,$v) = each %result) { |
175 |
|
next unless defined($v) && length($v); |
176 |
|
next if Encode::is_utf8($v); |
177 |
|
next unless $v =~ /[^\040-\177]/; |
178 |
|
# Wenn UTF-8 und nicht-UTF-8 gemischt sind, sind wir erledigt |
179 |
|
my $utf8v; |
180 |
|
if (HTML::Entities::UNICODE_SUPPORT()) { |
181 |
|
if (0) { |
182 |
|
# klappt nicht == 2002-04-02 |
183 |
|
$utf8v = Encode::decode("ISO-8859-1",$v); |
184 |
|
} elsif (1) { |
185 |
|
$utf8v = $v; |
186 |
|
# fuehrt zu "unexpected downgraded strings" und die haben dann |
187 |
|
# noch ein UTF-8 Teile, die nicht als solche markiert sind |
188 |
|
} |
189 |
|
} else { |
190 |
|
# Want to find out which condition we need to watch |
191 |
|
if ($HTML::Parser::VERSION != 3.26) { |
192 |
|
# klappt nicht == 2002-04-02 |
193 |
|
$utf8v = Encode::decode("ISO-8859-1",$v); |
194 |
|
} else { |
195 |
|
# klappt nicht == 2002-04-02 |
196 |
|
$utf8v = $v; |
197 |
|
Encode::_utf8_on($utf8v); |
198 |
|
Encode::is_utf8($utf8v, 1) or die "Not UTF8 [$utf8v]"; |
199 |
|
} |
200 |
|
} |
201 |
|
$result{$k} = $utf8v; |
202 |
|
} |
203 |
|
} |
204 |
|
if ( oreilly_de_catalog::config::ALLOW_LATIN_INTERMEDIATE() ) { |
205 |
|
while (my($k,$v) = each %result) { |
206 |
|
next unless defined($v) && length($v); |
207 |
|
my $utf8v = Encode::decode("ISO-8859-1",$v); |
208 |
|
$result{$k} = $utf8v; |
209 |
|
} |
210 |
|
} elsif (oreilly_de_catalog::config::PROTECT_UTF8_FOR_HTML_PARSER()) { |
211 |
|
while (my($k,$v) = each %result) { |
212 |
|
next unless defined($v) && length($v); |
213 |
|
my $utf8v = Encode::decode_utf8($v); |
214 |
|
$result{$k} = $utf8v; |
215 |
|
} |
216 |
|
} elsif (oreilly_de_catalog::config::PROTECT_UTF8_WITH_AMP() ) { |
217 |
|
while (my($k,$v) = each %result) { |
218 |
|
next unless defined($v) && length($v); |
219 |
|
utf8::upgrade($v); |
220 |
|
$result{$k} = $v; |
221 |
|
} |
222 |
|
} |
223 |
|
} |
224 |
|
|
225 |
sub split { |
sub split { |
226 |
my ($self, $doc) = @_; |
my ($self, $doc) = @_; |
227 |
my %doc = map { $_ => "" } qw(isbn author aboutauthor |
my %doc = map { $_ => "" } qw(isbn author aboutauthor chapter |
228 |
translator abouttranslator colophon |
translator abouttranslator colophon |
229 |
abstract title subtitle title_orig toc inx); |
abstract title subtitle title_orig toc inx); |
230 |
|
|
240 |
my_parse($doc->{index}); |
my_parse($doc->{index}); |
241 |
$doc{abstract} = $result{short_desc}; |
$doc{abstract} = $result{short_desc}; |
242 |
$doc{isbn} = $result{isbn}; |
$doc{isbn} = $result{isbn}; |
243 |
$doc{author} = $result{author_names}; |
$doc{author} = $result{author_names} || ""; |
244 |
$doc{translator} = $result{translator_names}; |
$doc{translator} = $result{translator_names}; |
245 |
$doc{title} = $result{title}; |
$doc{title} = $result{title}; |
246 |
$doc{subtitle} = $result{subtitle}; |
$doc{subtitle} = $result{subtitle}; |
247 |
$doc{title_orig} = $result{title_orig}; |
$doc{title_orig} = $result{title_orig}; |
248 |
} |
} |
249 |
|
if ($doc->{chapter}) { |
250 |
|
my $content = $doc->{chapter}; |
251 |
|
my $bs; |
252 |
|
$bs++ if $content =~ s/^.*?<!--\s*sample chapter (begins (here )?)?-->//si; |
253 |
|
my $es; |
254 |
|
$es++ if $content =~ s/<!--\s*(End of )?sample chapter (ends here )?-->.*//si; |
255 |
|
unless ($bs){ |
256 |
|
$content =~ s/^.*?<h1/<h1/si; |
257 |
|
} |
258 |
|
unless ($es){ |
259 |
|
$content =~ s/<HR.*//si; |
260 |
|
} |
261 |
|
$content =~ s/^/<div id="chapter">/; |
262 |
|
$content .= "</div>\n"; |
263 |
|
my_parse($content); |
264 |
|
$doc{chapter} = $result{chapter}; |
265 |
|
} |
266 |
if ($doc->{colophon}) { |
if ($doc->{colophon}) { |
267 |
my_parse($doc->{colophon}); |
my_parse($doc->{colophon}); |
268 |
$doc{colophon} = $result{colophon}; |
my $s = $doc{colophon} = $result{colophon}; |
269 |
|
# use Devel::Peek; |
270 |
|
# Devel::Peek::Dump($s); |
271 |
} |
} |
272 |
if ($doc->{toc}) { |
if ($doc->{toc}) { |
273 |
my_parse($doc->{toc}); |
my_parse($doc->{toc}); |
274 |
my $s = $result{book_toc}; |
if (my $s = $result{book_toc}) { |
275 |
$s =~ s/<BR>/ /ig; |
# $s =~ s/<BR>/ /ig; # very wrong! if we have <BR> here, it was <BR> |
276 |
$s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl |
$s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl |
277 |
$s =~ s/\b\d+(\.\d+)?\b//g; # 1.0 Einf\x{fc}hrung 1.1 Zugriff |
$s =~ s/\b\d+(\.\d+)?\b//g; # 1.0 Einleitung 1.1 Zugriff |
278 |
$s =~ s/\b\d+\.//g; # 7.vi Options 8.Enhanced Tags 9.nvi-New vi |
$s =~ s/\b\d+\.//g; # 7.vi Options 8.Enhanced Tags 9.nvi-New vi |
279 |
$doc{toc} = $s; |
$doc{toc} = $s; |
280 |
|
} else { |
281 |
|
die "toc[$doc->{toc}] not parseable?"; |
282 |
|
} |
283 |
} |
} |
284 |
if ($doc->{inx}) { |
if ($doc->{inx}) { |
285 |
my_parse($doc->{inx}); |
my_parse($doc->{inx}); |
286 |
my $s = $result{book_inx} || ""; |
my $s = $result{book_inx} || ""; |
287 |
$s =~ s/<BR>/ /ig; |
# $s =~ s/<BR>/ /ig; # wrong!, see above |
288 |
$s =~ s/&#(8211);/-/g; |
$s =~ s/&#(8211);/-/g; |
289 |
$s =~ s/&#(8220);/"/g; |
$s =~ s/&#(8220);/"/g; |
290 |
$s =~ s/&#(8222);/"/g; |
$s =~ s/&#(8222);/"/g; |
301 |
$doc{desc} = $result{long_desc}; |
$doc{desc} = $result{long_desc}; |
302 |
} |
} |
303 |
|
|
304 |
while (my($k,$v) = each %doc) { |
if (0) { |
305 |
next unless defined($v) && length($v); |
# we did really convert the stuff we just read in to UTF8 |
306 |
my $utf8v = Encode::decode("ISO-8859-1",$v); |
# (although WAIT::Document::Ora::conv_getline converts to UTF8 |
307 |
$doc{$k} = $utf8v; |
# itself), because my_parse did the conversion back to latin1. |
308 |
|
# This nonsense must stop. All routines must get and give UTF-8. |
309 |
|
# If they want to process something else internally, they must |
310 |
|
# convert twice |
311 |
|
while (my($k,$v) = each %doc) { |
312 |
|
next unless defined($v) && length($v); |
313 |
|
my $utf8v = Encode::decode("ISO-8859-1",$v); |
314 |
|
$doc{$k} = $utf8v; |
315 |
|
} |
316 |
} |
} |
317 |
# warn "ALERT: No author" unless $doc{author}; |
# warn "ALERT: No author" unless $doc{author}; |
318 |
|
|