/[wait]/cvs-head/lib/WAIT/Parse/Ora.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /cvs-head/lib/WAIT/Parse/Ora.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 76 - (show annotations)
Sat Apr 6 19:00:54 2002 UTC (22 years, 2 months ago) by laperla
File size: 9511 byte(s)
- Makefile.PL: remove spurious trailing slash

- Oreilly bugfixes, new article per book "chapter/index.html" and many
  switches for HTML::Parser debugging

1 #!/usr/bin/perl
2 # -*- Mode: Perl -*-
3 # $Basename: HTML.pm $
4 # $Revision: 1.10 $
5 # Author : Ulrich Pfeifer with Andreas König
6 # Created On : Sat Nov 1 1997
7 # Last Modified By: Ulrich Pfeifer
8 # Last Modified On: Fri Jan 4 16:06:14 2002
9 # Language : CPerl
10 # Update Count : 14
11 # Status : Unknown, Use with caution!
12 #
13 # (C) Copyright 1997, Ulrich Pfeifer, all rights reserved.
14 #
15 #
16
17 package WAIT::Parse::Ora;
18 use base qw(WAIT::Parse::Base);
19
20 use HTML::Parser;
21 use Encode;
22 use strict;
23
24
25 =pod
26
27 Text from 2002-03-05 is structured with <div> tags as follows:
28
29 index.html:
30 <div id="biblio"> BIBLIOGRAPHISCHE ANGABEN
31 <div id="short_desc"> KURZE BESCHREIBUNG
32
33 desc.html:
34 <div id="long_desc"> AUSFUEHRLICHE BESCHREIBUNG
35
36 author.html:
37 <div id="author_bio"> BIOGRAPHIE DES AUTOREN
38
39 translator.html:
40 <div id="translator_bio"> BIOGRAPHIE DES UEBERSETZERS
41
42 =cut
43
44 my $debug = 0;
45
46 my $globalp = HTML::Parser->new(
47 api_version => 3,
48 start_h => [\&handle_start, "tagname, attr"],
49 end_h => [\&handle_end, "tagname"],
50 text_h => [\&handle_text, "dtext"],
51 marked_sections => 1,
52 );
53 my %result;
54 my $text;
55 my $open;
56 my $div;
57
58 sub initialize_text {
59 if (oreilly_de_catalog::config::BRUTE_FORCE_UPGRADE() ) {
60 $text = "\x{100}";
61 } else {
62 $text = "";
63 }
64 }
65
66 sub finished_text {
67 if (oreilly_de_catalog::config::BRUTE_FORCE_UPGRADE() ) {
68 $text =~ s/^\x{100}//;
69 }
70 $text =~ s/^\s+//;
71 $text =~ s/\s+$//;
72 $text =~ s/\s+/ /g;
73 $text;
74 }
75
76 sub handle_start {
77 my $tag = shift;
78 my $attr = shift;
79
80 return unless $tag eq "div";
81 $div = $attr->{id};
82 utf8::upgrade($div) if oreilly_de_catalog::config::UPGRADE_DIV();
83 $open++;
84 print ">" x $open, $tag, "\n" if $debug;
85 }
86
87 sub handle_end {
88 my $tag = shift;
89
90 return unless $tag eq "div";
91 print "<" x $open, $tag, "\n" if $debug;
92 $open--;
93 return unless $div;
94 if (defined $result{$div}){
95 $result{$div} .= " " . finished_text();
96 } else {
97 $result{$div} = finished_text();
98 }
99 initialize_text();
100 }
101
102
103 sub handle_text {
104 my $c = shift;
105 $text .= $c if $open;
106 }
107
108
109 # WAIT::Parse::Ora::my_parse
110 sub my_parse ($) {
111 my($s) = @_;
112 my $ls;
113 if (oreilly_de_catalog::config::ALLOW_LATIN_INTERMEDIATE()) {
114 warn "Warning: this HTML::Parser has Unicode support on"
115 if HTML::Entities::UNICODE_SUPPORT();
116 $ls = Encode::encode("ISO-8859-1", $s, 1);
117 # HTML::Parser 3.25, 3.26 returns **mostly** LATIN for entities.
118 # We reduce the amount of mixed-encoding content to just a few
119 # punctuation characters when we work with Latin here.
120
121 } elsif (oreilly_de_catalog::config::PROTECT_UTF8_FOR_HTML_PARSER() ) {
122 $ls = Encode::encode_utf8($s);
123 } elsif (oreilly_de_catalog::config::PROTECT_UTF8_WITH_AMP() ) {
124 $ls = $s;
125 $ls =~ s/([^\000-\177])/ "&#" . ord($1) . ";" /ge;
126 utf8::downgrade($ls); # didn't improve the coredumpness
127 die "ls[$ls] not 7bit clean" unless $ls =~ /^[\000-\177]*$/;
128 # warn "ls[$ls]";
129 } else {
130 $ls = $s;
131 }
132 %result = ();
133 initialize_text();
134 $open = 0;
135 $div = undef;
136 if (0) { # XXX probieren ueber probieren wg Entities und UTF-8
137 # code that directly deals with $s because it doesn't want the
138 # conversion to $ls (latin1)
139
140 $s =~ s/\s+/ /g; # die CR nerven noch mehr als die LF
141 my $pre_s = $s;
142 # $s =~ s/&#153;//g;
143 # $s =~ s/\302\255//g; # 2.8 pounds in
144 if ( 0 && $HTML::Parser::VERSION == 3.26 ) {
145 # Should be handled by HTML::Entities, numeric entities and
146 # HTML::Entities and -DUNICODE_ENTITIES => core dump
147
148 my $saidinfo = 0;
149 local $| = 1;
150 while ( $s =~ s/\&\#(\d+)\;/chr($1)/e ) {
151 print "Info:" unless $saidinfo++;
152 print " &#$1;";
153 }
154 print "\n" if $saidinfo;
155
156 }
157 }
158 if (oreilly_de_catalog::config::DISPOSE_PARSER_EACH_TIME() ) {
159 my $p = HTML::Parser->new(
160 api_version => 3,
161 start_h => [\&handle_start, "tagname, attr"],
162 end_h => [\&handle_end, "tagname"],
163 text_h => [\&handle_text, "dtext"],
164 marked_sections => 1,
165 );
166 $p->parse($ls);
167 $p->eof;
168 } else {
169 $globalp->parse($ls);
170 $globalp->eof;
171 }
172 if (0) { # XXX
173 # code that tries to postprocess the nonsense resulting from the above
174 while (my($k,$v) = each %result) {
175 next unless defined($v) && length($v);
176 next if Encode::is_utf8($v);
177 next unless $v =~ /[^\040-\177]/;
178 # Wenn UTF-8 und nicht-UTF-8 gemischt sind, sind wir erledigt
179 my $utf8v;
180 if (HTML::Entities::UNICODE_SUPPORT()) {
181 if (0) {
182 # klappt nicht == 2002-04-02
183 $utf8v = Encode::decode("ISO-8859-1",$v);
184 } elsif (1) {
185 $utf8v = $v;
186 # fuehrt zu "unexpected downgraded strings" und die haben dann
187 # noch ein UTF-8 Teile, die nicht als solche markiert sind
188 }
189 } else {
190 # Want to find out which condition we need to watch
191 if ($HTML::Parser::VERSION != 3.26) {
192 # klappt nicht == 2002-04-02
193 $utf8v = Encode::decode("ISO-8859-1",$v);
194 } else {
195 # klappt nicht == 2002-04-02
196 $utf8v = $v;
197 Encode::_utf8_on($utf8v);
198 Encode::is_utf8($utf8v, 1) or die "Not UTF8 [$utf8v]";
199 }
200 }
201 $result{$k} = $utf8v;
202 }
203 }
204 if ( oreilly_de_catalog::config::ALLOW_LATIN_INTERMEDIATE() ) {
205 while (my($k,$v) = each %result) {
206 next unless defined($v) && length($v);
207 my $utf8v = Encode::decode("ISO-8859-1",$v);
208 $result{$k} = $utf8v;
209 }
210 } elsif (oreilly_de_catalog::config::PROTECT_UTF8_FOR_HTML_PARSER()) {
211 while (my($k,$v) = each %result) {
212 next unless defined($v) && length($v);
213 my $utf8v = Encode::decode_utf8($v);
214 $result{$k} = $utf8v;
215 }
216 } elsif (oreilly_de_catalog::config::PROTECT_UTF8_WITH_AMP() ) {
217 while (my($k,$v) = each %result) {
218 next unless defined($v) && length($v);
219 utf8::upgrade($v);
220 $result{$k} = $v;
221 }
222 }
223 }
224
225 sub split {
226 my ($self, $doc) = @_;
227 my %doc = map { $_ => "" } qw(isbn author aboutauthor chapter
228 translator abouttranslator colophon
229 abstract title subtitle title_orig toc inx);
230
231 if ($doc->{author}) {
232 my_parse($doc->{author});
233 $doc{aboutauthor} = $result{author_bio};
234 }
235 if ($doc->{translator}) {
236 my_parse($doc->{translator});
237 $doc{abouttranslator} = $result{translator_bio};
238 }
239 if ($doc->{index}) {
240 my_parse($doc->{index});
241 $doc{abstract} = $result{short_desc};
242 $doc{isbn} = $result{isbn};
243 $doc{author} = $result{author_names} || "";
244 $doc{translator} = $result{translator_names};
245 $doc{title} = $result{title};
246 $doc{subtitle} = $result{subtitle};
247 $doc{title_orig} = $result{title_orig};
248 }
249 if ($doc->{chapter}) {
250 my $content = $doc->{chapter};
251 my $bs;
252 $bs++ if $content =~ s/^.*?<!--\s*sample chapter (begins (here )?)?-->//si;
253 my $es;
254 $es++ if $content =~ s/<!--\s*(End of )?sample chapter (ends here )?-->.*//si;
255 unless ($bs){
256 $content =~ s/^.*?<h1/<h1/si;
257 }
258 unless ($es){
259 $content =~ s/<HR.*//si;
260 }
261 $content =~ s/^/<div id="chapter">/;
262 $content .= "</div>\n";
263 my_parse($content);
264 $doc{chapter} = $result{chapter};
265 }
266 if ($doc->{colophon}) {
267 my_parse($doc->{colophon});
268 my $s = $doc{colophon} = $result{colophon};
269 # use Devel::Peek;
270 # Devel::Peek::Dump($s);
271 }
272 if ($doc->{toc}) {
273 my_parse($doc->{toc});
274 if (my $s = $result{book_toc}) {
275 # $s =~ s/<BR>/ /ig; # very wrong! if we have <BR> here, it was &lt;BR&gt;
276 $s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl
277 $s =~ s/\b\d+(\.\d+)?\b//g; # 1.0 Einleitung 1.1 Zugriff
278 $s =~ s/\b\d+\.//g; # 7.vi Options 8.Enhanced Tags 9.nvi-New vi
279 $doc{toc} = $s;
280 } else {
281 die "toc[$doc->{toc}] not parseable?";
282 }
283 }
284 if ($doc->{inx}) {
285 my_parse($doc->{inx});
286 my $s = $result{book_inx} || "";
287 # $s =~ s/<BR>/ /ig; # wrong!, see above
288 $s =~ s/&#(8211);/-/g;
289 $s =~ s/&#(8220);/"/g;
290 $s =~ s/&#(8222);/"/g;
291 $s =~ s/&#(8217);/'/g;
292 $s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl
293 $s =~ s/\s*,\s+/ /g; # Komma
294 1 while $s =~ s/\s\d+-\d+\s/ /g; # Seitenangaben (nicht aber das 234 aus &#234;)
295 1 while $s =~ s/\s\d+\s/ /g; # Seitenangaben
296 $s =~ s/(\w+)\( \)/$1()/g; # functions in the index
297 $doc{inx} = $s;
298 }
299 if ($doc->{desc}) {
300 my_parse($doc->{desc});
301 $doc{desc} = $result{long_desc};
302 }
303
304 if (0) {
305 # we did really convert the stuff we just read in to UTF8
306 # (although WAIT::Document::Ora::conv_getline converts to UTF8
307 # itself), because my_parse did the conversion back to latin1.
308 # This nonsense must stop. All routines must get and give UTF-8.
309 # If they want to process something else internally, they must
310 # convert twice
311 while (my($k,$v) = each %doc) {
312 next unless defined($v) && length($v);
313 my $utf8v = Encode::decode("ISO-8859-1",$v);
314 $doc{$k} = $utf8v;
315 }
316 }
317 # warn "ALERT: No author" unless $doc{author};
318
319 return \%doc;
320 }
321
322 1;

Properties

Name Value
cvs2svn:cvs-rev 1.10

  ViewVC Help
Powered by ViewVC 1.1.26