WAIT/Parse/Ora.pm

#!/usr/bin/perl
#                              -*- Mode: Perl -*- 
# $Basename: HTML.pm $
# $Revision: 1.10 $
# Author          : Ulrich Pfeifer with Andreas König
# Created On      : Sat Nov 1 1997
# Last Modified By: Ulrich Pfeifer
# Last Modified On: Fri Jan  4 16:06:14 2002
# Language        : CPerl
# Update Count    : 14
# Status          : Unknown, Use with caution!
# 
# (C) Copyright 1997, Ulrich Pfeifer, all rights reserved.
# 
# 

package WAIT::Parse::Ora;
use base qw(WAIT::Parse::Base);

use HTML::Parser;
use Encode;
use strict;


=pod

Text from 2002-03-05 is structured with <div> tags as follows:

  index.html:
  <div id="biblio"> BIBLIOGRAPHISCHE ANGABEN
  <div id="short_desc"> KURZE BESCHREIBUNG

  desc.html:
  <div id="long_desc"> AUSFUEHRLICHE BESCHREIBUNG

  author.html:
  <div id="author_bio"> BIOGRAPHIE DES AUTOREN

  translator.html:
  <div id="translator_bio"> BIOGRAPHIE DES UEBERSETZERS

=cut

my $debug = 0;

my $globalp = HTML::Parser->new(
                                api_version => 3,
                                start_h => [\&handle_start, "tagname, attr"],
                                end_h   => [\&handle_end,   "tagname"],
                                text_h  => [\&handle_text,  "dtext"],
                                marked_sections => 1,
                               );
my %result;
my $text;
my $open;
my $div;

sub initialize_text {
  if (oreilly_de_catalog::config::BRUTE_FORCE_UPGRADE() ) {
    $text = "\x{100}";
  } else {
    $text = "";
  }
}

sub finished_text {
  if (oreilly_de_catalog::config::BRUTE_FORCE_UPGRADE() ) {
    $text =~ s/^\x{100}//;
  }
  $text =~ s/^\s+//;
  $text =~ s/\s+$//;
  $text =~ s/\s+/ /g;
  $text;
}

sub handle_start {
  my $tag = shift;
  my $attr = shift;

  return unless $tag eq "div";
  $div = $attr->{id};
  utf8::upgrade($div) if oreilly_de_catalog::config::UPGRADE_DIV();
  $open++;
  print ">" x $open, $tag,  "\n" if $debug;
}

sub handle_end {
  my $tag = shift;

  return unless $tag eq "div";
  print "<" x $open, $tag,  "\n" if $debug;
  $open--;
  return unless $div;
  if (defined $result{$div}){
    $result{$div} .= " " . finished_text();
  } else {
    $result{$div} = finished_text();
  }
  initialize_text();
}


sub handle_text {
  my $c = shift;
  $text .= $c if $open;
}


# WAIT::Parse::Ora::my_parse
sub my_parse ($) {
  my($s) = @_;
  my $ls;
  if (oreilly_de_catalog::config::ALLOW_LATIN_INTERMEDIATE()) {
    warn "Warning: this HTML::Parser has Unicode support on"
        if HTML::Entities::UNICODE_SUPPORT();
    $ls = Encode::encode("ISO-8859-1", $s, 1);
    # HTML::Parser 3.25, 3.26 returns **mostly** LATIN for entities.
    # We reduce the amount of mixed-encoding content to just a few
    # punctuation characters when we work with Latin here.

  } elsif (oreilly_de_catalog::config::PROTECT_UTF8_FOR_HTML_PARSER() ) {
    $ls = Encode::encode_utf8($s);
  } elsif (oreilly_de_catalog::config::PROTECT_UTF8_WITH_AMP() ) {
    $ls = $s;
    $ls =~ s/([^\000-\177])/ "&#" . ord($1) . ";" /ge;
    utf8::downgrade($ls); # didn't improve the coredumpness
    die "ls[$ls] not 7bit clean" unless $ls =~ /^[\000-\177]*$/;
    # warn "ls[$ls]";
  } else {
    $ls = $s;
  }
  %result = ();
  initialize_text();
  $open = 0;
  $div = undef;
  if (0) { # XXX probieren ueber probieren wg Entities und UTF-8
    # code that directly deals with $s because it doesn't want the
    # conversion to $ls (latin1)

    $s =~ s/\s+/ /g; # die CR nerven noch mehr als die LF
    my $pre_s = $s;
    # $s =~ s/&#153;//g;
    # $s =~ s/\302\255//g; # 2.8 pounds in 
    if ( 0 && $HTML::Parser::VERSION == 3.26 ) {
      # Should be handled by HTML::Entities, numeric entities and
      # HTML::Entities and -DUNICODE_ENTITIES => core dump

      my $saidinfo = 0;
      local $| = 1;
      while ( $s =~ s/\&\#(\d+)\;/chr($1)/e ) {
        print "Info:" unless $saidinfo++;
        print " &#$1;";
      }
      print "\n" if $saidinfo;

    }
  }
  if (oreilly_de_catalog::config::DISPOSE_PARSER_EACH_TIME() ) {
    my $p = HTML::Parser->new(
                              api_version => 3,
                              start_h => [\&handle_start, "tagname, attr"],
                              end_h   => [\&handle_end,   "tagname"],
                              text_h  => [\&handle_text,  "dtext"],
                              marked_sections => 1,
                             );
    $p->parse($ls);
    $p->eof;
  } else {
    $globalp->parse($ls);
    $globalp->eof;
  }
  if (0) { # XXX
    # code that tries to postprocess the nonsense resulting from the above
    while (my($k,$v) = each %result) {
      next unless defined($v) && length($v);
      next if Encode::is_utf8($v);
      next unless $v =~ /[^\040-\177]/;
      # Wenn UTF-8 und nicht-UTF-8 gemischt sind, sind wir erledigt
      my $utf8v;
      if (HTML::Entities::UNICODE_SUPPORT()) {
        if (0) {
          # klappt nicht == 2002-04-02
          $utf8v = Encode::decode("ISO-8859-1",$v);
        } elsif (1) {
          $utf8v = $v;
          # fuehrt zu "unexpected downgraded strings" und die haben dann
          # noch ein UTF-8 Teile, die nicht als solche markiert sind
        }
      } else {
        # Want to find out which condition we need to watch
        if ($HTML::Parser::VERSION != 3.26) {
          # klappt nicht == 2002-04-02
          $utf8v = Encode::decode("ISO-8859-1",$v);
        } else {
          # klappt nicht == 2002-04-02
          $utf8v = $v;
          Encode::_utf8_on($utf8v);
          Encode::is_utf8($utf8v, 1) or die "Not UTF8 [$utf8v]";
        }
      }
      $result{$k} = $utf8v;
    }
  }
  if ( oreilly_de_catalog::config::ALLOW_LATIN_INTERMEDIATE() ) {
    while (my($k,$v) = each %result) {
      next unless defined($v) && length($v);
      my $utf8v = Encode::decode("ISO-8859-1",$v);
      $result{$k} = $utf8v;
    }
  } elsif (oreilly_de_catalog::config::PROTECT_UTF8_FOR_HTML_PARSER()) {
    while (my($k,$v) = each %result) {
      next unless defined($v) && length($v);
      my $utf8v = Encode::decode_utf8($v);
      $result{$k} = $utf8v;
    }
  } elsif (oreilly_de_catalog::config::PROTECT_UTF8_WITH_AMP() ) {
    while (my($k,$v) = each %result) {
      next unless defined($v) && length($v);
      utf8::upgrade($v);
      $result{$k} = $v;
    }
  }
}

sub split {
  my ($self, $doc) = @_;
  my %doc = map { $_ => "" } qw(isbn author aboutauthor chapter
                                translator abouttranslator colophon
                                abstract title subtitle title_orig toc inx);

  if ($doc->{author}) {
    my_parse($doc->{author});
    $doc{aboutauthor}  = $result{author_bio};
  }
  if ($doc->{translator}) {
    my_parse($doc->{translator});
    $doc{abouttranslator}  = $result{translator_bio};
  }
  if ($doc->{index}) {
    my_parse($doc->{index});
    $doc{abstract} = $result{short_desc};
    $doc{isbn} = $result{isbn};
    $doc{author} = $result{author_names} || "";
    $doc{translator} = $result{translator_names};
    $doc{title} = $result{title};
    $doc{subtitle} = $result{subtitle};
    $doc{title_orig} = $result{title_orig};
  }
  if ($doc->{chapter}) {
    my $content = $doc->{chapter};
    my $bs;
    $bs++ if $content =~ s/^.*?<!--\s*sample chapter (begins (here )?)?-->//si;
    my $es;
    $es++ if $content =~ s/<!--\s*(End of )?sample chapter (ends here )?-->.*//si;
    unless ($bs){
      $content =~ s/^.*?<h1/<h1/si;
    }
    unless ($es){
      $content =~ s/<HR.*//si;
    }
    $content =~ s/^/<div id="chapter">/;
    $content .= "</div>\n";
    my_parse($content);
    $doc{chapter} = $result{chapter};
  }
  if ($doc->{colophon}) {
    my_parse($doc->{colophon});
    my $s = $doc{colophon} = $result{colophon};
    # use Devel::Peek;
    # Devel::Peek::Dump($s);
  }
  if ($doc->{toc}) {
    my_parse($doc->{toc});
    if (my $s = $result{book_toc}) {
      # $s =~ s/<BR>/ /ig; # very wrong! if we have <BR> here, it was &lt;BR&gt;
      $s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl
      $s =~ s/\b\d+(\.\d+)?\b//g; # 1.0 Einleitung 1.1 Zugriff
      $s =~ s/\b\d+\.//g; # 7.vi Options 8.Enhanced Tags 9.nvi-New vi
      $doc{toc} = $s;
    } else {
      die "toc[$doc->{toc}] not parseable?";
    }
  }
  if ($doc->{inx}) {
    my_parse($doc->{inx});
    my $s = $result{book_inx} || "";
    # $s =~ s/<BR>/ /ig; # wrong!, see above
    $s =~ s/&#(8211);/-/g;
    $s =~ s/&#(8220);/"/g;
    $s =~ s/&#(8222);/"/g;
    $s =~ s/&#(8217);/'/g;
    $s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl
    $s =~ s/\s*,\s+/ /g; # Komma
    1 while $s =~ s/\s\d+-\d+\s/ /g; # Seitenangaben (nicht aber das 234 aus &#234;)
    1 while $s =~ s/\s\d+\s/ /g; # Seitenangaben
    $s =~ s/(\w+)\( \)/$1()/g; # functions in the index
    $doc{inx} = $s;
  }
  if ($doc->{desc}) {
    my_parse($doc->{desc});
    $doc{desc}  = $result{long_desc};
  }

  if (0) {
    # we did really convert the stuff we just read in to UTF8
    # (although WAIT::Document::Ora::conv_getline converts to UTF8
    # itself), because my_parse did the conversion back to latin1.
    # This nonsense must stop. All routines must get and give UTF-8.
    # If they want to process something else internally, they must
    # convert twice
    while (my($k,$v) = each %doc) {
      next unless defined($v) && length($v);
      my $utf8v = Encode::decode("ISO-8859-1",$v);
      $doc{$k} = $utf8v;
    }
  }
  # warn "ALERT: No author" unless $doc{author};

  return \%doc;
}

1;
1	#!/usr/bin/perl
2	# -- Mode: Perl --
3	# $Basename: HTML.pm $
4	# $Revision: 1.10 $
5	# Author : Ulrich Pfeifer with Andreas König
6	# Created On : Sat Nov 1 1997
7	# Last Modified By: Ulrich Pfeifer
8	# Last Modified On: Fri Jan 4 16:06:14 2002
9	# Language : CPerl
10	# Update Count : 14
11	# Status : Unknown, Use with caution!
12	#
13	# (C) Copyright 1997, Ulrich Pfeifer, all rights reserved.
14	#
15	#
16
17	package WAIT::Parse::Ora;
18	use base qw(WAIT::Parse::Base);
19
20	use HTML::Parser;
21	use Encode;
22	use strict;
23
24
25	=pod
26
27	Text from 2002-03-05 is structured with <div> tags as follows:
28
29	index.html:
30	<div id="biblio"> BIBLIOGRAPHISCHE ANGABEN
31	<div id="short_desc"> KURZE BESCHREIBUNG
32
33	desc.html:
34	<div id="long_desc"> AUSFUEHRLICHE BESCHREIBUNG
35
36	author.html:
37	<div id="author_bio"> BIOGRAPHIE DES AUTOREN
38
39	translator.html:
40	<div id="translator_bio"> BIOGRAPHIE DES UEBERSETZERS
41
42	=cut
43
44	my $debug = 0;
45
46	my $globalp = HTML::Parser->new(
47	api_version => 3,
48	start_h => [\&handle_start, "tagname, attr"],
49	end_h => [\&handle_end, "tagname"],
50	text_h => [\&handle_text, "dtext"],
51	marked_sections => 1,
52	);
53	my %result;
54	my $text;
55	my $open;
56	my $div;
57
58	sub initialize_text {
59	if (oreilly_de_catalog::config::BRUTE_FORCE_UPGRADE() ) {
60	$text = "\x{100}";
61	} else {
62	$text = "";
63	}
64	}
65
66	sub finished_text {
67	if (oreilly_de_catalog::config::BRUTE_FORCE_UPGRADE() ) {
68	$text =~ s/^\x{100}//;
69	}
70	$text =~ s/^\s+//;
71	$text =~ s/\s+$//;
72	$text =~ s/\s+/ /g;
73	$text;
74	}
75
76	sub handle_start {
77	my $tag = shift;
78	my $attr = shift;
79
80	return unless $tag eq "div";
81	$div = $attr->{id};
82	utf8::upgrade($div) if oreilly_de_catalog::config::UPGRADE_DIV();
83	$open++;
84	print ">" x $open, $tag, "\n" if $debug;
85	}
86
87	sub handle_end {
88	my $tag = shift;
89
90	return unless $tag eq "div";
91	print "<" x $open, $tag, "\n" if $debug;
92	$open--;
93	return unless $div;
94	if (defined $result{$div}){
95	$result{$div} .= " " . finished_text();
96	} else {
97	$result{$div} = finished_text();
98	}
99	initialize_text();
100	}
101
102
103	sub handle_text {
104	my $c = shift;
105	$text .= $c if $open;
106	}
107
108
109	# WAIT::Parse::Ora::my_parse
110	sub my_parse ($) {
111	my($s) = @_;
112	my $ls;
113	if (oreilly_de_catalog::config::ALLOW_LATIN_INTERMEDIATE()) {
114	warn "Warning: this HTML::Parser has Unicode support on"
115	if HTML::Entities::UNICODE_SUPPORT();
116	$ls = Encode::encode("ISO-8859-1", $s, 1);
117	# HTML::Parser 3.25, 3.26 returns mostly LATIN for entities.
118	# We reduce the amount of mixed-encoding content to just a few
119	# punctuation characters when we work with Latin here.
120
121	} elsif (oreilly_de_catalog::config::PROTECT_UTF8_FOR_HTML_PARSER() ) {
122	$ls = Encode::encode_utf8($s);
123	} elsif (oreilly_de_catalog::config::PROTECT_UTF8_WITH_AMP() ) {
124	$ls = $s;
125	$ls =~ s/([^\000-\177])/ "&#" . ord($1) . ";" /ge;
126	utf8::downgrade($ls); # didn't improve the coredumpness
127	die "ls[$ls] not 7bit clean" unless $ls =~ /^[\000-\177]*$/;
128	# warn "ls[$ls]";
129	} else {
130	$ls = $s;
131	}
132	%result = ();
133	initialize_text();
134	$open = 0;
135	$div = undef;
136	if (0) { # XXX probieren ueber probieren wg Entities und UTF-8
137	# code that directly deals with $s because it doesn't want the
138	# conversion to $ls (latin1)
139
140	$s =~ s/\s+/ /g; # die CR nerven noch mehr als die LF
141	my $pre_s = $s;
142	# $s =~ s///g;
143	# $s =~ s/\302\255//g; # 2.8 pounds in
144	if ( 0 && $HTML::Parser::VERSION == 3.26 ) {
145	# Should be handled by HTML::Entities, numeric entities and
146	# HTML::Entities and -DUNICODE_ENTITIES => core dump
147
148	my $saidinfo = 0;
149	local $\| = 1;
150	while ( $s =~ s/\&\#(\d+)\;/chr($1)/e ) {
151	print "Info:" unless $saidinfo++;
152	print " &#$1;";
153	}
154	print "\n" if $saidinfo;
155
156	}
157	}
158	if (oreilly_de_catalog::config::DISPOSE_PARSER_EACH_TIME() ) {
159	my $p = HTML::Parser->new(
160	api_version => 3,
161	start_h => [\&handle_start, "tagname, attr"],
162	end_h => [\&handle_end, "tagname"],
163	text_h => [\&handle_text, "dtext"],
164	marked_sections => 1,
165	);
166	$p->parse($ls);
167	$p->eof;
168	} else {
169	$globalp->parse($ls);
170	$globalp->eof;
171	}
172	if (0) { # XXX
173	# code that tries to postprocess the nonsense resulting from the above
174	while (my($k,$v) = each %result) {
175	next unless defined($v) && length($v);
176	next if Encode::is_utf8($v);
177	next unless $v =~ /[^\040-\177]/;
178	# Wenn UTF-8 und nicht-UTF-8 gemischt sind, sind wir erledigt
179	my $utf8v;
180	if (HTML::Entities::UNICODE_SUPPORT()) {
181	if (0) {
182	# klappt nicht == 2002-04-02
183	$utf8v = Encode::decode("ISO-8859-1",$v);
184	} elsif (1) {
185	$utf8v = $v;
186	# fuehrt zu "unexpected downgraded strings" und die haben dann
187	# noch ein UTF-8 Teile, die nicht als solche markiert sind
188	}
189	} else {
190	# Want to find out which condition we need to watch
191	if ($HTML::Parser::VERSION != 3.26) {
192	# klappt nicht == 2002-04-02
193	$utf8v = Encode::decode("ISO-8859-1",$v);
194	} else {
195	# klappt nicht == 2002-04-02
196	$utf8v = $v;
197	Encode::_utf8_on($utf8v);
198	Encode::is_utf8($utf8v, 1) or die "Not UTF8 [$utf8v]";
199	}
200	}
201	$result{$k} = $utf8v;
202	}
203	}
204	if ( oreilly_de_catalog::config::ALLOW_LATIN_INTERMEDIATE() ) {
205	while (my($k,$v) = each %result) {
206	next unless defined($v) && length($v);
207	my $utf8v = Encode::decode("ISO-8859-1",$v);
208	$result{$k} = $utf8v;
209	}
210	} elsif (oreilly_de_catalog::config::PROTECT_UTF8_FOR_HTML_PARSER()) {
211	while (my($k,$v) = each %result) {
212	next unless defined($v) && length($v);
213	my $utf8v = Encode::decode_utf8($v);
214	$result{$k} = $utf8v;
215	}
216	} elsif (oreilly_de_catalog::config::PROTECT_UTF8_WITH_AMP() ) {
217	while (my($k,$v) = each %result) {
218	next unless defined($v) && length($v);
219	utf8::upgrade($v);
220	$result{$k} = $v;
221	}
222	}
223	}
224
225	sub split {
226	my ($self, $doc) = @_;
227	my %doc = map { $_ => "" } qw(isbn author aboutauthor chapter
228	translator abouttranslator colophon
229	abstract title subtitle title_orig toc inx);
230
231	if ($doc->{author}) {
232	my_parse($doc->{author});
233	$doc{aboutauthor} = $result{author_bio};
234	}
235	if ($doc->{translator}) {
236	my_parse($doc->{translator});
237	$doc{abouttranslator} = $result{translator_bio};
238	}
239	if ($doc->{index}) {
240	my_parse($doc->{index});
241	$doc{abstract} = $result{short_desc};
242	$doc{isbn} = $result{isbn};
243	$doc{author} = $result{author_names} \|\| "";
244	$doc{translator} = $result{translator_names};
245	$doc{title} = $result{title};
246	$doc{subtitle} = $result{subtitle};
247	$doc{title_orig} = $result{title_orig};
248	}
249	if ($doc->{chapter}) {
250	my $content = $doc->{chapter};
251	my $bs;
252	$bs++ if $content =~ s/^.?<!--\ssample chapter (begins (here )?)?-->//si;
253	my $es;
254	$es++ if $content =~ s/<!--\s(End of )?sample chapter (ends here )?-->.//si;
255	unless ($bs){
256	$content =~ s/^.*?<h1/<h1/si;
257	}
258	unless ($es){
259	$content =~ s/<HR.*//si;
260	}
261	$content =~ s/^/<div id="chapter">/;
262	$content .= "</div>\n";
263	my_parse($content);
264	$doc{chapter} = $result{chapter};
265	}
266	if ($doc->{colophon}) {
267	my_parse($doc->{colophon});
268	my $s = $doc{colophon} = $result{colophon};
269	# use Devel::Peek;
270	# Devel::Peek::Dump($s);
271	}
272	if ($doc->{toc}) {
273	my_parse($doc->{toc});
274	if (my $s = $result{book_toc}) {
275	# $s =~ s/<BR>/ /ig; # very wrong! if we have <BR> here, it was <BR>
276	$s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl
277	$s =~ s/\b\d+(\.\d+)?\b//g; # 1.0 Einleitung 1.1 Zugriff
278	$s =~ s/\b\d+\.//g; # 7.vi Options 8.Enhanced Tags 9.nvi-New vi
279	$doc{toc} = $s;
280	} else {
281	die "toc[$doc->{toc}] not parseable?";
282	}
283	}
284	if ($doc->{inx}) {
285	my_parse($doc->{inx});
286	my $s = $result{book_inx} \|\| "";
287	# $s =~ s/<BR>/ /ig; # wrong!, see above
288	$s =~ s/&#(8211);/-/g;
289	$s =~ s/&#(8220);/"/g;
290	$s =~ s/&#(8222);/"/g;
291	$s =~ s/&#(8217);/'/g;
292	$s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl
293	$s =~ s/\s*,\s+/ /g; # Komma
294	1 while $s =~ s/\s\d+-\d+\s/ /g; # Seitenangaben (nicht aber das 234 aus ê)
295	1 while $s =~ s/\s\d+\s/ /g; # Seitenangaben
296	$s =~ s/(\w+)\( \)/$1()/g; # functions in the index
297	$doc{inx} = $s;
298	}
299	if ($doc->{desc}) {
300	my_parse($doc->{desc});
301	$doc{desc} = $result{long_desc};
302	}
303
304	if (0) {
305	# we did really convert the stuff we just read in to UTF8
306	# (although WAIT::Document::Ora::conv_getline converts to UTF8
307	# itself), because my_parse did the conversion back to latin1.
308	# This nonsense must stop. All routines must get and give UTF-8.
309	# If they want to process something else internally, they must
310	# convert twice
311	while (my($k,$v) = each %doc) {
312	next unless defined($v) && length($v);
313	my $utf8v = Encode::decode("ISO-8859-1",$v);
314	$doc{$k} = $utf8v;
315	}
316	}
317	# warn "ALERT: No author" unless $doc{author};
318
319	return \%doc;
320	}
321
322	1;