WAIT/Parse/Ora.pm

#!/usr/bin/perl
#                              -*- Mode: Perl -*- 
# $Basename: HTML.pm $
# $Revision: 1.8 $
# Author          : Ulrich Pfeifer with Andreas König
# Created On      : Sat Nov 1 1997
# Last Modified By: Ulrich Pfeifer
# Last Modified On: Fri Jan  4 16:06:14 2002
# Language        : CPerl
# Update Count    : 14
# Status          : Unknown, Use with caution!
# 
# (C) Copyright 1997, Ulrich Pfeifer, all rights reserved.
# 
# 

package WAIT::Parse::Ora;
use base qw(WAIT::Parse::Base);

use HTML::Parser;
use Encode;
use strict;


=pod

Text from 2002-03-05 is structured with <div> tags as follows:

  index.html:
  <div id="biblio"> BIBLIOGRAPHISCHE ANGABEN
  <div id="short_desc"> KURZE BESCHREIBUNG

  desc.html:
  <div id="long_desc"> AUSFUEHRLICHE BESCHREIBUNG

  author.html:
  <div id="author_bio"> BIOGRAPHIE DES AUTOREN

  translator.html:
  <div id="translator_bio"> BIOGRAPHIE DES UEBERSETZERS

=cut

my $debug = 0;

my $p = HTML::Parser->new(
                          api_version => 3,
                          start_h => [\&handle_start, "tagname, attr"],
                          end_h   => [\&handle_end,   "tagname"],
                          text_h  => [\&handle_text,  "dtext"],
                          marked_sections => 1,
                         );
my %result;
my $text;
my $open;
my $div;

sub handle_start {
  my $tag = shift;
  my $attr = shift;

  return unless $tag eq "div";
  $div = $attr->{id};
  $open++;
  print ">" x $open, $tag,  "\n" if $debug;
}

sub handle_end {
  my $tag = shift;

  return unless $tag eq "div";
  print "<" x $open, $tag,  "\n" if $debug;
  $open--;
  $text =~ s/^\s+//;
  $text =~ s/\s+$//;
  $text =~ s/\s+/ /g;
  $result{$div} .= $text . ' ';
  $text = '';
}


sub handle_text {
  my $c = shift;
  $text .= $c if $open;
}


sub my_parse ($) {
  my($s) = @_;
  my $ls = Encode::encode("ISO-8859-1", $s, 1); # HTML::Parser returns
                                                # LATIN for entities
                                                # and we would get
                                                # mixed content in
                                                # result
  %result = ();
  $text = '';
  $open = 0;
  $div = undef;
  $p->parse($ls);
  $p->eof;
}


sub split {
  my ($self, $doc) = @_;
  my %doc = map { $_ => "" } qw(isbn author aboutauthor
                                translator abouttranslator colophon
                                abstract title subtitle title_orig toc inx);

  if ($doc->{author}) {
    my_parse($doc->{author});
    $doc{aboutauthor}  = $result{author_bio};
  }
  if ($doc->{translator}) {
    my_parse($doc->{translator});
    $doc{abouttranslator}  = $result{translator_bio};
  }
  if ($doc->{index}) {
    my_parse($doc->{index});
    $doc{abstract} = $result{short_desc};
    $doc{isbn} = $result{isbn};
    $doc{author} = $result{author_names};
    $doc{translator} = $result{translator_names};
    $doc{title} = $result{title};
    $doc{subtitle} = $result{subtitle};
    $doc{title_orig} = $result{title_orig};
  }
  if ($doc->{colophon}) {
    my_parse($doc->{colophon});
    $doc{colophon} = $result{colophon};
  }
  if ($doc->{toc}) {
    my_parse($doc->{toc});
    my $s = $result{book_toc};
    $s =~ s/<BR>/ /ig;
    $s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl
    $s =~ s/\b\d+(\.\d+)?\b//g; # 1.0 Einf\x{fc}hrung 1.1 Zugriff
    $s =~ s/\b\d+\.//g; # 7.vi Options 8.Enhanced Tags 9.nvi-New vi
    $doc{toc} = $s;
  }
  if ($doc->{inx}) {
    my_parse($doc->{inx});
    my $s = $result{book_inx};
    $s =~ s/<BR>/ /ig;
    $s =~ s/&#(8211);/-/g;
    $s =~ s/&#(8220);/"/g;
    $s =~ s/&#(8222);/"/g;
    $s =~ s/&#(8217);/'/g;
    $s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl
    $s =~ s/\s*,\s+/ /g; # Komma
    1 while $s =~ s/\s\d+-\d+\s/ /g; # Seitenangaben (nicht aber das 234 aus &#234;)
    1 while $s =~ s/\s\d+\s/ /g; # Seitenangaben
    $s =~ s/(\w+)\( \)/$1()/g; # functions in the index
    $doc{inx} = $s;
  }
  if ($doc->{desc}) {
    my_parse($doc->{desc});
    $doc{desc}  = $result{long_desc};
  }

  while (my($k,$v) = each %doc) {
    next unless defined($v) && length($v);
    my $utf8v = Encode::decode("ISO-8859-1",$v);
    $doc{$k} = $utf8v;
  }
  # warn "ALERT: No author" unless $doc{author};

  return \%doc;
}

1;
1	#!/usr/bin/perl
2	# -- Mode: Perl --
3	# $Basename: HTML.pm $
4	# $Revision: 1.8 $
5	# Author : Ulrich Pfeifer with Andreas König
6	# Created On : Sat Nov 1 1997
7	# Last Modified By: Ulrich Pfeifer
8	# Last Modified On: Fri Jan 4 16:06:14 2002
9	# Language : CPerl
10	# Update Count : 14
11	# Status : Unknown, Use with caution!
12	#
13	# (C) Copyright 1997, Ulrich Pfeifer, all rights reserved.
14	#
15	#
16
17	package WAIT::Parse::Ora;
18	use base qw(WAIT::Parse::Base);
19
20	use HTML::Parser;
21	use Encode;
22	use strict;
23
24
25	=pod
26
27	Text from 2002-03-05 is structured with <div> tags as follows:
28
29	index.html:
30	<div id="biblio"> BIBLIOGRAPHISCHE ANGABEN
31	<div id="short_desc"> KURZE BESCHREIBUNG
32
33	desc.html:
34	<div id="long_desc"> AUSFUEHRLICHE BESCHREIBUNG
35
36	author.html:
37	<div id="author_bio"> BIOGRAPHIE DES AUTOREN
38
39	translator.html:
40	<div id="translator_bio"> BIOGRAPHIE DES UEBERSETZERS
41
42	=cut
43
44	my $debug = 0;
45
46	my $p = HTML::Parser->new(
47	api_version => 3,
48	start_h => [\&handle_start, "tagname, attr"],
49	end_h => [\&handle_end, "tagname"],
50	text_h => [\&handle_text, "dtext"],
51	marked_sections => 1,
52	);
53	my %result;
54	my $text;
55	my $open;
56	my $div;
57
58	sub handle_start {
59	my $tag = shift;
60	my $attr = shift;
61
62	return unless $tag eq "div";
63	$div = $attr->{id};
64	$open++;
65	print ">" x $open, $tag, "\n" if $debug;
66	}
67
68	sub handle_end {
69	my $tag = shift;
70
71	return unless $tag eq "div";
72	print "<" x $open, $tag, "\n" if $debug;
73	$open--;
74	$text =~ s/^\s+//;
75	$text =~ s/\s+$//;
76	$text =~ s/\s+/ /g;
77	$result{$div} .= $text . ' ';
78	$text = '';
79	}
80
81
82	sub handle_text {
83	my $c = shift;
84	$text .= $c if $open;
85	}
86
87
88	sub my_parse ($) {
89	my($s) = @_;
90	my $ls = Encode::encode("ISO-8859-1", $s, 1); # HTML::Parser returns
91	# LATIN for entities
92	# and we would get
93	# mixed content in
94	# result
95	%result = ();
96	$text = '';
97	$open = 0;
98	$div = undef;
99	$p->parse($ls);
100	$p->eof;
101	}
102
103
104	sub split {
105	my ($self, $doc) = @_;
106	my %doc = map { $_ => "" } qw(isbn author aboutauthor
107	translator abouttranslator colophon
108	abstract title subtitle title_orig toc inx);
109
110	if ($doc->{author}) {
111	my_parse($doc->{author});
112	$doc{aboutauthor} = $result{author_bio};
113	}
114	if ($doc->{translator}) {
115	my_parse($doc->{translator});
116	$doc{abouttranslator} = $result{translator_bio};
117	}
118	if ($doc->{index}) {
119	my_parse($doc->{index});
120	$doc{abstract} = $result{short_desc};
121	$doc{isbn} = $result{isbn};
122	$doc{author} = $result{author_names};
123	$doc{translator} = $result{translator_names};
124	$doc{title} = $result{title};
125	$doc{subtitle} = $result{subtitle};
126	$doc{title_orig} = $result{title_orig};
127	}
128	if ($doc->{colophon}) {
129	my_parse($doc->{colophon});
130	$doc{colophon} = $result{colophon};
131	}
132	if ($doc->{toc}) {
133	my_parse($doc->{toc});
134	my $s = $result{book_toc};
135	$s =~ s/<BR>/ /ig;
136	$s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl
137	$s =~ s/\b\d+(\.\d+)?\b//g; # 1.0 Einf\x{fc}hrung 1.1 Zugriff
138	$s =~ s/\b\d+\.//g; # 7.vi Options 8.Enhanced Tags 9.nvi-New vi
139	$doc{toc} = $s;
140	}
141	if ($doc->{inx}) {
142	my_parse($doc->{inx});
143	my $s = $result{book_inx};
144	$s =~ s/<BR>/ /ig;
145	$s =~ s/&#(8211);/-/g;
146	$s =~ s/&#(8220);/"/g;
147	$s =~ s/&#(8222);/"/g;
148	$s =~ s/&#(8217);/'/g;
149	$s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl
150	$s =~ s/\s*,\s+/ /g; # Komma
151	1 while $s =~ s/\s\d+-\d+\s/ /g; # Seitenangaben (nicht aber das 234 aus ê)
152	1 while $s =~ s/\s\d+\s/ /g; # Seitenangaben
153	$s =~ s/(\w+)\( \)/$1()/g; # functions in the index
154	$doc{inx} = $s;
155	}
156	if ($doc->{desc}) {
157	my_parse($doc->{desc});
158	$doc{desc} = $result{long_desc};
159	}
160
161	while (my($k,$v) = each %doc) {
162	next unless defined($v) && length($v);
163	my $utf8v = Encode::decode("ISO-8859-1",$v);
164	$doc{$k} = $utf8v;
165	}
166	# warn "ALERT: No author" unless $doc{author};
167
168	return \%doc;
169	}
170
171	1;