WAIT/Parse/Ora.pm

#!/usr/bin/perl
#                              -*- Mode: Perl -*- 
# $Basename: HTML.pm $
# $Revision: 1.8 $
# Author          : Ulrich Pfeifer with Andreas König
# Created On      : Sat Nov 1 1997
# Last Modified By: Ulrich Pfeifer
# Last Modified On: Fri Jan  4 16:06:14 2002
# Language        : CPerl
# Update Count    : 14
# Status          : Unknown, Use with caution!
# 
# (C) Copyright 1997, Ulrich Pfeifer, all rights reserved.
# 
# 

package WAIT::Parse::Ora;
use base qw(WAIT::Parse::Base);

use HTML::Parser;
use Encode;
use strict;


=pod

Text from 2002-03-05 is structured with <div> tags as follows:

  index.html:
  <div id="biblio"> BIBLIOGRAPHISCHE ANGABEN
  <div id="short_desc"> KURZE BESCHREIBUNG

  desc.html:
  <div id="long_desc"> AUSFUEHRLICHE BESCHREIBUNG

  author.html:
  <div id="author_bio"> BIOGRAPHIE DES AUTOREN

  translator.html:
  <div id="translator_bio"> BIOGRAPHIE DES UEBERSETZERS

=cut

my $debug = 0;

my $p = HTML::Parser->new(
                          api_version => 3,
                          start_h => [\&handle_start, "tagname, attr"],
                          end_h   => [\&handle_end,   "tagname"],
                          text_h  => [\&handle_text,  "dtext"],
                          marked_sections => 1,
                         );
my %result;
my $text;
my $open;
my $div;

sub handle_start {
  my $tag = shift;
  my $attr = shift;

  return unless $tag eq "div";
  $div = $attr->{id};
  $open++;
  print ">" x $open, $tag,  "\n" if $debug;
}

sub handle_end {
  my $tag = shift;

  return unless $tag eq "div";
  print "<" x $open, $tag,  "\n" if $debug;
  $open--;
  $text =~ s/^\s+//;
  $text =~ s/\s+$//;
  $text =~ s/\s+/ /g;
  $result{$div} .= $text . ' ';
  $text = '';
}


sub handle_text {
  my $c = shift;
  $text .= $c if $open;
}


sub my_parse ($) {
  my($s) = @_;
  my $ls = Encode::encode("ISO-8859-1", $s, 1); # HTML::Parser returns
                                                # LATIN for entities
                                                # and we would get
                                                # mixed content in
                                                # result
  %result = ();
  $text = '';
  $open = 0;
  $div = undef;
  $p->parse($ls);
  $p->eof;
}


sub split {
  my ($self, $doc) = @_;
  my %doc = map { $_ => "" } qw(isbn author aboutauthor
                                translator abouttranslator colophon
                                abstract title subtitle title_orig toc inx);

  if ($doc->{author}) {
    my_parse($doc->{author});
    $doc{aboutauthor}  = $result{author_bio};
  }
  if ($doc->{translator}) {
    my_parse($doc->{translator});
    $doc{abouttranslator}  = $result{translator_bio};
  }
  if ($doc->{index}) {
    my_parse($doc->{index});
    $doc{abstract} = $result{short_desc};
    $doc{isbn} = $result{isbn};
    $doc{author} = $result{author_names};
    $doc{translator} = $result{translator_names};
    $doc{title} = $result{title};
    $doc{subtitle} = $result{subtitle};
    $doc{title_orig} = $result{title_orig};
  }
  if ($doc->{colophon}) {
    my_parse($doc->{colophon});
    $doc{colophon} = $result{colophon};
  }
  if ($doc->{toc}) {
    my_parse($doc->{toc});
    my $s = $result{book_toc};
    $s =~ s/<BR>/ /ig;
    $s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl
    $s =~ s/\b\d+(\.\d+)?\b//g; # 1.0 Einf\x{fc}hrung 1.1 Zugriff
    $s =~ s/\b\d+\.//g; # 7.vi Options 8.Enhanced Tags 9.nvi-New vi
    $doc{toc} = $s;
  }
  if ($doc->{inx}) {
    my_parse($doc->{inx});
    my $s = $result{book_inx};
    $s =~ s/<BR>/ /ig;
    $s =~ s/&#(8211);/-/g;
    $s =~ s/&#(8220);/"/g;
    $s =~ s/&#(8222);/"/g;
    $s =~ s/&#(8217);/'/g;
    $s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl
    $s =~ s/\s*,\s+/ /g; # Komma
    1 while $s =~ s/\s\d+-\d+\s/ /g; # Seitenangaben (nicht aber das 234 aus &#234;)
    1 while $s =~ s/\s\d+\s/ /g; # Seitenangaben
    $s =~ s/(\w+)\( \)/$1()/g; # functions in the index
    $doc{inx} = $s;
  }
  if ($doc->{desc}) {
    my_parse($doc->{desc});
    $doc{desc}  = $result{long_desc};
  }

  while (my($k,$v) = each %doc) {
    next unless defined($v) && length($v);
    my $utf8v = Encode::decode("ISO-8859-1",$v);
    $doc{$k} = $utf8v;
  }
  # warn "ALERT: No author" unless $doc{author};

  return \%doc;
}

1;
1	ulpfr	54	#!/usr/bin/perl
2			# -- Mode: Perl --
3			# $Basename: HTML.pm $
4	laperla	74	# $Revision: 1.8 $
5	ulpfr	54	# Author : Ulrich Pfeifer with Andreas König
6			# Created On : Sat Nov 1 1997
7			# Last Modified By: Ulrich Pfeifer
8	ulpfr	61	# Last Modified On: Fri Jan 4 16:06:14 2002
9	ulpfr	54	# Language : CPerl
10	ulpfr	61	# Update Count : 14
11	ulpfr	54	# Status : Unknown, Use with caution!
12			#
13			# (C) Copyright 1997, Ulrich Pfeifer, all rights reserved.
14			#
15			#
16
17			package WAIT::Parse::Ora;
18	laperla	74	use base qw(WAIT::Parse::Base);
19
20	ulpfr	58	use HTML::Parser;
21	laperla	69	use Encode;
22	ulpfr	58	use strict;
23	ulpfr	54
24	laperla	73
25			=pod
26
27			Text from 2002-03-05 is structured with <div> tags as follows:
28
29			index.html:
30			<div id="biblio"> BIBLIOGRAPHISCHE ANGABEN
31			<div id="short_desc"> KURZE BESCHREIBUNG
32
33			desc.html:
34			<div id="long_desc"> AUSFUEHRLICHE BESCHREIBUNG
35
36			author.html:
37			<div id="author_bio"> BIOGRAPHIE DES AUTOREN
38
39			translator.html:
40			<div id="translator_bio"> BIOGRAPHIE DES UEBERSETZERS
41
42			=cut
43
44	ulpfr	58	my $debug = 0;
45	ulpfr	54
46	ulpfr	58	my $p = HTML::Parser->new(
47			api_version => 3,
48			start_h => [\&handle_start, "tagname, attr"],
49			end_h => [\&handle_end, "tagname"],
50			text_h => [\&handle_text, "dtext"],
51			marked_sections => 1,
52			);
53			my %result;
54			my $text;
55			my $open;
56	laperla	74	my $div;
57	ulpfr	58
58			sub handle_start {
59			my $tag = shift;
60	laperla	72	my $attr = shift;
61	ulpfr	58
62	laperla	74	return unless $tag eq "div";
63			$div = $attr->{id};
64	ulpfr	58	$open++;
65			print ">" x $open, $tag, "\n" if $debug;
66			}
67
68			sub handle_end {
69			my $tag = shift;
70
71	laperla	74	return unless $tag eq "div";
72	ulpfr	58	print "<" x $open, $tag, "\n" if $debug;
73			$open--;
74			$text =~ s/^\s+//;
75			$text =~ s/\s+$//;
76			$text =~ s/\s+/ /g;
77	laperla	74	$result{$div} .= $text . ' ';
78	ulpfr	58	$text = '';
79			}
80
81
82			sub handle_text {
83	laperla	72	my $c = shift;
84			$text .= $c if $open;
85	ulpfr	58	}
86
87	laperla	74
88	laperla	69	sub my_parse ($) {
89			my($s) = @_;
90			my $ls = Encode::encode("ISO-8859-1", $s, 1); # HTML::Parser returns
91			# LATIN for entities
92			# and we would get
93			# mixed content in
94			# result
95	laperla	74	%result = ();
96			$text = '';
97			$open = 0;
98			$div = undef;
99	laperla	69	$p->parse($ls);
100			$p->eof;
101			}
102
103	laperla	74
104	ulpfr	54	sub split {
105			my ($self, $doc) = @_;
106	laperla	74	my %doc = map { $_ => "" } qw(isbn author aboutauthor
107			translator abouttranslator colophon
108			abstract title subtitle title_orig toc inx);
109	ulpfr	54
110	ulpfr	61	if ($doc->{author}) {
111	laperla	69	my_parse($doc->{author});
112	laperla	74	$doc{aboutauthor} = $result{author_bio};
113	ulpfr	61	}
114	laperla	74	if ($doc->{translator}) {
115			my_parse($doc->{translator});
116			$doc{abouttranslator} = $result{translator_bio};
117			}
118	ulpfr	61	if ($doc->{index}) {
119	laperla	72	my_parse($doc->{index});
120	laperla	74	$doc{abstract} = $result{short_desc};
121			$doc{isbn} = $result{isbn};
122			$doc{author} = $result{author_names};
123			$doc{translator} = $result{translator_names};
124			$doc{title} = $result{title};
125			$doc{subtitle} = $result{subtitle};
126			$doc{title_orig} = $result{title_orig};
127	ulpfr	61	}
128	laperla	65	if ($doc->{colophon}) {
129	laperla	69	my_parse($doc->{colophon});
130	laperla	74	$doc{colophon} = $result{colophon};
131	laperla	65	}
132	laperla	74	if ($doc->{toc}) {
133			my_parse($doc->{toc});
134			my $s = $result{book_toc};
135			$s =~ s/<BR>/ /ig;
136			$s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl
137			$s =~ s/\b\d+(\.\d+)?\b//g; # 1.0 Einf\x{fc}hrung 1.1 Zugriff
138			$s =~ s/\b\d+\.//g; # 7.vi Options 8.Enhanced Tags 9.nvi-New vi
139			$doc{toc} = $s;
140			}
141			if ($doc->{inx}) {
142			my_parse($doc->{inx});
143			my $s = $result{book_inx};
144			$s =~ s/<BR>/ /ig;
145			$s =~ s/&#(8211);/-/g;
146			$s =~ s/&#(8220);/"/g;
147			$s =~ s/&#(8222);/"/g;
148			$s =~ s/&#(8217);/'/g;
149			$s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl
150			$s =~ s/\s*,\s+/ /g; # Komma
151			1 while $s =~ s/\s\d+-\d+\s/ /g; # Seitenangaben (nicht aber das 234 aus ê)
152			1 while $s =~ s/\s\d+\s/ /g; # Seitenangaben
153			$s =~ s/(\w+)\( \)/$1()/g; # functions in the index
154			$doc{inx} = $s;
155			}
156			if ($doc->{desc}) {
157			my_parse($doc->{desc});
158			$doc{desc} = $result{long_desc};
159			}
160	ulpfr	54
161	laperla	69	while (my($k,$v) = each %doc) {
162	laperla	74	next unless defined($v) && length($v);
163	laperla	69	my $utf8v = Encode::decode("ISO-8859-1",$v);
164			$doc{$k} = $utf8v;
165			}
166	laperla	74	# warn "ALERT: No author" unless $doc{author};
167	laperla	69
168	ulpfr	58	return \%doc;
169	ulpfr	54	}
170	ulpfr	58
171			1;