--- cvs-head/lib/WAIT/Parse/Ora.pm 2002/03/05 13:40:38 73
+++ cvs-head/lib/WAIT/Parse/Ora.pm 2002/03/08 21:18:51 74
@@ -1,7 +1,7 @@
#!/usr/bin/perl
# -*- Mode: Perl -*-
# $Basename: HTML.pm $
-# $Revision: 1.7 $
+# $Revision: 1.8 $
# Author : Ulrich Pfeifer with Andreas König
# Created On : Sat Nov 1 1997
# Last Modified By: Ulrich Pfeifer
@@ -15,11 +15,11 @@
#
package WAIT::Parse::Ora;
+use base qw(WAIT::Parse::Base);
+
use HTML::Parser;
use Encode;
use strict;
-use vars qw(@ISA);
-@ISA = qw(WAIT::Parse::Base);
=pod
@@ -42,14 +42,6 @@
=cut
my $debug = 0;
-my %is_text = (
- p => 'text',
- a => 'text', # uebersetzer
-# h1 => 'text',
-# h2 => 'text',
-# h3 => 'text',
- title => 'title',
- );
my $p = HTML::Parser->new(
api_version => 3,
@@ -61,17 +53,14 @@
my %result;
my $text;
my $open;
+my $div;
sub handle_start {
my $tag = shift;
my $attr = shift;
- return unless
- $is_text{$tag} # well-formed paragraphs
- ||
- $tag eq "h3" # good for desc, author, and colo
- ||
- ($tag eq "font" && $attr->{size} && $attr->{size}==5); # good for index.html
+ return unless $tag eq "div";
+ $div = $attr->{id};
$open++;
print ">" x $open, $tag, "\n" if $debug;
}
@@ -79,26 +68,23 @@
sub handle_end {
my $tag = shift;
- return unless $is_text{$tag};
+ return unless $tag eq "div";
print "<" x $open, $tag, "\n" if $debug;
$open--;
$text =~ s/^\s+//;
$text =~ s/\s+$//;
$text =~ s/\s+/ /g;
- $result{$is_text{$tag}} .= $text . ' ';
+ $result{$div} .= $text . ' ';
$text = '';
}
sub handle_text {
my $c = shift;
- if ($open > 1 && $c =~ /^(Zur.{1,6}ck\s+zu|Erg.{1,6}nzende O'Reilly Titel)/) {
- $open--;
- return;
- }
$text .= $c if $open;
}
+
sub my_parse ($) {
my($s) = @_;
my $ls = Encode::encode("ISO-8859-1", $s, 1); # HTML::Parser returns
@@ -106,61 +92,78 @@
# and we would get
# mixed content in
# result
+ %result = ();
+ $text = '';
+ $open = 0;
+ $div = undef;
$p->parse($ls);
$p->eof;
}
+
sub split {
my ($self, $doc) = @_;
- my %doc = ( isbn => '',
- author => '',
- aboutauthor => '',
- colophon => '',
- abstract => ''
- );
+ my %doc = map { $_ => "" } qw(isbn author aboutauthor
+ translator abouttranslator colophon
+ abstract title subtitle title_orig toc inx);
if ($doc->{author}) {
- %result = ();
- $text = '';
- $open = 0;
my_parse($doc->{author});
- $doc{author} = $result{title};
- $doc{aboutauthor} = $result{text};
+ $doc{aboutauthor} = $result{author_bio};
+ }
+ if ($doc->{translator}) {
+ my_parse($doc->{translator});
+ $doc{abouttranslator} = $result{translator_bio};
}
if ($doc->{index}) {
- $doc->{index} =~ /ISBN\s*([^\<]+)/ and $doc{isbn} = $1;
- %result = ();
- $text = '';
- $open = 0;
my_parse($doc->{index});
- $doc{abstract} = $result{text};
+ $doc{abstract} = $result{short_desc};
+ $doc{isbn} = $result{isbn};
+ $doc{author} = $result{author_names};
+ $doc{translator} = $result{translator_names};
+ $doc{title} = $result{title};
+ $doc{subtitle} = $result{subtitle};
+ $doc{title_orig} = $result{title_orig};
}
if ($doc->{colophon}) {
- %result = ();
- $text = '';
- $open = 0;
my_parse($doc->{colophon});
- $doc{colophon} = $result{text};
+ $doc{colophon} = $result{colophon};
+ }
+ if ($doc->{toc}) {
+ my_parse($doc->{toc});
+ my $s = $result{book_toc};
+ $s =~ s/&#10;/ /ig;
+ $s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl
+ $s =~ s/\b\d+(\.\d+)?\b//g; # 1.0 Einf\x{fc}hrung 1.1 Zugriff
+ $s =~ s/\b\d+\.//g; # 7.vi Options 8.Enhanced Tags 9.nvi-New vi
+ $doc{toc} = $s;
+ }
+ if ($doc->{inx}) {
+ my_parse($doc->{inx});
+ my $s = $result{book_inx};
+ $s =~ s/&#10;/ /ig;
+ $s =~ s/(8211);/-/g;
+ $s =~ s/(8220);/"/g;
+ $s =~ s/(8222);/"/g;
+ $s =~ s/(8217);/'/g;
+ $s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl
+ $s =~ s/\s*,\s+/ /g; # Komma
+ 1 while $s =~ s/\s\d+-\d+\s/ /g; # Seitenangaben (nicht aber das 234 aus &#234;)
+ 1 while $s =~ s/\s\d+\s/ /g; # Seitenangaben
+ $s =~ s/(\w+)\( \)/$1()/g; # functions in the index
+ $doc{inx} = $s;
+ }
+ if ($doc->{desc}) {
+ my_parse($doc->{desc});
+ $doc{desc} = $result{long_desc};
}
- %result = ();
- $text = '';
- $open = 0;
-
- my_parse($doc->{desc});
-
- $doc{desc} = $result{text};
- $doc{title} = $result{title};
while (my($k,$v) = each %doc) {
+ next unless defined($v) && length($v);
my $utf8v = Encode::decode("ISO-8859-1",$v);
$doc{$k} = $utf8v;
}
-
- $doc{desc} =~ s/^\s*Ausf\S+hrliche\s+Beschreibung\s*//;
- $doc{abstract} =~ s/\s*Titel\s+dem\s+Warenkorb\s+hinzu\S+\s*/ /;
- $doc{abstract} =~ s/\s*Warenkorb\s+anzeigen\s*/ /;
- # warn "desc[$doc{desc}]";
- # warn "abstract[$doc{abstract}]"; # zu viel, zu viel!
+ # warn "ALERT: No author" unless $doc{author};
return \%doc;
}