--- cvs-head/lib/WAIT/Parse/Ora.pm	2002/03/05 13:40:38	73
+++ cvs-head/lib/WAIT/Parse/Ora.pm	2002/03/08 21:18:51	74
@@ -1,7 +1,7 @@
 #!/usr/bin/perl
 #                              -*- Mode: Perl -*- 
 # $Basename: HTML.pm $
-# $Revision: 1.7 $
+# $Revision: 1.8 $
 # Author          : Ulrich Pfeifer with Andreas König
 # Created On      : Sat Nov 1 1997
 # Last Modified By: Ulrich Pfeifer
@@ -15,11 +15,11 @@
 # 
 
 package WAIT::Parse::Ora;
+use base qw(WAIT::Parse::Base);
+
 use HTML::Parser;
 use Encode;
 use strict;
-use vars qw(@ISA);
-@ISA = qw(WAIT::Parse::Base);
 
 
 =pod
@@ -42,14 +42,6 @@
 =cut
 
 my $debug = 0;
-my %is_text = (
-               p     => 'text',
-               a     => 'text', # uebersetzer
-#            h1    => 'text',
-#            h2    => 'text',
-#            h3    => 'text',
-               title => 'title',
-           );
 
 my $p = HTML::Parser->new(
                           api_version => 3,
@@ -61,17 +53,14 @@
 my %result;
 my $text;
 my $open;
+my $div;   # id attribute of the <div> most recently seen by handle_start
 
 sub handle_start {
   my $tag = shift;
   my $attr = shift;
 
-  return unless
-      $is_text{$tag}                 # well-formed paragraphs
-      ||
-          $tag eq "h3"               # good for desc, author, and colo
-      ||
-          ($tag eq "font" && $attr->{size} && $attr->{size}==5); # good for index.html
+  return unless $tag eq "div";   # only <div> start tags open a capture region
+  $div = $attr->{id};            # result key used by handle_end; NOTE(review): a nested <div> overwrites it - confirm input never nests
   $open++;
   print ">" x $open, $tag,  "\n" if $debug;
 }
@@ -79,26 +68,23 @@
 sub handle_end {
   my $tag = shift;
 
-  return unless $is_text{$tag};
+  return unless $tag eq "div" && $open > 0;   # ignore other tags, and any </div> that has no open <div> (keeps $open from going negative)
   print "<" x $open, $tag,  "\n" if $debug;
   $open--;
   $text =~ s/^\s+//;
   $text =~ s/\s+$//;
   $text =~ s/\s+/ /g;
-  $result{$is_text{$tag}} .= $text . ' ';
+  $result{$div} .= $text . ' ';               # append the whitespace-normalised text under the current div id
   $text = '';
 }
 
 
 sub handle_text {
   my $c = shift;
-  if ($open > 1 && $c =~ /^(Zur.{1,6}ck\s+zu|Erg.{1,6}nzende O'Reilly Titel)/) {
-    $open--;
-    return;
-  }
-  $text .= $c if $open;
+  $text .= $c if $open > 0;   # collect text only while inside a <div>; a negative depth (stray </div>) must not count as "open"
 }
 
+
 sub my_parse ($) {
   my($s) = @_;
   my $ls = Encode::encode("ISO-8859-1", $s, 1); # HTML::Parser returns
@@ -106,61 +92,78 @@
                                                 # and we would get
                                                 # mixed content in
                                                 # result
+  %result = ();     # reset the shared parser state before every document:
+  $text = '';       #   no pending text left over from a previous parse,
+  $open = 0;        #   no open <div>,
+  $div = undef;     #   and no current div id
   $p->parse($ls);
   $p->eof;
 }
 
+
 sub split {
   my ($self, $doc) = @_;
-  my %doc = ( isbn => '',
-              author => '',
-              aboutauthor => '',
-              colophon => '',
-              abstract => ''
-            );
+  my %doc = map { $_ => "" } qw(isbn author aboutauthor
+                                translator abouttranslator colophon
+                                abstract desc title subtitle title_orig toc inx);  # every output field, desc included, defaults to ""
 
   if ($doc->{author}) {
-    %result = ();
-    $text = '';
-    $open = 0;
     my_parse($doc->{author});
-    $doc{author} = $result{title};
-    $doc{aboutauthor}  = $result{text};
+    $doc{aboutauthor}  = $result{author_bio};
+  }
+  if ($doc->{translator}) {
+    my_parse($doc->{translator});
+    $doc{abouttranslator}  = $result{translator_bio};
   }
   if ($doc->{index}) {
-    $doc->{index} =~ /ISBN\s*([^\<]+)/ and $doc{isbn} = $1;
-    %result = ();
-    $text = '';
-    $open = 0;
     my_parse($doc->{index});
-    $doc{abstract} = $result{text};
+    $doc{abstract} = $result{short_desc};   # the index page carries most of the bibliographic fields, one <div id=...> each
+    $doc{isbn} = $result{isbn};
+    $doc{author} = $result{author_names};
+    $doc{translator} = $result{translator_names};
+    $doc{title} = $result{title};
+    $doc{subtitle} = $result{subtitle};
+    $doc{title_orig} = $result{title_orig};
   }
   if ($doc->{colophon}) {
-    %result = ();
-    $text = '';
-    $open = 0;
     my_parse($doc->{colophon});
-    $doc{colophon} = $result{text};
+    $doc{colophon} = $result{colophon};
+  }
+  if ($doc->{toc}) {
+    my_parse($doc->{toc});
+    my $s = $result{book_toc};
+    $s =~ s/<BR>/ /ig;
+    $s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl
+    $s =~ s/\b\d+\.//g; # "7.vi Options 8.Enhanced Tags 9.nvi-New vi": drop number AND dot; must run first, the next line would strip the digits but leave the dot
+    $s =~ s/\b\d+(\.\d+)?\b//g; # remaining section numbers, e.g. "1.0 Einf\x{fc}hrung 1.1 Zugriff"
+    $doc{toc} = $s;
+  }
+  if ($doc->{inx}) {
+    my_parse($doc->{inx});
+    my $s = $result{book_inx};
+    $s =~ s/<BR>/ /ig;
+    $s =~ s/&#8211;/-/g;   # en dash
+    $s =~ s/&#8220;/"/g;   # left double quotation mark
+    $s =~ s/&#8222;/"/g;   # low double quotation mark
+    $s =~ s/&#8217;/'/g;   # right single quotation mark
+    $s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl
+    $s =~ s/\s*,\s+/ /g; # commas
+    1 while $s =~ s/\s\d+-\d+\s/ /g; # page ranges (but not the 234 in &#234;)
+    1 while $s =~ s/\s\d+\s/ /g; # page numbers
+    $s =~ s/(\w+)\( \)/$1()/g; # functions in the index
+    $doc{inx} = $s;
+  }
+  if ($doc->{desc}) {
+    my_parse($doc->{desc});
+    $doc{desc}  = $result{long_desc};
   }
-  %result = ();
-  $text = '';
-  $open = 0;
-
-  my_parse($doc->{desc});
-
-  $doc{desc}  = $result{text};
-  $doc{title} = $result{title};
 
   while (my($k,$v) = each %doc) {
+    next unless defined($v) && length($v);   # nothing to decode for fields that are missing or empty
     my $utf8v = Encode::decode("ISO-8859-1",$v);
     $doc{$k} = $utf8v;
   }
-
-  $doc{desc} =~ s/^\s*Ausf\S+hrliche\s+Beschreibung\s*//;
-  $doc{abstract} =~ s/\s*Titel\s+dem\s+Warenkorb\s+hinzu\S+\s*/ /;
-  $doc{abstract} =~ s/\s*Warenkorb\s+anzeigen\s*/ /;
-  # warn "desc[$doc{desc}]";
-  # warn "abstract[$doc{abstract}]"; # zu viel, zu viel!
+  # warn "ALERT: No author" unless $doc{author};
 
   return \%doc;
 }