--- cvs-head/lib/WAIT/Parse/Ora.pm 2002/03/05 13:40:38 73 +++ cvs-head/lib/WAIT/Parse/Ora.pm 2002/03/08 21:18:51 74 @@ -1,7 +1,7 @@ #!/usr/bin/perl # -*- Mode: Perl -*- # $Basename: HTML.pm $ -# $Revision: 1.7 $ +# $Revision: 1.8 $ # Author : Ulrich Pfeifer with Andreas König # Created On : Sat Nov 1 1997 # Last Modified By: Ulrich Pfeifer @@ -15,11 +15,11 @@ # package WAIT::Parse::Ora; +use base qw(WAIT::Parse::Base); + use HTML::Parser; use Encode; use strict; -use vars qw(@ISA); -@ISA = qw(WAIT::Parse::Base); =pod @@ -42,14 +42,6 @@ =cut my $debug = 0; -my %is_text = ( - p => 'text', - a => 'text', # uebersetzer -# h1 => 'text', -# h2 => 'text', -# h3 => 'text', - title => 'title', - ); my $p = HTML::Parser->new( api_version => 3, @@ -61,17 +53,14 @@ my %result; my $text; my $open; +my $div; sub handle_start { my $tag = shift; my $attr = shift; - return unless - $is_text{$tag} # well-formed paragraphs - || - $tag eq "h3" # good for desc, author, and colo - || - ($tag eq "font" && $attr->{size} && $attr->{size}==5); # good for index.html + return unless $tag eq "div"; + $div = $attr->{id}; $open++; print ">" x $open, $tag, "\n" if $debug; } @@ -79,26 +68,23 @@ sub handle_end { my $tag = shift; - return unless $is_text{$tag}; + return unless $tag eq "div"; print "<" x $open, $tag, "\n" if $debug; $open--; $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g; - $result{$is_text{$tag}} .= $text . ' '; + $result{$div} .= $text . ' '; $text = ''; } sub handle_text { my $c = shift; - if ($open > 1 && $c =~ /^(Zur.{1,6}ck\s+zu|Erg.{1,6}nzende O'Reilly Titel)/) { - $open--; - return; - } $text .= $c if $open; } + sub my_parse ($) { my($s) = @_; my $ls = Encode::encode("ISO-8859-1", $s, 1); # HTML::Parser returns @@ -106,61 +92,78 @@ # and we would get # mixed content in # result + %result = (); + $text = ''; + $open = 0; + $div = undef; $p->parse($ls); $p->eof; } + sub split { my ($self, $doc) = @_; - my %doc = ( isbn => '', - author => '', - aboutauthor => '', - colophon => '', - abstract => '' - ); + my %doc = map { $_ => "" } qw(isbn author aboutauthor + translator abouttranslator colophon + abstract title subtitle title_orig toc inx); if ($doc->{author}) { - %result = (); - $text = ''; - $open = 0; my_parse($doc->{author}); - $doc{author} = $result{title}; - $doc{aboutauthor} = $result{text}; + $doc{aboutauthor} = $result{author_bio}; + } + if ($doc->{translator}) { + my_parse($doc->{translator}); + $doc{abouttranslator} = $result{translator_bio}; } if ($doc->{index}) { - $doc->{index} =~ /ISBN\s*([^\<]+)/ and $doc{isbn} = $1; - %result = (); - $text = ''; - $open = 0; my_parse($doc->{index}); - $doc{abstract} = $result{text}; + $doc{abstract} = $result{short_desc}; + $doc{isbn} = $result{isbn}; + $doc{author} = $result{author_names}; + $doc{translator} = $result{translator_names}; + $doc{title} = $result{title}; + $doc{subtitle} = $result{subtitle}; + $doc{title_orig} = $result{title_orig}; } if ($doc->{colophon}) { - %result = (); - $text = ''; - $open = 0; my_parse($doc->{colophon}); - $doc{colophon} = $result{text}; + $doc{colophon} = $result{colophon}; + } + if ($doc->{toc}) { + my_parse($doc->{toc}); + my $s = $result{book_toc}; + $s =~ s/
/ /ig; + $s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl + $s =~ s/\b\d+(\.\d+)?\b//g; # 1.0 Einf\x{fc}hrung 1.1 Zugriff + $s =~ s/\b\d+\.//g; # 7.vi Options 8.Enhanced Tags 9.nvi-New vi + $doc{toc} = $s; + } + if ($doc->{inx}) { + my_parse($doc->{inx}); + my $s = $result{book_inx}; + $s =~ s/
/ /ig; + $s =~ s/&#(8211);/-/g; + $s =~ s/&#(8220);/"/g; + $s =~ s/&#(8222);/"/g; + $s =~ s/&#(8217);/'/g; + $s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl + $s =~ s/\s*,\s+/ /g; # Komma + 1 while $s =~ s/\s\d+-\d+\s/ /g; # Seitenangaben (nicht aber das 234 aus ê) + 1 while $s =~ s/\s\d+\s/ /g; # Seitenangaben + $s =~ s/(\w+)\( \)/$1()/g; # functions in the index + $doc{inx} = $s; + } + if ($doc->{desc}) { + my_parse($doc->{desc}); + $doc{desc} = $result{long_desc}; } - %result = (); - $text = ''; - $open = 0; - - my_parse($doc->{desc}); - - $doc{desc} = $result{text}; - $doc{title} = $result{title}; while (my($k,$v) = each %doc) { + next unless defined($v) && length($v); my $utf8v = Encode::decode("ISO-8859-1",$v); $doc{$k} = $utf8v; } - - $doc{desc} =~ s/^\s*Ausf\S+hrliche\s+Beschreibung\s*//; - $doc{abstract} =~ s/\s*Titel\s+dem\s+Warenkorb\s+hinzu\S+\s*/ /; - $doc{abstract} =~ s/\s*Warenkorb\s+anzeigen\s*/ /; - # warn "desc[$doc{desc}]"; - # warn "abstract[$doc{abstract}]"; # zu viel, zu viel! + # warn "ALERT: No author" unless $doc{author}; return \%doc; }