/[wait]/cvs-head/lib/WAIT/Parse/Ora.pm

This is a repository of my old source code, which is no longer updated. Go to git.rot13.org for current projects!

Diff of /cvs-head/lib/WAIT/Parse/Ora.pm

Parent Directory | Revision Log | View Patch

-revision 73 by laperla,
Tue Mar  5 13:40:38 2002 UTC
+revision 74 by laperla,
Fri Mar  8 21:18:51 2002 UTC
 Line 1
  #!/usr/bin/perl
  #                              -*- Mode: Perl -*-
  # $Basename: HTML.pm $
- # $Revision: 1.7 $
+ # $Revision: 1.8 $
  # Author          : Ulrich Pfeifer with Andreas König
  # Created On      : Sat Nov 1 1997
  # Last Modified By: Ulrich Pfeifer
 Line 15
  #
  package WAIT::Parse::Ora;
+ use base qw(WAIT::Parse::Base);
  use HTML::Parser;
  use Encode;
  use strict;
- use vars qw(@ISA);
- @ISA = qw(WAIT::Parse::Base);
  =pod
 Line 42 
 Text from 2002-03-05 is structured with
  =cut
  my $debug = 0;
- my %is_text = (
-                p     => 'text',
-                a     => 'text', # uebersetzer
- #            h1    => 'text',
- #            h2    => 'text',
- #            h3    => 'text',
-                title => 'title',
-            );
  my $p = HTML::Parser->new(
                            api_version => 3,
-Line 61 
 my $p = HTML::Parser->new(
+Line 53 
 my $p = HTML::Parser->new(
  my %result;
  my $text;
  my $open;
+ my $div;
  sub handle_start {
    my $tag = shift;
    my $attr = shift;
-   return unless
+   return unless $tag eq "div";
-       $is_text{$tag}                 # well-formed paragraphs
+   $div = $attr->{id};
-       ||
-           $tag eq "h3"               # good for desc, author, and colo
-       ||
-           ($tag eq "font" && $attr->{size} && $attr->{size}==5); # good for index.html
    $open++;
    print ">" x $open, $tag,  "\n" if $debug;
  }
-Line 79 
 sub handle_start {
+Line 68 
 sub handle_start {
  sub handle_end {
    my $tag = shift;
-   return unless $is_text{$tag};
+   return unless $tag eq "div";
    print "<" x $open, $tag,  "\n" if $debug;
    $open--;
    $text =~ s/^\s+//;
    $text =~ s/\s+$//;
    $text =~ s/\s+/ /g;
-   $result{$is_text{$tag}} .= $text . ' ';
+   $result{$div} .= $text . ' ';
    $text = '';
  }
  sub handle_text {
    my $c = shift;
-   if ($open > 1 && $c =~ /^(Zur.{1,6}ck\s+zu|Erg.{1,6}nzende O'Reilly Titel)/) {
-     $open--;
-     return;
-   }
    $text .= $c if $open;
  }
  sub my_parse ($) {
    my($s) = @_;
    my $ls = Encode::encode("ISO-8859-1", $s, 1); # HTML::Parser returns
-Line 106 
 sub my_parse ($) {
+Line 92 
 sub my_parse ($) {
                                                  # and we would get
                                                  # mixed content in
                                                  # result
+   %result = ();
+   $text = '';
+   $open = 0;
+   $div = undef;
    $p->parse($ls);
    $p->eof;
  }
  sub split {
    my ($self, $doc) = @_;
-   my %doc = ( isbn => '',
+   my %doc = map { $_ => "" } qw(isbn author aboutauthor
-               author => '',
+                                 translator abouttranslator colophon
-               aboutauthor => '',
+                                 abstract title subtitle title_orig toc inx);
-               colophon => '',
-               abstract => ''
-             );
    if ($doc->{author}) {
-     %result = ();
-     $text = '';
-     $open = 0;
      my_parse($doc->{author});
-     $doc{author} = $result{title};
+     $doc{aboutauthor}  = $result{author_bio};
-     $doc{aboutauthor}  = $result{text};
+   }
+   if ($doc->{translator}) {
+     my_parse($doc->{translator});
+     $doc{abouttranslator}  = $result{translator_bio};
    }
    if ($doc->{index}) {
-     $doc->{index} =~ /ISBN\s*([^\<]+)/ and $doc{isbn} = $1;
-     %result = ();
-     $text = '';
-     $open = 0;
      my_parse($doc->{index});
-     $doc{abstract} = $result{text};
+     $doc{abstract} = $result{short_desc};
+     $doc{isbn} = $result{isbn};
+     $doc{author} = $result{author_names};
+     $doc{translator} = $result{translator_names};
+     $doc{title} = $result{title};
+     $doc{subtitle} = $result{subtitle};
+     $doc{title_orig} = $result{title_orig};
    }
    if ($doc->{colophon}) {
-     %result = ();
-     $text = '';
-     $open = 0;
      my_parse($doc->{colophon});
-     $doc{colophon} = $result{text};
+     $doc{colophon} = $result{colophon};
+   }
+   if ($doc->{toc}) {
+     my_parse($doc->{toc});
+     my $s = $result{book_toc};
+     $s =~ s/<BR>/ /ig;
+     $s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl
+     $s =~ s/\b\d+(\.\d+)?\b//g; # 1.0 Einf\x{fc}hrung 1.1 Zugriff
+     $s =~ s/\b\d+\.//g; # 7.vi Options 8.Enhanced Tags 9.nvi-New vi
+     $doc{toc} = $s;
+   }
+   if ($doc->{inx}) {
+     my_parse($doc->{inx});
+     my $s = $result{book_inx};
+     $s =~ s/<BR>/ /ig;
+     $s =~ s/&#(8211);/-/g;
+     $s =~ s/&#(8220);/"/g;
+     $s =~ s/&#(8222);/"/g;
+     $s =~ s/&#(8217);/'/g;
+     $s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl
+     $s =~ s/\s*,\s+/ /g; # Komma
+while $s =~ s/\s\d+-\d+\s/ /g; # Seitenangaben (nicht aber das 234 aus &#234;)
+while $s =~ s/\s\d+\s/ /g; # Seitenangaben
+     $s =~ s/(\w+)\( \)/$1()/g; # functions in the index
+     $doc{inx} = $s;
+   }
+   if ($doc->{desc}) {
+     my_parse($doc->{desc});
+     $doc{desc}  = $result{long_desc};
    }
-   %result = ();
-   $text = '';
-   $open = 0;
-   my_parse($doc->{desc});
-   $doc{desc}  = $result{text};
-   $doc{title} = $result{title};
    while (my($k,$v) = each %doc) {
+     next unless defined($v) && length($v);
      my $utf8v = Encode::decode("ISO-8859-1",$v);
      $doc{$k} = $utf8v;
    }
+   # warn "ALERT: No author" unless $doc{author};
-   $doc{desc} =~ s/^\s*Ausf\S+hrliche\s+Beschreibung\s*//;
-   $doc{abstract} =~ s/\s*Titel\s+dem\s+Warenkorb\s+hinzu\S+\s*/ /;
-   $doc{abstract} =~ s/\s*Warenkorb\s+anzeigen\s*/ /;
-   # warn "desc[$doc{desc}]";
-   # warn "abstract[$doc{abstract}]"; # zu viel, zu viel!
    return \%doc;
  }

 Legend:
-Removed from v.73
 changed lines (shown as a `-` line paired with a `+` line)
+Added in v.74

	ViewVC Help
Powered by ViewVC 1.1.26