/[wait]/cvs-head/lib/WAIT/Parse/Ora.pm

This is a repository of my old source code, which is no longer updated. Go to git.rot13.org for current projects!

Diff of /cvs-head/lib/WAIT/Parse/Ora.pm

Parent Directory | Revision Log | Patch

-revision 75 by laperla,
Thu Mar 14 17:27:22 2002 UTC
+revision 76 by laperla,
Sat Apr  6 19:00:54 2002 UTC
 Line 1
  #!/usr/bin/perl
  #                              -*- Mode: Perl -*-
  # $Basename: HTML.pm $
- # $Revision: 1.9 $
+ # $Revision: 1.10 $
  # Author          : Ulrich Pfeifer with Andreas König
  # Created On      : Sat Nov 1 1997
  # Last Modified By: Ulrich Pfeifer
 Line 43 
 Text from 2002-03-05 is structured with
  my $debug = 0;
- my $p = HTML::Parser->new(
+ my $globalp = HTML::Parser->new(
-                           api_version => 3,
+                                 api_version => 3,
-                           start_h => [\&handle_start, "tagname, attr"],
+                                 start_h => [\&handle_start, "tagname, attr"],
-                           end_h   => [\&handle_end,   "tagname"],
+                                 end_h   => [\&handle_end,   "tagname"],
-                           text_h  => [\&handle_text,  "dtext"],
+                                 text_h  => [\&handle_text,  "dtext"],
-                           marked_sections => 1,
+                                 marked_sections => 1,
-                          );
+                                );
  my %result;
  my $text;
  my $open;
  my $div;
+ sub initialize_text {
+   if (oreilly_de_catalog::config::BRUTE_FORCE_UPGRADE() ) {
+     $text = "\x{100}";
+   } else {
+     $text = "";
+   }
+ }
+ sub finished_text {
+   if (oreilly_de_catalog::config::BRUTE_FORCE_UPGRADE() ) {
+     $text =~ s/^\x{100}//;
+   }
+   $text =~ s/^\s+//;
+   $text =~ s/\s+$//;
+   $text =~ s/\s+/ /g;
+   $text;
+ }
  sub handle_start {
    my $tag = shift;
    my $attr = shift;
    return unless $tag eq "div";
    $div = $attr->{id};
+   utf8::upgrade($div) if oreilly_de_catalog::config::UPGRADE_DIV();
    $open++;
    print ">" x $open, $tag,  "\n" if $debug;
  }
-Line 72 
 sub handle_end {
+Line 91 
 sub handle_end {
    print "<" x $open, $tag,  "\n" if $debug;
    $open--;
    return unless $div;
-   $text =~ s/^\s+//;
-   $text =~ s/\s+$//;
-   $text =~ s/\s+/ /g;
    if (defined $result{$div}){
-     $result{$div} .= " $text";
+     $result{$div} .= " " . finished_text();
    } else {
-     $result{$div} = $text;
+     $result{$div} = finished_text();
    }
-   $text = '';
+   initialize_text();
  }
-Line 90 
 sub handle_text {
+Line 106 
 sub handle_text {
  }
+ # WAIT::Parse::Ora::my_parse
  sub my_parse ($) {
    my($s) = @_;
-   my $ls = Encode::encode("ISO-8859-1", $s, 1); # HTML::Parser returns
+   my $ls;
-                                                 # LATIN for entities
+   if (oreilly_de_catalog::config::ALLOW_LATIN_INTERMEDIATE()) {
-                                                 # and we would get
+     warn "Warning: this HTML::Parser has Unicode support on"
-                                                 # mixed content in
+         if HTML::Entities::UNICODE_SUPPORT();
-                                                 # result
+     $ls = Encode::encode("ISO-8859-1", $s, 1);
+     # HTML::Parser 3.25, 3.26 returns **mostly** LATIN for entities.
+     # We reduce the amount of mixed-encoding content to just a few
+     # punctuation characters when we work with Latin here.
+   } elsif (oreilly_de_catalog::config::PROTECT_UTF8_FOR_HTML_PARSER() ) {
+     $ls = Encode::encode_utf8($s);
+   } elsif (oreilly_de_catalog::config::PROTECT_UTF8_WITH_AMP() ) {
+     $ls = $s;
+     $ls =~ s/([^\000-\177])/ "&#" . ord($1) . ";" /ge;
+     utf8::downgrade($ls); # didn't improve the coredumpness
+     die "ls[$ls] not 7bit clean" unless $ls =~ /^[\000-\177]*$/;
+     # warn "ls[$ls]";
+   } else {
+     $ls = $s;
+   }
    %result = ();
-   $text = '';
+   initialize_text();
    $open = 0;
    $div = undef;
-   $p->parse($ls);
+   if (0) { # XXX probieren ueber probieren wg Entities und UTF-8
-   $p->eof;
+     # code that directly deals with $s because it doesn't want the
- }
+     # conversion to $ls (latin1)
+     $s =~ s/\s+/ /g; # die CR nerven noch mehr als die LF
+     my $pre_s = $s;
+     # $s =~ s/&#153;//g;
+     # $s =~ s/\302\255//g; # 2.8 pounds in
+     if ( 0 && $HTML::Parser::VERSION == 3.26 ) {
+       # Should be handled by HTML::Entities, numeric entities and
+       # HTML::Entities and -DUNICODE_ENTITIES => core dump
+       my $saidinfo = 0;
+       local $| = 1;
+       while ( $s =~ s/\&\#(\d+)\;/chr($1)/e ) {
+         print "Info:" unless $saidinfo++;
+         print " &#$1;";
+       }
+       print "\n" if $saidinfo;
+     }
+   }
+   if (oreilly_de_catalog::config::DISPOSE_PARSER_EACH_TIME() ) {
+     my $p = HTML::Parser->new(
+                               api_version => 3,
+                               start_h => [\&handle_start, "tagname, attr"],
+                               end_h   => [\&handle_end,   "tagname"],
+                               text_h  => [\&handle_text,  "dtext"],
+                               marked_sections => 1,
+                              );
+     $p->parse($ls);
+     $p->eof;
+   } else {
+     $globalp->parse($ls);
+     $globalp->eof;
+   }
+   if (0) { # XXX
+     # code that tries to postprocess the nonsense resulting from the above
+     while (my($k,$v) = each %result) {
+       next unless defined($v) && length($v);
+       next if Encode::is_utf8($v);
+       next unless $v =~ /[^\040-\177]/;
+       # Wenn UTF-8 und nicht-UTF-8 gemischt sind, sind wir erledigt
+       my $utf8v;
+       if (HTML::Entities::UNICODE_SUPPORT()) {
+         if (0) {
+           # klappt nicht == 2002-04-02
+           $utf8v = Encode::decode("ISO-8859-1",$v);
+         } elsif (1) {
+           $utf8v = $v;
+           # fuehrt zu "unexpected downgraded strings" und die haben dann
+           # noch ein UTF-8 Teile, die nicht als solche markiert sind
+         }
+       } else {
+         # Want to find out which condition we need to watch
+         if ($HTML::Parser::VERSION != 3.26) {
+           # klappt nicht == 2002-04-02
+           $utf8v = Encode::decode("ISO-8859-1",$v);
+         } else {
+           # klappt nicht == 2002-04-02
+           $utf8v = $v;
+           Encode::_utf8_on($utf8v);
+           Encode::is_utf8($utf8v, 1) or die "Not UTF8 [$utf8v]";
+         }
+       }
+       $result{$k} = $utf8v;
+     }
+   }
+   if ( oreilly_de_catalog::config::ALLOW_LATIN_INTERMEDIATE() ) {
+     while (my($k,$v) = each %result) {
+       next unless defined($v) && length($v);
+       my $utf8v = Encode::decode("ISO-8859-1",$v);
+       $result{$k} = $utf8v;
+     }
+   } elsif (oreilly_de_catalog::config::PROTECT_UTF8_FOR_HTML_PARSER()) {
+     while (my($k,$v) = each %result) {
+       next unless defined($v) && length($v);
+       my $utf8v = Encode::decode_utf8($v);
+       $result{$k} = $utf8v;
+     }
+   } elsif (oreilly_de_catalog::config::PROTECT_UTF8_WITH_AMP() ) {
+     while (my($k,$v) = each %result) {
+       next unless defined($v) && length($v);
+       utf8::upgrade($v);
+       $result{$k} = $v;
+     }
+   }
+ }
  sub split {
    my ($self, $doc) = @_;
-   my %doc = map { $_ => "" } qw(isbn author aboutauthor
+   my %doc = map { $_ => "" } qw(isbn author aboutauthor chapter
                                  translator abouttranslator colophon
                                  abstract title subtitle title_orig toc inx);
-Line 124 
 sub split {
+Line 240 
 sub split {
      my_parse($doc->{index});
      $doc{abstract} = $result{short_desc};
      $doc{isbn} = $result{isbn};
-     $doc{author} = $result{author_names};
+     $doc{author} = $result{author_names} || "";
      $doc{translator} = $result{translator_names};
      $doc{title} = $result{title};
      $doc{subtitle} = $result{subtitle};
      $doc{title_orig} = $result{title_orig};
    }
+   if ($doc->{chapter}) {
+     my $content = $doc->{chapter};
+     my $bs;
+     $bs++ if $content =~ s/^.*?<!--\s*sample chapter (begins (here )?)?-->//si;
+     my $es;
+     $es++ if $content =~ s/<!--\s*(End of )?sample chapter (ends here )?-->.*//si;
+     unless ($bs){
+       $content =~ s/^.*?<h1/<h1/si;
+     }
+     unless ($es){
+       $content =~ s/<HR.*//si;
+     }
+     $content =~ s/^/<div id="chapter">/;
+     $content .= "</div>\n";
+     my_parse($content);
+     $doc{chapter} = $result{chapter};
+   }
    if ($doc->{colophon}) {
      my_parse($doc->{colophon});
-     $doc{colophon} = $result{colophon};
+     my $s = $doc{colophon} = $result{colophon};
+     # use Devel::Peek;
+     # Devel::Peek::Dump($s);
    }
    if ($doc->{toc}) {
      my_parse($doc->{toc});
-     my $s = $result{book_toc};
+     if (my $s = $result{book_toc}) {
-     $s =~ s/<BR>/ /ig;
+       # $s =~ s/<BR>/ /ig; # very wrong! if we have <BR> here, it was &lt;BR&gt;
-     $s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl
+       $s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl
-     $s =~ s/\b\d+(\.\d+)?\b//g; # 1.0 Einf\x{fc}hrung 1.1 Zugriff
+       $s =~ s/\b\d+(\.\d+)?\b//g; # 1.0 Einleitung 1.1 Zugriff
-     $s =~ s/\b\d+\.//g; # 7.vi Options 8.Enhanced Tags 9.nvi-New vi
+       $s =~ s/\b\d+\.//g; # 7.vi Options 8.Enhanced Tags 9.nvi-New vi
-     $doc{toc} = $s;
+       $doc{toc} = $s;
+     } else {
+       die "toc[$doc->{toc}] not parseable?";
+     }
    }
    if ($doc->{inx}) {
      my_parse($doc->{inx});
      my $s = $result{book_inx} || "";
-     $s =~ s/<BR>/ /ig;
+     # $s =~ s/<BR>/ /ig; # wrong!, see above
      $s =~ s/&#(8211);/-/g;
      $s =~ s/&#(8220);/"/g;
      $s =~ s/&#(8222);/"/g;
-Line 163 
 sub split {
+Line 301 
 sub split {
      $doc{desc}  = $result{long_desc};
    }
-   while (my($k,$v) = each %doc) {
+   if (0) {
-     next unless defined($v) && length($v);
+     # we did really convert the stuff we just read in to UTF8
-     my $utf8v = Encode::decode("ISO-8859-1",$v);
+     # (although WAIT::Document::Ora::conv_getline converts to UTF8
-     $doc{$k} = $utf8v;
+     # itself), because my_parse did the conversion back to latin1.
+     # This nonsense must stop. All routines must get and give UTF-8.
+     # If they want to process something else internally, they must
+     # convert twice
+     while (my($k,$v) = each %doc) {
+       next unless defined($v) && length($v);
+       my $utf8v = Encode::decode("ISO-8859-1",$v);
+       $doc{$k} = $utf8v;
+     }
    }
    # warn "ALERT: No author" unless $doc{author};

 Legend:
-Removed from v.75
 changed lines
+Added in v.76

	ViewVC Help
Powered by ViewVC 1.1.26