/[wait]/cvs-head/lib/WAIT/Parse/Ora.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /cvs-head/lib/WAIT/Parse/Ora.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 75 by laperla, Thu Mar 14 17:27:22 2002 UTC revision 76 by laperla, Sat Apr 6 19:00:54 2002 UTC
# Line 1  Line 1 
1  #!/usr/bin/perl  #!/usr/bin/perl
2  #                              -*- Mode: Perl -*-  #                              -*- Mode: Perl -*-
3  # $Basename: HTML.pm $  # $Basename: HTML.pm $
4  # $Revision: 1.9 $  # $Revision: 1.10 $
5  # Author          : Ulrich Pfeifer with Andreas König  # Author          : Ulrich Pfeifer with Andreas König
6  # Created On      : Sat Nov 1 1997  # Created On      : Sat Nov 1 1997
7  # Last Modified By: Ulrich Pfeifer  # Last Modified By: Ulrich Pfeifer
# Line 43  Text from 2002-03-05 is structured with Line 43  Text from 2002-03-05 is structured with
43    
44  my $debug = 0;  my $debug = 0;
45    
46  my $p = HTML::Parser->new(  my $globalp = HTML::Parser->new(
47                            api_version => 3,                                  api_version => 3,
48                            start_h => [\&handle_start, "tagname, attr"],                                  start_h => [\&handle_start, "tagname, attr"],
49                            end_h   => [\&handle_end,   "tagname"],                                  end_h   => [\&handle_end,   "tagname"],
50                            text_h  => [\&handle_text,  "dtext"],                                  text_h  => [\&handle_text,  "dtext"],
51                            marked_sections => 1,                                  marked_sections => 1,
52                           );                                 );
53  my %result;  my %result;
54  my $text;  my $text;
55  my $open;  my $open;
56  my $div;  my $div;
57    
58    sub initialize_text {
59      if (oreilly_de_catalog::config::BRUTE_FORCE_UPGRADE() ) {
60        $text = "\x{100}";
61      } else {
62        $text = "";
63      }
64    }
65    
66    sub finished_text {
67      if (oreilly_de_catalog::config::BRUTE_FORCE_UPGRADE() ) {
68        $text =~ s/^\x{100}//;
69      }
70      $text =~ s/^\s+//;
71      $text =~ s/\s+$//;
72      $text =~ s/\s+/ /g;
73      $text;
74    }
75    
76  sub handle_start {  sub handle_start {
77    my $tag = shift;    my $tag = shift;
78    my $attr = shift;    my $attr = shift;
79    
80    return unless $tag eq "div";    return unless $tag eq "div";
81    $div = $attr->{id};    $div = $attr->{id};
82      utf8::upgrade($div) if oreilly_de_catalog::config::UPGRADE_DIV();
83    $open++;    $open++;
84    print ">" x $open, $tag,  "\n" if $debug;    print ">" x $open, $tag,  "\n" if $debug;
85  }  }
# Line 72  sub handle_end { Line 91  sub handle_end {
91    print "<" x $open, $tag,  "\n" if $debug;    print "<" x $open, $tag,  "\n" if $debug;
92    $open--;    $open--;
93    return unless $div;    return unless $div;
   $text =~ s/^\s+//;  
   $text =~ s/\s+$//;  
   $text =~ s/\s+/ /g;  
94    if (defined $result{$div}){    if (defined $result{$div}){
95      $result{$div} .= " $text";      $result{$div} .= " " . finished_text();
96    } else {    } else {
97      $result{$div} = $text;      $result{$div} = finished_text();
98    }    }
99    $text = '';    initialize_text();
100  }  }
101    
102    
# Line 90  sub handle_text { Line 106  sub handle_text {
106  }  }
107    
108    
109    # WAIT::Parse::Ora::my_parse
110  sub my_parse ($) {  sub my_parse ($) {
111    my($s) = @_;    my($s) = @_;
112    my $ls = Encode::encode("ISO-8859-1", $s, 1); # HTML::Parser returns    my $ls;
113                                                  # LATIN for entities    if (oreilly_de_catalog::config::ALLOW_LATIN_INTERMEDIATE()) {
114                                                  # and we would get      warn "Warning: this HTML::Parser has Unicode support on"
115                                                  # mixed content in          if HTML::Entities::UNICODE_SUPPORT();
116                                                  # result      $ls = Encode::encode("ISO-8859-1", $s, 1);
117        # HTML::Parser 3.25, 3.26 returns **mostly** LATIN for entities.
118        # We reduce the amount of mixed-encoding content to just a few
119        # punctuation characters when we work with Latin here.
120    
121      } elsif (oreilly_de_catalog::config::PROTECT_UTF8_FOR_HTML_PARSER() ) {
122        $ls = Encode::encode_utf8($s);
123      } elsif (oreilly_de_catalog::config::PROTECT_UTF8_WITH_AMP() ) {
124        $ls = $s;
125        $ls =~ s/([^\000-\177])/ "&#" . ord($1) . ";" /ge;
126        utf8::downgrade($ls); # didn't improve the coredumpness
127        die "ls[$ls] not 7bit clean" unless $ls =~ /^[\000-\177]*$/;
128        # warn "ls[$ls]";
129      } else {
130        $ls = $s;
131      }
132    %result = ();    %result = ();
133    $text = '';    initialize_text();
134    $open = 0;    $open = 0;
135    $div = undef;    $div = undef;
136    $p->parse($ls);    if (0) { # XXX probieren ueber probieren wg Entities und UTF-8
137    $p->eof;      # code that directly deals with $s because it doesn't want the
138  }      # conversion to $ls (latin1)
139    
140        $s =~ s/\s+/ /g; # die CR nerven noch mehr als die LF
141        my $pre_s = $s;
142        # $s =~ s/&#153;//g;
143        # $s =~ s/\302\255//g; # 2.8 pounds in
144        if ( 0 && $HTML::Parser::VERSION == 3.26 ) {
145          # Should be handled by HTML::Entities, numeric entities and
146          # HTML::Entities and -DUNICODE_ENTITIES => core dump
147    
148          my $saidinfo = 0;
149          local $| = 1;
150          while ( $s =~ s/\&\#(\d+)\;/chr($1)/e ) {
151            print "Info:" unless $saidinfo++;
152            print " &#$1;";
153          }
154          print "\n" if $saidinfo;
155    
156        }
157      }
158      if (oreilly_de_catalog::config::DISPOSE_PARSER_EACH_TIME() ) {
159        my $p = HTML::Parser->new(
160                                  api_version => 3,
161                                  start_h => [\&handle_start, "tagname, attr"],
162                                  end_h   => [\&handle_end,   "tagname"],
163                                  text_h  => [\&handle_text,  "dtext"],
164                                  marked_sections => 1,
165                                 );
166        $p->parse($ls);
167        $p->eof;
168      } else {
169        $globalp->parse($ls);
170        $globalp->eof;
171      }
172      if (0) { # XXX
173        # code that tries to postprocess the nonsense resulting from the above
174        while (my($k,$v) = each %result) {
175          next unless defined($v) && length($v);
176          next if Encode::is_utf8($v);
177          next unless $v =~ /[^\040-\177]/;
178          # Wenn UTF-8 und nicht-UTF-8 gemischt sind, sind wir erledigt
179          my $utf8v;
180          if (HTML::Entities::UNICODE_SUPPORT()) {
181            if (0) {
182              # klappt nicht == 2002-04-02
183              $utf8v = Encode::decode("ISO-8859-1",$v);
184            } elsif (1) {
185              $utf8v = $v;
186              # fuehrt zu "unexpected downgraded strings" und die haben dann
187              # noch ein UTF-8 Teile, die nicht als solche markiert sind
188            }
189          } else {
190            # Want to find out which condition we need to watch
191            if ($HTML::Parser::VERSION != 3.26) {
192              # klappt nicht == 2002-04-02
193              $utf8v = Encode::decode("ISO-8859-1",$v);
194            } else {
195              # klappt nicht == 2002-04-02
196              $utf8v = $v;
197              Encode::_utf8_on($utf8v);
198              Encode::is_utf8($utf8v, 1) or die "Not UTF8 [$utf8v]";
199            }
200          }
201          $result{$k} = $utf8v;
202        }
203      }
204      if ( oreilly_de_catalog::config::ALLOW_LATIN_INTERMEDIATE() ) {
205        while (my($k,$v) = each %result) {
206          next unless defined($v) && length($v);
207          my $utf8v = Encode::decode("ISO-8859-1",$v);
208          $result{$k} = $utf8v;
209        }
210      } elsif (oreilly_de_catalog::config::PROTECT_UTF8_FOR_HTML_PARSER()) {
211        while (my($k,$v) = each %result) {
212          next unless defined($v) && length($v);
213          my $utf8v = Encode::decode_utf8($v);
214          $result{$k} = $utf8v;
215        }
216      } elsif (oreilly_de_catalog::config::PROTECT_UTF8_WITH_AMP() ) {
217        while (my($k,$v) = each %result) {
218          next unless defined($v) && length($v);
219          utf8::upgrade($v);
220          $result{$k} = $v;
221        }
222      }
223    }
224    
225  sub split {  sub split {
226    my ($self, $doc) = @_;    my ($self, $doc) = @_;
227    my %doc = map { $_ => "" } qw(isbn author aboutauthor    my %doc = map { $_ => "" } qw(isbn author aboutauthor chapter
228                                  translator abouttranslator colophon                                  translator abouttranslator colophon
229                                  abstract title subtitle title_orig toc inx);                                  abstract title subtitle title_orig toc inx);
230    
# Line 124  sub split { Line 240  sub split {
240      my_parse($doc->{index});      my_parse($doc->{index});
241      $doc{abstract} = $result{short_desc};      $doc{abstract} = $result{short_desc};
242      $doc{isbn} = $result{isbn};      $doc{isbn} = $result{isbn};
243      $doc{author} = $result{author_names};      $doc{author} = $result{author_names} || "";
244      $doc{translator} = $result{translator_names};      $doc{translator} = $result{translator_names};
245      $doc{title} = $result{title};      $doc{title} = $result{title};
246      $doc{subtitle} = $result{subtitle};      $doc{subtitle} = $result{subtitle};
247      $doc{title_orig} = $result{title_orig};      $doc{title_orig} = $result{title_orig};
248    }    }
249      if ($doc->{chapter}) {
250        my $content = $doc->{chapter};
251        my $bs;
252        $bs++ if $content =~ s/^.*?<!--\s*sample chapter (begins (here )?)?-->//si;
253        my $es;
254        $es++ if $content =~ s/<!--\s*(End of )?sample chapter (ends here )?-->.*//si;
255        unless ($bs){
256          $content =~ s/^.*?<h1/<h1/si;
257        }
258        unless ($es){
259          $content =~ s/<HR.*//si;
260        }
261        $content =~ s/^/<div id="chapter">/;
262        $content .= "</div>\n";
263        my_parse($content);
264        $doc{chapter} = $result{chapter};
265      }
266    if ($doc->{colophon}) {    if ($doc->{colophon}) {
267      my_parse($doc->{colophon});      my_parse($doc->{colophon});
268      $doc{colophon} = $result{colophon};      my $s = $doc{colophon} = $result{colophon};
269        # use Devel::Peek;
270        # Devel::Peek::Dump($s);
271    }    }
272    if ($doc->{toc}) {    if ($doc->{toc}) {
273      my_parse($doc->{toc});      my_parse($doc->{toc});
274      my $s = $result{book_toc};      if (my $s = $result{book_toc}) {
275      $s =~ s/<BR>/ /ig;        # $s =~ s/<BR>/ /ig; # very wrong! if we have <BR> here, it was &lt;BR&gt;
276      $s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl        $s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl
277      $s =~ s/\b\d+(\.\d+)?\b//g; # 1.0 Einf\x{fc}hrung 1.1 Zugriff        $s =~ s/\b\d+(\.\d+)?\b//g; # 1.0 Einleitung 1.1 Zugriff
278      $s =~ s/\b\d+\.//g; # 7.vi Options 8.Enhanced Tags 9.nvi-New vi        $s =~ s/\b\d+\.//g; # 7.vi Options 8.Enhanced Tags 9.nvi-New vi
279      $doc{toc} = $s;        $doc{toc} = $s;
280        } else {
281          die "toc[$doc->{toc}] not parseable?";
282        }
283    }    }
284    if ($doc->{inx}) {    if ($doc->{inx}) {
285      my_parse($doc->{inx});      my_parse($doc->{inx});
286      my $s = $result{book_inx} || "";      my $s = $result{book_inx} || "";
287      $s =~ s/<BR>/ /ig;      # $s =~ s/<BR>/ /ig; # wrong!, see above
288      $s =~ s/&#(8211);/-/g;      $s =~ s/&#(8211);/-/g;
289      $s =~ s/&#(8220);/"/g;      $s =~ s/&#(8220);/"/g;
290      $s =~ s/&#(8222);/"/g;      $s =~ s/&#(8222);/"/g;
# Line 163  sub split { Line 301  sub split {
301      $doc{desc}  = $result{long_desc};      $doc{desc}  = $result{long_desc};
302    }    }
303    
304    while (my($k,$v) = each %doc) {    if (0) {
305      next unless defined($v) && length($v);      # we did really convert the stuff we just read in to UTF8
306      my $utf8v = Encode::decode("ISO-8859-1",$v);      # (although WAIT::Document::Ora::conv_getline converts to UTF8
307      $doc{$k} = $utf8v;      # itself), because my_parse did the conversion back to latin1.
308        # This nonsense must stop. All routines must get and give UTF-8.
309        # If they want to process something else internally, they must
310        # convert twice
311        while (my($k,$v) = each %doc) {
312          next unless defined($v) && length($v);
313          my $utf8v = Encode::decode("ISO-8859-1",$v);
314          $doc{$k} = $utf8v;
315        }
316    }    }
317    # warn "ALERT: No author" unless $doc{author};    # warn "ALERT: No author" unless $doc{author};
318    

Legend:
Removed from v.75  
changed lines
  Added in v.76

  ViewVC Help
Powered by ViewVC 1.1.26