1 |
#!/usr/bin/perl |
#!/usr/bin/perl |
2 |
# -*- Mode: Perl -*- |
# -*- Mode: Perl -*- |
3 |
# $Basename: HTML.pm $ |
# $Basename: HTML.pm $ |
4 |
# $Revision: 1.7 $ |
# $Revision: 1.8 $ |
5 |
# Author : Ulrich Pfeifer with Andreas König |
# Author : Ulrich Pfeifer with Andreas König |
6 |
# Created On : Sat Nov 1 1997 |
# Created On : Sat Nov 1 1997 |
7 |
# Last Modified By: Ulrich Pfeifer |
# Last Modified By: Ulrich Pfeifer |
15 |
# |
# |
16 |
|
|
17 |
package WAIT::Parse::Ora; |
package WAIT::Parse::Ora; |
18 |
|
use base qw(WAIT::Parse::Base); |
19 |
|
|
20 |
use HTML::Parser; |
use HTML::Parser; |
21 |
use Encode; |
use Encode; |
22 |
use strict; |
use strict; |
|
use vars qw(@ISA); |
|
|
@ISA = qw(WAIT::Parse::Base); |
|
23 |
|
|
24 |
|
|
25 |
=pod |
=pod |
42 |
=cut |
=cut |
43 |
|
|
44 |
# Debug switch: when set to a true value the tag handlers print a
# trace line ('>' or '<' repeated $open times, followed by the tag
# name) for every <div> they act upon.
my $debug = 0;
|
45 |
|
|
46 |
my $p = HTML::Parser->new( |
my $p = HTML::Parser->new( |
47 |
api_version => 3, |
api_version => 3, |
53 |
# File-scoped parser state shared by the handler subs and my_parse().
my %result;    # collected text, keyed by the id attribute of the enclosing div
my $text;      # character data gathered since the last closing div
my $open;      # counter: incremented per opening div, decremented per closing div
my $div;       # id attribute of the most recently opened div
57 |
|
|
58 |
# Start-tag handler.
#
# Arguments:
#   $tag  - name of the element that was opened
#   $attr - hash reference holding the element's attributes
#
# Every element other than "div" is ignored.  For a div, the value of
# its "id" attribute is remembered in $div and the counter $open is
# incremented, which switches on text collection in handle_text().
#
# NOTE(review): $div is a single scalar, so a nested div overwrites the
# id of its enclosing div and the outer id is not restored when the
# inner div closes -- confirm the input pages never nest divs.
sub handle_start {
    my ($tag, $attr) = @_;

    return unless $tag eq "div";

    $div = $attr->{id};
    $open++;
    print ">" x $open, $tag, "\n" if $debug;
    return;
}
68 |
# End-tag handler.
#
# Arguments:
#   $tag - name of the element that was closed
#
# Every element other than "div" is ignored.  For a closing div the
# text collected so far is whitespace-normalised and appended, followed
# by a single blank, to $result{$div}; the text buffer is then emptied
# and the counter $open is decremented.
sub handle_end {
    my ($tag) = @_;

    return unless $tag eq "div";

    print "<" x $open, $tag, "\n" if $debug;
    $open--;

    # Strip leading and trailing whitespace, then collapse every inner
    # whitespace run into one blank.
    $text =~ s/^\s+//;
    $text =~ s/\s+$//;
    $text =~ s/\s+/ /g;

    $result{$div} .= $text . ' ';
    $text = '';
    return;
}
80 |
|
|
81 |
|
|
82 |
# Text handler.
#
# Arguments:
#   $c - a chunk of character data
#
# The chunk is appended to the text buffer only while $open is true,
# i.e. while at least one div is currently counted as open.
sub handle_text {
    my ($c) = @_;

    $text .= $c if $open;
    return;
}
86 |
|
|
87 |
|
|
88 |
# Parse one HTML string and leave the collected text in %result.
#
# Arguments:
#   $s - the HTML to parse
#
# All parser state (%result, $text, $open, $div) is reset before the
# string is fed to the file-level parser object $p, so every call
# starts from a clean slate.
#
# The input is encoded to ISO-8859-1 first.  The third argument (1) is
# Encode's CHECK value; as far as the Encode documentation goes that
# makes encode() die on characters it cannot map -- TODO confirm this
# is the intended failure mode.
# NOTE(review): the original explanation is only partly visible here;
# it states that without this step "we would get mixed content in
# result".
sub my_parse ($) {
    my ($s) = @_;

    my $ls = Encode::encode("ISO-8859-1", $s, 1);

    %result = ();
    $text   = '';
    $open   = 0;
    $div    = undef;

    $p->parse($ls);
    return $p->eof;
}
102 |
|
|
103 |
|
|
104 |
# Split a document into the fields that are to be indexed.
#
# Arguments:
#   $self - the invocant (not used inside this method)
#   $doc  - hash reference; the entries author, translator, index,
#           colophon, toc, inx and desc are each run through my_parse()
#           when they hold a true value
#
# Returns a reference to a hash of extracted fields.  The keys isbn,
# author, aboutauthor, translator, abouttranslator, colophon, abstract,
# title, subtitle, title_orig, toc and inx start out as empty strings;
# desc is only present when $doc->{desc} was given.  A field becomes
# undef when its section was parsed but the expected div id did not
# occur.  Every defined, non-empty value is passed through
# Encode::decode("ISO-8859-1", ...) before the hash is returned.
sub split {
    my ($self, $doc) = @_;

    my %doc = map { $_ => "" } qw(isbn author aboutauthor
                                  translator abouttranslator colophon
                                  abstract title subtitle title_orig toc inx);

    if ($doc->{author}) {
        my_parse($doc->{author});
        $doc{aboutauthor} = $result{author_bio};
    }

    if ($doc->{translator}) {
        my_parse($doc->{translator});
        $doc{abouttranslator} = $result{translator_bio};
    }

    if ($doc->{index}) {
        my_parse($doc->{index});
        $doc{abstract}   = $result{short_desc};
        $doc{isbn}       = $result{isbn};
        $doc{author}     = $result{author_names};
        $doc{translator} = $result{translator_names};
        $doc{title}      = $result{title};
        $doc{subtitle}   = $result{subtitle};
        $doc{title_orig} = $result{title_orig};
    }

    if ($doc->{colophon}) {
        my_parse($doc->{colophon});
        $doc{colophon} = $result{colophon};
    }

    if ($doc->{toc}) {
        my_parse($doc->{toc});
        my $s = $result{book_toc};
        $s =~ s/<BR>/ /ig;
        $s =~ s/[\xa0]/ /g;          # nbsp; need [] because of a bug in this perl
        $s =~ s/\b\d+(\.\d+)?\b//g;  # section numbers, e.g. 1.0 Einf\x{fc}hrung 1.1 Zugriff
        $s =~ s/\b\d+\.//g;          # e.g. 7.vi Options 8.Enhanced Tags 9.nvi-New vi
        $doc{toc} = $s;
    }

    if ($doc->{inx}) {
        my_parse($doc->{inx});
        my $s = $result{book_inx};
        $s =~ s/<BR>/ /ig;
        # Replace a few numeric character references with plain
        # ASCII punctuation.
        $s =~ s/&#(8211);/-/g;
        $s =~ s/&#(8220);/"/g;
        $s =~ s/&#(8222);/"/g;
        $s =~ s/&#(8217);/'/g;
        $s =~ s/[\xa0]/ /g;          # nbsp; need [] because of a bug in this perl
        $s =~ s/\s*,\s+/ /g;         # commas
        # Page references.  Both patterns require surrounding
        # whitespace; per the original comment this is meant to spare
        # digits such as the 234 inside a character reference.
        1 while $s =~ s/\s\d+-\d+\s/ /g;   # page ranges
        1 while $s =~ s/\s\d+\s/ /g;       # single page numbers
        $s =~ s/(\w+)\( \)/$1()/g;   # functions in the index
        $doc{inx} = $s;
    }

    if ($doc->{desc}) {
        my_parse($doc->{desc});
        $doc{desc} = $result{long_desc};
    }

    # Only existing keys are assigned to, so iterating over a key list
    # taken up front is safe.
    for my $k (keys %doc) {
        my $v = $doc{$k};
        next unless defined($v) && length($v);
        $doc{$k} = Encode::decode("ISO-8859-1", $v);
    }

    # warn "ALERT: No author" unless $doc{author};

    return \%doc;
}