/[wait]/cvs-head/lib/WAIT/Parse/Ora.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /cvs-head/lib/WAIT/Parse/Ora.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 74 - (show annotations)
Fri Mar 8 21:18:51 2002 UTC (22 years, 1 month ago) by laperla
File size: 4342 byte(s)
- much better markup in the docs makes parsing so much easier and more
  reliable.

- New documents added: inx and toc.

- Output of index_ora more helpful and additional option of setting
  $traceALL that allows us to debug what the parser passes on to WAIT.

1 #!/usr/bin/perl
2 # -*- Mode: Perl -*-
3 # $Basename: HTML.pm $
4 # $Revision: 1.8 $
5 # Author : Ulrich Pfeifer with Andreas König
6 # Created On : Sat Nov 1 1997
7 # Last Modified By: Ulrich Pfeifer
8 # Last Modified On: Fri Jan 4 16:06:14 2002
9 # Language : CPerl
10 # Update Count : 14
11 # Status : Unknown, Use with caution!
12 #
13 # (C) Copyright 1997, Ulrich Pfeifer, all rights reserved.
14 #
15 #
16
17 package WAIT::Parse::Ora;
18 use base qw(WAIT::Parse::Base);
19
20 use HTML::Parser;
21 use Encode;
22 use strict;
23
24
25 =pod
26
27 Text from 2002-03-05 is structured with <div> tags as follows:
28
29 index.html:
30 <div id="biblio"> BIBLIOGRAPHISCHE ANGABEN
31 <div id="short_desc"> KURZE BESCHREIBUNG
32
33 desc.html:
34 <div id="long_desc"> AUSFUEHRLICHE BESCHREIBUNG
35
36 author.html:
37 <div id="author_bio"> BIOGRAPHIE DES AUTOREN
38
39 translator.html:
40 <div id="translator_bio"> BIOGRAPHIE DES UEBERSETZERS
41
42 =cut
43
44 my $debug = 0;
45
46 my $p = HTML::Parser->new(
47 api_version => 3,
48 start_h => [\&handle_start, "tagname, attr"],
49 end_h => [\&handle_end, "tagname"],
50 text_h => [\&handle_text, "dtext"],
51 marked_sections => 1,
52 );
53 my %result;
54 my $text;
55 my $open;
56 my $div;
57
58 sub handle_start {
59 my $tag = shift;
60 my $attr = shift;
61
62 return unless $tag eq "div";
63 $div = $attr->{id};
64 $open++;
65 print ">" x $open, $tag, "\n" if $debug;
66 }
67
68 sub handle_end {
69 my $tag = shift;
70
71 return unless $tag eq "div";
72 print "<" x $open, $tag, "\n" if $debug;
73 $open--;
74 $text =~ s/^\s+//;
75 $text =~ s/\s+$//;
76 $text =~ s/\s+/ /g;
77 $result{$div} .= $text . ' ';
78 $text = '';
79 }
80
81
82 sub handle_text {
83 my $c = shift;
84 $text .= $c if $open;
85 }
86
87
88 sub my_parse ($) {
89 my($s) = @_;
90 my $ls = Encode::encode("ISO-8859-1", $s, 1); # HTML::Parser returns
91 # LATIN for entities
92 # and we would get
93 # mixed content in
94 # result
95 %result = ();
96 $text = '';
97 $open = 0;
98 $div = undef;
99 $p->parse($ls);
100 $p->eof;
101 }
102
103
104 sub split {
105 my ($self, $doc) = @_;
106 my %doc = map { $_ => "" } qw(isbn author aboutauthor
107 translator abouttranslator colophon
108 abstract title subtitle title_orig toc inx);
109
110 if ($doc->{author}) {
111 my_parse($doc->{author});
112 $doc{aboutauthor} = $result{author_bio};
113 }
114 if ($doc->{translator}) {
115 my_parse($doc->{translator});
116 $doc{abouttranslator} = $result{translator_bio};
117 }
118 if ($doc->{index}) {
119 my_parse($doc->{index});
120 $doc{abstract} = $result{short_desc};
121 $doc{isbn} = $result{isbn};
122 $doc{author} = $result{author_names};
123 $doc{translator} = $result{translator_names};
124 $doc{title} = $result{title};
125 $doc{subtitle} = $result{subtitle};
126 $doc{title_orig} = $result{title_orig};
127 }
128 if ($doc->{colophon}) {
129 my_parse($doc->{colophon});
130 $doc{colophon} = $result{colophon};
131 }
132 if ($doc->{toc}) {
133 my_parse($doc->{toc});
134 my $s = $result{book_toc};
135 $s =~ s/<BR>/ /ig;
136 $s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl
137 $s =~ s/\b\d+(\.\d+)?\b//g; # 1.0 Einf\x{fc}hrung 1.1 Zugriff
138 $s =~ s/\b\d+\.//g; # 7.vi Options 8.Enhanced Tags 9.nvi-New vi
139 $doc{toc} = $s;
140 }
141 if ($doc->{inx}) {
142 my_parse($doc->{inx});
143 my $s = $result{book_inx};
144 $s =~ s/<BR>/ /ig;
145 $s =~ s/&#(8211);/-/g;
146 $s =~ s/&#(8220);/"/g;
147 $s =~ s/&#(8222);/"/g;
148 $s =~ s/&#(8217);/'/g;
149 $s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl
150 $s =~ s/\s*,\s+/ /g; # Komma
151 1 while $s =~ s/\s\d+-\d+\s/ /g; # Seitenangaben (nicht aber das 234 aus &#234;)
152 1 while $s =~ s/\s\d+\s/ /g; # Seitenangaben
153 $s =~ s/(\w+)\( \)/$1()/g; # functions in the index
154 $doc{inx} = $s;
155 }
156 if ($doc->{desc}) {
157 my_parse($doc->{desc});
158 $doc{desc} = $result{long_desc};
159 }
160
161 while (my($k,$v) = each %doc) {
162 next unless defined($v) && length($v);
163 my $utf8v = Encode::decode("ISO-8859-1",$v);
164 $doc{$k} = $utf8v;
165 }
166 # warn "ALERT: No author" unless $doc{author};
167
168 return \%doc;
169 }
170
171 1;

Properties

Name Value
cvs2svn:cvs-rev 1.8

  ViewVC Help
Powered by ViewVC 1.1.26