/[wait]/cvs-head/lib/WAIT/Parse/Ora.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /cvs-head/lib/WAIT/Parse/Ora.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 74 - (hide annotations)
Fri Mar 8 21:18:51 2002 UTC (22 years, 3 months ago) by laperla
File size: 4342 byte(s)
- much better markup in the docs makes parsing so much easier and more
  reliable.

- New documents added: inx and toc.

- Output of index_ora more helpful and additional option of setting
  $traceALL that allows us to debug what the parser passes on to WAIT.

1 ulpfr 54 #!/usr/bin/perl
2     # -*- Mode: Perl -*-
3     # $Basename: HTML.pm $
4 laperla 74 # $Revision: 1.8 $
5 ulpfr 54 # Author : Ulrich Pfeifer with Andreas König
6     # Created On : Sat Nov 1 1997
7     # Last Modified By: Ulrich Pfeifer
8 ulpfr 61 # Last Modified On: Fri Jan 4 16:06:14 2002
9 ulpfr 54 # Language : CPerl
10 ulpfr 61 # Update Count : 14
11 ulpfr 54 # Status : Unknown, Use with caution!
12     #
13     # (C) Copyright 1997, Ulrich Pfeifer, all rights reserved.
14     #
15     #
16    
17     package WAIT::Parse::Ora;
18 laperla 74 use base qw(WAIT::Parse::Base);
19    
20 ulpfr 58 use HTML::Parser;
21 laperla 69 use Encode;
22 ulpfr 58 use strict;
23 ulpfr 54
24 laperla 73
25     =pod
26    
27     Text from 2002-03-05 is structured with <div> tags as follows:
28    
29     index.html:
30     <div id="biblio"> BIBLIOGRAPHISCHE ANGABEN
31     <div id="short_desc"> KURZE BESCHREIBUNG
32    
33     desc.html:
34     <div id="long_desc"> AUSFUEHRLICHE BESCHREIBUNG
35    
36     author.html:
37     <div id="author_bio"> BIOGRAPHIE DES AUTOREN
38    
39     translator.html:
40     <div id="translator_bio"> BIOGRAPHIE DES UEBERSETZERS
41    
42     =cut
43    
44 ulpfr 58 my $debug = 0;
45 ulpfr 54
46 ulpfr 58 my $p = HTML::Parser->new(
47     api_version => 3,
48     start_h => [\&handle_start, "tagname, attr"],
49     end_h => [\&handle_end, "tagname"],
50     text_h => [\&handle_text, "dtext"],
51     marked_sections => 1,
52     );
53     my %result;
54     my $text;
55     my $open;
56 laperla 74 my $div;
57 ulpfr 58
58     sub handle_start {
59     my $tag = shift;
60 laperla 72 my $attr = shift;
61 ulpfr 58
62 laperla 74 return unless $tag eq "div";
63     $div = $attr->{id};
64 ulpfr 58 $open++;
65     print ">" x $open, $tag, "\n" if $debug;
66     }
67    
68     sub handle_end {
69     my $tag = shift;
70    
71 laperla 74 return unless $tag eq "div";
72 ulpfr 58 print "<" x $open, $tag, "\n" if $debug;
73     $open--;
74     $text =~ s/^\s+//;
75     $text =~ s/\s+$//;
76     $text =~ s/\s+/ /g;
77 laperla 74 $result{$div} .= $text . ' ';
78 ulpfr 58 $text = '';
79     }
80    
81    
82     sub handle_text {
83 laperla 72 my $c = shift;
84     $text .= $c if $open;
85 ulpfr 58 }
86    
87 laperla 74
88 laperla 69 sub my_parse ($) {
89     my($s) = @_;
90     my $ls = Encode::encode("ISO-8859-1", $s, 1); # HTML::Parser returns
91     # LATIN for entities
92     # and we would get
93     # mixed content in
94     # result
95 laperla 74 %result = ();
96     $text = '';
97     $open = 0;
98     $div = undef;
99 laperla 69 $p->parse($ls);
100     $p->eof;
101     }
102    
103 laperla 74
104 ulpfr 54 sub split {
105     my ($self, $doc) = @_;
106 laperla 74 my %doc = map { $_ => "" } qw(isbn author aboutauthor
107     translator abouttranslator colophon
108     abstract title subtitle title_orig toc inx);
109 ulpfr 54
110 ulpfr 61 if ($doc->{author}) {
111 laperla 69 my_parse($doc->{author});
112 laperla 74 $doc{aboutauthor} = $result{author_bio};
113 ulpfr 61 }
114 laperla 74 if ($doc->{translator}) {
115     my_parse($doc->{translator});
116     $doc{abouttranslator} = $result{translator_bio};
117     }
118 ulpfr 61 if ($doc->{index}) {
119 laperla 72 my_parse($doc->{index});
120 laperla 74 $doc{abstract} = $result{short_desc};
121     $doc{isbn} = $result{isbn};
122     $doc{author} = $result{author_names};
123     $doc{translator} = $result{translator_names};
124     $doc{title} = $result{title};
125     $doc{subtitle} = $result{subtitle};
126     $doc{title_orig} = $result{title_orig};
127 ulpfr 61 }
128 laperla 65 if ($doc->{colophon}) {
129 laperla 69 my_parse($doc->{colophon});
130 laperla 74 $doc{colophon} = $result{colophon};
131 laperla 65 }
132 laperla 74 if ($doc->{toc}) {
133     my_parse($doc->{toc});
134     my $s = $result{book_toc};
135     $s =~ s/<BR>/ /ig;
136     $s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl
137     $s =~ s/\b\d+(\.\d+)?\b//g; # 1.0 Einf\x{fc}hrung 1.1 Zugriff
138     $s =~ s/\b\d+\.//g; # 7.vi Options 8.Enhanced Tags 9.nvi-New vi
139     $doc{toc} = $s;
140     }
141     if ($doc->{inx}) {
142     my_parse($doc->{inx});
143     my $s = $result{book_inx};
144     $s =~ s/<BR>/ /ig;
145     $s =~ s/&#(8211);/-/g;
146     $s =~ s/&#(8220);/"/g;
147     $s =~ s/&#(8222);/"/g;
148     $s =~ s/&#(8217);/'/g;
149     $s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl
150     $s =~ s/\s*,\s+/ /g; # Komma
151     1 while $s =~ s/\s\d+-\d+\s/ /g; # Seitenangaben (nicht aber das 234 aus &#234;)
152     1 while $s =~ s/\s\d+\s/ /g; # Seitenangaben
153     $s =~ s/(\w+)\( \)/$1()/g; # functions in the index
154     $doc{inx} = $s;
155     }
156     if ($doc->{desc}) {
157     my_parse($doc->{desc});
158     $doc{desc} = $result{long_desc};
159     }
160 ulpfr 54
161 laperla 69 while (my($k,$v) = each %doc) {
162 laperla 74 next unless defined($v) && length($v);
163 laperla 69 my $utf8v = Encode::decode("ISO-8859-1",$v);
164     $doc{$k} = $utf8v;
165     }
166 laperla 74 # warn "ALERT: No author" unless $doc{author};
167 laperla 69
168 ulpfr 58 return \%doc;
169 ulpfr 54 }
170 ulpfr 58
171     1;

Properties

Name Value
cvs2svn:cvs-rev 1.8

  ViewVC Help
Powered by ViewVC 1.1.26